Integrate watchdog into render worker (#88)

* Add a watchdog to base_worker

* Logging cleanup

* Prevent multiple watchdogs from running if render process restarts

* Add process timeout parameter to Config

* Refactor

* Add error handling to process output parsing

* Fix issue where start_time was not getting set consistently
This commit is contained in:
2024-08-06 10:48:24 -05:00
committed by GitHub
parent 90d5e9b7af
commit 6afb6e65a6
5 changed files with 99 additions and 41 deletions

View File

@@ -5,6 +5,7 @@ import logging
import os
import subprocess
import threading
import time
from datetime import datetime
import psutil
@@ -94,10 +95,12 @@ class BaseRenderWorker(Base):
self.errors = []
# Threads and processes
self.__thread = threading.Thread(target=self.run, args=())
self.__thread = threading.Thread(target=self.__run, args=())
self.__thread.daemon = True
self.__process = None
self.last_output = None
self.__last_output_time = None
self.watchdog_timeout = 120
def __repr__(self):
    """Return a compact debug representation of this worker.

    The class name, id, name, status and input path are joined with ``|``
    and wrapped in angle brackets.
    """
    parts = (
        self.__class__.__name__,
        self.id,
        self.name,
        self.status,
        self.input_path,
    )
    # format(x) is what an f-string placeholder without a format spec applies
    return "<" + "|".join(format(part) for part in parts) + ">"
@@ -175,11 +178,12 @@ class BaseRenderWorker(Base):
self.status = RenderStatus.RUNNING
self.start_time = datetime.now()
logger.info(f'Starting {self.engine.name()} {self.renderer_version} Render for {self.input_path} | '
f'Frame Count: {self.total_frames}')
self.__thread.start()
def run(self):
def __run(self):
logger.info(f'Starting {self.engine.name()} {self.renderer_version} Render for {self.input_path} | '
f'Frame Count: {self.total_frames}')
# Setup logging
log_dir = os.path.dirname(self.log_path())
os.makedirs(log_dir, exist_ok=True)
@@ -209,49 +213,43 @@ class BaseRenderWorker(Base):
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
self.status = RenderStatus.RUNNING
# Start process and get updates
self.__process = subprocess.Popen(subprocess_cmds, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
universal_newlines=False)
for c in io.TextIOWrapper(self.__process.stdout, encoding="utf-8"): # or another encoding
f.write(c)
f.flush()
os.fsync(f.fileno())
self.last_output = c.strip()
self._parse_stdout(c.strip())
f.write('\n')
# Check return codes and process
return_code = self.__process.wait()
return_code = self.__setup_and_run_process(f, subprocess_cmds)
self.end_time = datetime.now()
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]: # user cancelled
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
f"after {self.time_elapsed()}\n\n"
f.write(message)
# Teardown
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
message = f"{self.engine.name()} render ended with status '{self.status}' " \
f"after {self.time_elapsed()}"
f.write(message)
return
# if file output hasn't increased, return as error, otherwise restart process.
if len(self.file_list()) <= initial_file_count:
err_msg = f"File count has not increased. Count is still {len(self.file_list())}"
f.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
self.status = RenderStatus.ERROR
# Handle completed - All else counts as failed attempt
if (self.status == RenderStatus.RUNNING) and not return_code:
file_count_has_increased = len(self.file_list()) > initial_file_count
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
message = (f"{'=' * 50}\n\n{self.engine.name()} render completed successfully in "
f"{self.time_elapsed()}\n")
f.write(message)
break
# Handle non-zero return codes
message = f"{'=' * 50}\n\n{self.engine.name()} render failed with code {return_code} " \
f"after {self.time_elapsed()}\n\n"
f.write(message)
self.errors.append(message)
failed_attempts += 1
if return_code:
err_msg = f"{self.engine.name()} render failed with code {return_code}"
logger.error(err_msg)
self.errors.append(err_msg)
# handle instances where renderer exits ok but doesnt generate files
if not return_code and not file_count_has_increased:
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
f"Count is still {len(self.file_list())}")
f.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
# only count the attempt as failed if renderer creates no output - ignore error codes for now
if not file_count_has_increased:
failed_attempts += 1
if self.children:
from src.distributed_job_manager import DistributedJobManager
@@ -263,6 +261,65 @@ class BaseRenderWorker(Base):
self.status = RenderStatus.COMPLETED
logger.info(f"Render {self.id}-{self.name} completed successfully after {self.time_elapsed()}")
def __setup_and_run_process(self, f, subprocess_cmds):
    """Run the render subprocess once, guarded by a no-output watchdog.

    Launches ``subprocess_cmds``, forwards every line of its combined
    stdout/stderr to ``f`` and to ``self._parse_stdout``, and kills the
    process when it has produced no output for more than
    ``self.watchdog_timeout`` seconds.

    Args:
        f: Open text-mode file object that receives the render log.
        subprocess_cmds: Command passed as-is to ``subprocess.Popen``.

    Returns:
        The return code of the process, or -1 when the process could not be
        started or an unexpected error interrupted the run.
    """
    # Local handle to the process started by *this* call. It stays None when
    # Popen fails, so the error path never touches a missing or stale process.
    process = None
    return_code = -1

    def watchdog():
        # Poll roughly once per second until the process exits or goes silent for too long.
        logger.debug(f'Starting process watchdog for {self} with {self.watchdog_timeout}s timeout')
        while process.poll() is None:
            time_since_last_update = time.time() - self.__last_output_time
            if time_since_last_update > self.watchdog_timeout:
                logger.error(f"Process for {self} terminated due to exceeding timeout ({self.watchdog_timeout}s)")
                process.kill()
                break
            time.sleep(1)
        logger.debug(f'Stopping process watchdog for {self}')

    watchdog_thread = threading.Thread(target=watchdog)
    watchdog_thread.daemon = True

    try:
        # Start process and get updates
        process = subprocess.Popen(subprocess_cmds, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                   universal_newlines=False)
        self.__process = process

        # Start watchdog - the timestamp must be set first because the watchdog reads it immediately
        self.__last_output_time = time.time()
        watchdog_thread.start()

        # errors="replace": a stray non-UTF-8 byte in the renderer output must not abort the render
        for c in io.TextIOWrapper(process.stdout, encoding="utf-8", errors="replace"):
            self.last_output = c.strip()
            self.__last_output_time = time.time()
            try:
                f.write(c)
                f.flush()
                os.fsync(f.fileno())
            except Exception as e:
                logger.error(f"Error saving log to disk: {e}")

            try:
                self._parse_stdout(c.strip())
            except Exception as e:
                logger.error(f'Error parsing stdout: {e}')
        f.write('\n')

        # Check return codes and process
        return_code = process.wait()
    except Exception as e:
        message = f'Uncaught error running render process: {e}'
        logger.exception(message)
        try:
            f.write(message)
        except Exception as write_error:
            # best effort - a broken log file must not mask the original failure
            logger.error(f"Error saving log to disk: {write_error}")
        # Popen itself may have failed, in which case there is no process to kill
        if process is not None and process.poll() is None:
            process.kill()

    # let watchdog end before continuing - prevents multiple watchdogs running when process restarts
    if watchdog_thread.is_alive():
        watchdog_thread.join()
    return return_code
def post_processing(self):
pass