Parent creates local subjobs instead of truncating original (#95)

* Parent worker now creates subjob on local host and waits for it

* Improve wait_for_subjobs logic

* Fix setting end_time for base_worker

* API cleanup

* Code refactoring

* Cleanup
This commit is contained in:
2024-08-10 21:19:01 -05:00
committed by GitHub
parent f9b51886ab
commit e757506787
3 changed files with 168 additions and 138 deletions

View File

@@ -90,7 +90,7 @@ class BaseRenderWorker(Base):
self.end_time = None
# History
self.status = RenderStatus.NOT_STARTED
self.status = RenderStatus.CONFIGURING
self.warnings = []
self.errors = []
@@ -158,7 +158,7 @@ class BaseRenderWorker(Base):
def start(self):
if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED]:
if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED, RenderStatus.CONFIGURING]:
logger.error(f"Trying to start job with status: {self.status}")
return
@@ -176,90 +176,114 @@ class BaseRenderWorker(Base):
self.errors.append(msg)
return
self.status = RenderStatus.RUNNING
self.status = RenderStatus.RUNNING if not self.children else RenderStatus.WAITING_FOR_SUBJOBS
self.start_time = datetime.now()
self.__thread.start()
# handle multiple attempts at running subprocess
def __run__subprocess_cycle(self, log_file):
subprocess_cmds = self.generate_subprocess()
initial_file_count = len(self.file_list())
failed_attempts = 0
log_file.write(f"Running command: {subprocess_cmds}\n")
log_file.write('=' * 80 + '\n\n')
while True:
# Log attempt #
if failed_attempts:
if failed_attempts >= self.maximum_attempts:
err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
logger.error(err_msg)
self.status = RenderStatus.ERROR
self.errors.append(err_msg)
return
else:
log_file.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
self.status = RenderStatus.RUNNING
return_code = self.__setup_and_run_process(log_file, subprocess_cmds)
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
f"after {self.time_elapsed()}\n\n"
log_file.write(message)
# don't try again if we've been cancelled
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
return
# if file output hasn't increased, return as error, otherwise restart process.
file_count_has_increased = len(self.file_list()) > initial_file_count
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
break
if return_code:
err_msg = f"{self.engine.name()} render failed with code {return_code}"
logger.error(err_msg)
self.errors.append(err_msg)
# handle instances where renderer exits ok but doesnt generate files
if not return_code and not file_count_has_increased:
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
f"Count is still {len(self.file_list())}")
log_file.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
# only count the attempt as failed if renderer creates no output - reset counter on successful output
failed_attempts = 0 if file_count_has_increased else failed_attempts + 1
def __run__wait_for_subjobs(self, logfile):
from src.distributed_job_manager import DistributedJobManager
DistributedJobManager.wait_for_subjobs(parent_job=self)
@staticmethod
def log_and_print(message, log_file):
logger.info(message)
log_file.write(f"{message}\n")
def __run(self):
logger.info(f'Starting {self.engine.name()} {self.renderer_version} Render for {self.input_path} | '
f'Frame Count: {self.total_frames}')
# Setup logging
log_dir = os.path.dirname(self.log_path())
os.makedirs(log_dir, exist_ok=True)
subprocess_cmds = self.generate_subprocess()
initial_file_count = len(self.file_list())
failed_attempts = 0
with open(self.log_path(), "a") as log_file:
with open(self.log_path(), "a") as f:
self.log_and_print(f"{self.start_time.isoformat()} - Starting "
f"{self.engine.name()} {self.renderer_version} render job for {self.name} "
f"({self.input_path})", log_file)
log_file.write(f"\n")
if not self.children:
self.__run__subprocess_cycle(log_file)
else:
self.__run__wait_for_subjobs(log_file)
f.write(f"{self.start_time.isoformat()} - Starting {self.engine.name()} {self.renderer_version} "
f"render for {self.input_path}\n\n")
f.write(f"Running command: {subprocess_cmds}\n")
f.write('=' * 80 + '\n\n')
while True:
# Log attempt #
if failed_attempts:
if failed_attempts >= self.maximum_attempts:
err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
logger.error(err_msg)
self.status = RenderStatus.ERROR
self.errors.append(err_msg)
return
else:
f.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
self.status = RenderStatus.RUNNING
return_code = self.__setup_and_run_process(f, subprocess_cmds)
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
self.end_time = datetime.now()
message = f"{self.engine.name()} render ended with status '{self.status}' " \
f"after {self.time_elapsed()}"
self.log_and_print(message, log_file)
return
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
f"after {self.time_elapsed()}\n\n"
f.write(message)
# Validate Output
file_list_length = len(self.file_list())
expected_list_length = (self.end_frame - self.start_frame + 1) if self.end_frame else 1
# Teardown
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
message = f"{self.engine.name()} render ended with status '{self.status}' " \
f"after {self.time_elapsed()}"
f.write(message)
return
if file_list_length not in (expected_list_length, 1):
logger.error(f"Expected length: {expected_list_length} | actual length: {len(self.file_list())}")
# todo: create new subjob
# if file output hasn't increased, return as error, otherwise restart process.
file_count_has_increased = len(self.file_list()) > initial_file_count
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
message = (f"{'=' * 50}\n\n{self.engine.name()} render completed successfully in "
f"{self.time_elapsed()}\n")
f.write(message)
break
# Post Render Work
if not self.parent:
logger.debug(f"Starting post-processing work for {self}")
self.post_processing()
logger.debug(f"Completed post-processing work for {self}")
if return_code:
err_msg = f"{self.engine.name()} render failed with code {return_code}"
logger.error(err_msg)
self.errors.append(err_msg)
# handle instances where renderer exits ok but doesnt generate files
if not return_code and not file_count_has_increased:
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
f"Count is still {len(self.file_list())}")
f.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
# only count the attempt as failed if renderer creates no output - ignore error codes for now
if not file_count_has_increased:
failed_attempts += 1
if self.children:
from src.distributed_job_manager import DistributedJobManager
DistributedJobManager.wait_for_subjobs(local_job=self)
# Post Render Work
logger.debug("Starting post-processing work")
self.post_processing()
self.status = RenderStatus.COMPLETED
logger.info(f"Render {self.id}-{self.name} completed successfully after {self.time_elapsed()}")
self.status = RenderStatus.COMPLETED
self.end_time = datetime.now()
message = f"Render {self.name} completed successfully after {self.time_elapsed()}"
self.log_and_print(message, log_file)
def __setup_and_run_process(self, f, subprocess_cmds):