mirror of
https://github.com/blw1138/Zordon.git
synced 2026-02-05 21:56:10 +00:00
Parent creates local subjobs instead of truncating original (#95)
* Parent worker now creates subjob on local host and waits for it
* Improve wait_for_subjobs logic
* Fix setting end_time for base_worker
* API cleanup
* Code refactoring
* Cleanup
This commit is contained in:
@@ -90,7 +90,7 @@ class BaseRenderWorker(Base):
|
||||
self.end_time = None
|
||||
|
||||
# History
|
||||
self.status = RenderStatus.NOT_STARTED
|
||||
self.status = RenderStatus.CONFIGURING
|
||||
self.warnings = []
|
||||
self.errors = []
|
||||
|
||||
@@ -158,7 +158,7 @@ class BaseRenderWorker(Base):
|
||||
|
||||
def start(self):
|
||||
|
||||
if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED]:
|
||||
if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED, RenderStatus.CONFIGURING]:
|
||||
logger.error(f"Trying to start job with status: {self.status}")
|
||||
return
|
||||
|
||||
@@ -176,90 +176,114 @@ class BaseRenderWorker(Base):
|
||||
self.errors.append(msg)
|
||||
return
|
||||
|
||||
self.status = RenderStatus.RUNNING
|
||||
self.status = RenderStatus.RUNNING if not self.children else RenderStatus.WAITING_FOR_SUBJOBS
|
||||
self.start_time = datetime.now()
|
||||
self.__thread.start()
|
||||
|
||||
# handle multiple attempts at running subprocess
|
||||
def __run__subprocess_cycle(self, log_file):
    """Run the render subprocess, restarting it until it succeeds or gives up.

    Each pass runs the command from ``generate_subprocess()`` through
    ``__setup_and_run_process`` and then inspects the exit code and the
    number of entries returned by ``file_list()``:

    * exit code 0, status still RUNNING and more files than at the start:
      success, the method returns;
    * status CANCELLED or ERROR once the process has ended: return without
      retrying;
    * anything else: the problem is appended to ``self.errors`` and the
      process is started again.

    An attempt counts as failed only when it added no new output files
    compared with the previous attempt. Once ``self.maximum_attempts``
    consecutive attempts have failed, ``self.status`` is set to
    ``RenderStatus.ERROR`` and the method returns.

    Args:
        log_file: Open, writable text file that receives the command
            banner, the per-attempt separators and the result messages.
    """
    subprocess_cmds = self.generate_subprocess()
    initial_file_count = len(self.file_list())
    # File count left behind by the previous attempt. Progress is measured
    # against this value rather than the initial count; otherwise a single
    # early output file would reset the failure counter on every later pass
    # and a renderer that keeps crashing would be retried forever.
    previous_file_count = initial_file_count
    failed_attempts = 0

    log_file.write(f"Running command: {subprocess_cmds}\n")
    log_file.write('=' * 80 + '\n\n')

    while True:
        # Log attempt #
        if failed_attempts:
            if failed_attempts >= self.maximum_attempts:
                err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
                logger.error(err_msg)
                self.status = RenderStatus.ERROR
                self.errors.append(err_msg)
                return

            log_file.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
            logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
            self.status = RenderStatus.RUNNING

        return_code = self.__setup_and_run_process(log_file, subprocess_cmds)

        message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
                  f"after {self.time_elapsed()}\n\n"
        log_file.write(message)

        # don't try again if we've been cancelled
        if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
            return

        current_file_count = len(self.file_list())
        # Overall output since the cycle began decides success ...
        file_count_has_increased = current_file_count > initial_file_count
        # ... while output added by this attempt alone decides whether it failed.
        attempt_produced_output = current_file_count > previous_file_count

        if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
            break

        if return_code:
            err_msg = f"{self.engine.name()} render failed with code {return_code}"
            logger.error(err_msg)
            self.errors.append(err_msg)

        # handle instances where renderer exits ok but doesn't generate files
        if not return_code and not file_count_has_increased:
            err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
                       f"Count is still {current_file_count}")
            log_file.write(f'Error: {err_msg}\n\n')
            self.errors.append(err_msg)

        # only count the attempt as failed if renderer creates no output - reset counter on successful output
        failed_attempts = 0 if attempt_produced_output else failed_attempts + 1
        previous_file_count = current_file_count
|
||||
|
||||
def __run__wait_for_subjobs(self, logfile):
    """Hand this job to ``DistributedJobManager.wait_for_subjobs`` as the parent job.

    Args:
        logfile: Accepted to mirror the subprocess code path; this method
            does not read or write it.
    """
    # Imported at call time instead of at module top.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from src.distributed_job_manager import DistributedJobManager

    wait_for_subjobs = DistributedJobManager.wait_for_subjobs
    wait_for_subjobs(parent_job=self)
|
||||
|
||||
@staticmethod
def log_and_print(message, log_file):
    """Report ``message`` through ``logger.info`` and append it to ``log_file``.

    Args:
        message: Text to record; it is written to the file followed by a
            single newline.
        log_file: Writable text file object that receives the line.
    """
    logger.info(message)
    line = "{}\n".format(message)
    log_file.write(line)
|
||||
|
||||
def __run(self):
|
||||
logger.info(f'Starting {self.engine.name()} {self.renderer_version} Render for {self.input_path} | '
|
||||
f'Frame Count: {self.total_frames}')
|
||||
|
||||
# Setup logging
|
||||
log_dir = os.path.dirname(self.log_path())
|
||||
os.makedirs(log_dir, exist_ok=True)
|
||||
|
||||
subprocess_cmds = self.generate_subprocess()
|
||||
initial_file_count = len(self.file_list())
|
||||
failed_attempts = 0
|
||||
with open(self.log_path(), "a") as log_file:
|
||||
|
||||
with open(self.log_path(), "a") as f:
|
||||
self.log_and_print(f"{self.start_time.isoformat()} - Starting "
|
||||
f"{self.engine.name()} {self.renderer_version} render job for {self.name} "
|
||||
f"({self.input_path})", log_file)
|
||||
log_file.write(f"\n")
|
||||
if not self.children:
|
||||
self.__run__subprocess_cycle(log_file)
|
||||
else:
|
||||
self.__run__wait_for_subjobs(log_file)
|
||||
|
||||
f.write(f"{self.start_time.isoformat()} - Starting {self.engine.name()} {self.renderer_version} "
|
||||
f"render for {self.input_path}\n\n")
|
||||
f.write(f"Running command: {subprocess_cmds}\n")
|
||||
f.write('=' * 80 + '\n\n')
|
||||
|
||||
while True:
|
||||
# Log attempt #
|
||||
if failed_attempts:
|
||||
if failed_attempts >= self.maximum_attempts:
|
||||
err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
|
||||
logger.error(err_msg)
|
||||
self.status = RenderStatus.ERROR
|
||||
self.errors.append(err_msg)
|
||||
return
|
||||
else:
|
||||
f.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
|
||||
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
|
||||
self.status = RenderStatus.RUNNING
|
||||
|
||||
return_code = self.__setup_and_run_process(f, subprocess_cmds)
|
||||
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
|
||||
self.end_time = datetime.now()
|
||||
message = f"{self.engine.name()} render ended with status '{self.status}' " \
|
||||
f"after {self.time_elapsed()}"
|
||||
self.log_and_print(message, log_file)
|
||||
return
|
||||
|
||||
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
|
||||
f"after {self.time_elapsed()}\n\n"
|
||||
f.write(message)
|
||||
# Validate Output
|
||||
file_list_length = len(self.file_list())
|
||||
expected_list_length = (self.end_frame - self.start_frame + 1) if self.end_frame else 1
|
||||
|
||||
# Teardown
|
||||
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
|
||||
message = f"{self.engine.name()} render ended with status '{self.status}' " \
|
||||
f"after {self.time_elapsed()}"
|
||||
f.write(message)
|
||||
return
|
||||
if file_list_length not in (expected_list_length, 1):
|
||||
logger.error(f"Expected length: {expected_list_length} | actual length: {len(self.file_list())}")
|
||||
# todo: create new subjob
|
||||
|
||||
# if file output hasn't increased, return as error, otherwise restart process.
|
||||
file_count_has_increased = len(self.file_list()) > initial_file_count
|
||||
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
|
||||
message = (f"{'=' * 50}\n\n{self.engine.name()} render completed successfully in "
|
||||
f"{self.time_elapsed()}\n")
|
||||
f.write(message)
|
||||
break
|
||||
# Post Render Work
|
||||
if not self.parent:
|
||||
logger.debug(f"Starting post-processing work for {self}")
|
||||
self.post_processing()
|
||||
logger.debug(f"Completed post-processing work for {self}")
|
||||
|
||||
if return_code:
|
||||
err_msg = f"{self.engine.name()} render failed with code {return_code}"
|
||||
logger.error(err_msg)
|
||||
self.errors.append(err_msg)
|
||||
|
||||
# handle instances where renderer exits ok but doesnt generate files
|
||||
if not return_code and not file_count_has_increased:
|
||||
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
|
||||
f"Count is still {len(self.file_list())}")
|
||||
f.write(f'Error: {err_msg}\n\n')
|
||||
self.errors.append(err_msg)
|
||||
|
||||
# only count the attempt as failed if renderer creates no output - ignore error codes for now
|
||||
if not file_count_has_increased:
|
||||
failed_attempts += 1
|
||||
|
||||
if self.children:
|
||||
from src.distributed_job_manager import DistributedJobManager
|
||||
DistributedJobManager.wait_for_subjobs(local_job=self)
|
||||
|
||||
# Post Render Work
|
||||
logger.debug("Starting post-processing work")
|
||||
self.post_processing()
|
||||
self.status = RenderStatus.COMPLETED
|
||||
logger.info(f"Render {self.id}-{self.name} completed successfully after {self.time_elapsed()}")
|
||||
self.status = RenderStatus.COMPLETED
|
||||
self.end_time = datetime.now()
|
||||
message = f"Render {self.name} completed successfully after {self.time_elapsed()}"
|
||||
self.log_and_print(message, log_file)
|
||||
|
||||
def __setup_and_run_process(self, f, subprocess_cmds):
|
||||
|
||||
|
||||
Reference in New Issue
Block a user