Parent creates local subjobs instead of truncating original (#95)

* Parent worker now creates subjob on local host and waits for it

* Improve wait_for_subjobs logic

* Fix setting end_time for base_worker

* API cleanup

* Code refactoring

* Cleanup
This commit is contained in:
2024-08-10 21:19:01 -05:00
committed by GitHub
parent f9b51886ab
commit e757506787
3 changed files with 168 additions and 138 deletions

View File

@@ -161,13 +161,9 @@ def filtered_jobs_json(status_val):
@server.post('/api/job/<job_id>/send_subjob_update_notification') @server.post('/api/job/<job_id>/send_subjob_update_notification')
def subjob_update_notification(job_id): def subjob_update_notification(job_id):
try: subjob_details = request.json
subjob_details = request.json DistributedJobManager.handle_subjob_update_notification(RenderQueue.job_with_id(job_id), subjob_data=subjob_details)
logger.info(f"Subjob to job id: {job_id} is now {subjob_details['status']}") return Response(status=200)
DistributedJobManager.handle_subjob_update_notification(RenderQueue.job_with_id(job_id), subjob_data=subjob_details)
return Response(status=200)
except JobNotFoundError:
return "Job not found", 404
@server.get('/api/job/<job_id>') @server.get('/api/job/<job_id>')

View File

@@ -179,7 +179,7 @@ class DistributedJobManager:
if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent: if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path) cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path)
else: else:
logger.debug("Not splitting into subjobs") worker.status = RenderStatus.NOT_STARTED
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False)) RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
PreviewManager.update_previews_for_job(worker) PreviewManager.update_previews_for_job(worker)
@@ -211,11 +211,13 @@ class DistributedJobManager:
if old_status != subjob_status.value: if old_status != subjob_status.value:
logger.debug(f"Subjob status changed: {logname} -> {subjob_status.value}") logger.debug(f"Subjob status changed: {logname} -> {subjob_status.value}")
cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname) download_success = cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname)
if subjob_data['status'] == 'completed' and download_success:
local_job.children[subjob_key]['download_status'] = 'completed'
@staticmethod @staticmethod
def download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname): def download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname):
success = True
try: try:
local_files = [os.path.basename(x) for x in local_job.file_list()] local_files = [os.path.basename(x) for x in local_job.file_list()]
subjob_proxy = RenderServerProxy(subjob_hostname) subjob_proxy = RenderServerProxy(subjob_hostname)
@@ -231,8 +233,11 @@ class DistributedJobManager:
logger.debug(f'Downloaded successfully - {local_save_path}') logger.debug(f'Downloaded successfully - {local_save_path}')
except Exception as e: except Exception as e:
logger.error(f"Error downloading file '{subjob_filename}' from {subjob_hostname}: {e}") logger.error(f"Error downloading file '{subjob_filename}' from {subjob_hostname}: {e}")
success = False
except Exception as e: except Exception as e:
logger.exception(f'Uncaught exception while trying to download from subjob: {e}') logger.exception(f'Uncaught exception while trying to download from subjob: {e}')
success = False
return success
@staticmethod @staticmethod
def download_all_from_subjob(local_job, subjob_id, subjob_hostname): def download_all_from_subjob(local_job, subjob_id, subjob_hostname):
@@ -279,62 +284,67 @@ class DistributedJobManager:
return local_job.children[child_key].get('download_status', None) == 'complete' return local_job.children[child_key].get('download_status', None) == 'complete'
@classmethod @classmethod
def wait_for_subjobs(cls, local_job): def wait_for_subjobs(cls, parent_job):
# todo: rewrite this method logger.debug(f"Waiting for subjobs for job {parent_job}")
logger.debug(f"Waiting for subjobs for job {local_job}") parent_job.status = RenderStatus.WAITING_FOR_SUBJOBS
local_job.status = RenderStatus.WAITING_FOR_SUBJOBS
statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED] statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]
def subjobs_not_downloaded(): def subjobs_not_downloaded():
return {k: v for k, v in local_job.children.items() if 'download_status' not in v or return {k: v for k, v in parent_job.children.items() if 'download_status' not in v or
v['download_status'] == 'working' or v['download_status'] is None} v['download_status'] == 'working' or v['download_status'] is None}
logger.info(f'Waiting on {len(subjobs_not_downloaded())} subjobs for {local_job.id}') logger.info(f'Waiting on {len(subjobs_not_downloaded())} subjobs for {parent_job.id}')
while len(subjobs_not_downloaded()): server_delay = 10
for child_key, subjob_cached_data in subjobs_not_downloaded().items(): sleep_counter = 0
while parent_job.status == RenderStatus.WAITING_FOR_SUBJOBS:
subjob_id = child_key.split('@')[0] if sleep_counter % server_delay == 0: # only ping servers every x seconds
subjob_hostname = child_key.split('@')[-1] for child_key, subjob_cached_data in subjobs_not_downloaded().items():
# Fetch info from server and handle failing case subjob_id = child_key.split('@')[0]
subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id) subjob_hostname = child_key.split('@')[-1]
if not subjob_data:
logger.warning(f"No response from: {subjob_hostname}")
# todo: handle timeout / missing server situations
continue
# Update parent job cache but keep the download status # Fetch info from server and handle failing case
download_status = local_job.children[child_key].get('download_status', None) subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
local_job.children[child_key] = subjob_data if not subjob_data:
local_job.children[child_key]['download_status'] = download_status logger.warning(f"No response from {subjob_hostname}")
# timeout / missing server situations
parent_job.children[child_key]['download_status'] = f'error: No response from {subjob_hostname}'
continue
status = string_to_status(subjob_data.get('status', '')) # Update parent job cache but keep the download status
status_msg = f"Subjob {child_key} | {status} | " \ download_status = parent_job.children[child_key].get('download_status', None)
f"{float(subjob_data.get('percent_complete')) * 100.0}%" parent_job.children[child_key] = subjob_data
logger.debug(status_msg) parent_job.children[child_key]['download_status'] = download_status
# Still working in another thread - keep waiting status = string_to_status(subjob_data.get('status', ''))
if download_status == 'working': status_msg = f"Subjob {child_key} | {status} | " \
continue f"{float(subjob_data.get('percent_complete')) * 100.0}%"
logger.debug(status_msg)
# Check if job is finished, but has not had files copied over yet # Check if job is finished, but has not had files copied over yet
if download_status is None and subjob_data['file_count'] and status in statuses_to_download: if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
try: try:
cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname) cls.download_missing_frames_from_subjob(parent_job, subjob_id, subjob_hostname)
except Exception as e: parent_job.children[child_key]['download_status'] = 'complete'
logger.error(f"Error downloading missing frames from subjob: {e}") except Exception as e:
logger.error(f"Error downloading missing frames from subjob: {e}")
parent_job.children[child_key]['download_status'] = 'error: {}'
# Any finished jobs not successfully downloaded at this point are skipped # Any finished jobs not successfully downloaded at this point are skipped
if local_job.children[child_key].get('download_status', None) is None and \ if parent_job.children[child_key].get('download_status', None) is None and \
status in statuses_to_download: status in statuses_to_download:
logger.warning(f"Skipping waiting on downloading from subjob: {child_key}") logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
local_job.children[child_key]['download_status'] = 'skipped' parent_job.children[child_key]['download_status'] = 'skipped'
if subjobs_not_downloaded(): if subjobs_not_downloaded():
logger.debug(f"Waiting on {len(subjobs_not_downloaded())} subjobs on " logger.debug(f"Waiting on {len(subjobs_not_downloaded())} subjobs on "
f"{', '.join(list(subjobs_not_downloaded().keys()))}") f"{', '.join(list(subjobs_not_downloaded().keys()))}")
time.sleep(5) time.sleep(1)
sleep_counter += 1
else: # exit the loop
parent_job.status = RenderStatus.RUNNING
# -------------------------------------------- # --------------------------------------------
# Creating Subjobs # Creating Subjobs
@@ -366,9 +376,15 @@ class DistributedJobManager:
""" """
# Check availability # Check availability
parent_worker.status = RenderStatus.CONFIGURING available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer,
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer, system_os) system_os)
logger.debug(f"Splitting into subjobs - Available servers: {available_servers}") # skip if there are no external servers found
external_servers = [x for x in available_servers if x['hostname'] != parent_worker.hostname]
if not external_servers:
parent_worker.status = RenderStatus.NOT_STARTED
return
logger.debug(f"Splitting into subjobs - Available servers: {[x['hostname'] for x in available_servers]}")
all_subjob_server_data = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers) all_subjob_server_data = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)
# Prep and submit these sub-jobs # Prep and submit these sub-jobs
@@ -376,25 +392,19 @@ class DistributedJobManager:
try: try:
for subjob_data in all_subjob_server_data: for subjob_data in all_subjob_server_data:
subjob_hostname = subjob_data['hostname'] subjob_hostname = subjob_data['hostname']
if subjob_hostname != parent_worker.hostname: post_results = cls.__create_subjob(job_data, project_path, subjob_data, subjob_hostname,
post_results = cls.__create_subjob(job_data, project_path, subjob_data, subjob_hostname, parent_worker)
parent_worker) if not post_results.ok:
if not post_results.ok: ValueError(f"Failed to create subjob on {subjob_hostname}")
ValueError(f"Failed to create subjob on {subjob_hostname}")
# save child info # save child info
submission_results = post_results.json()[0] submission_results = post_results.json()[0]
child_key = f"{submission_results['id']}@{subjob_hostname}" child_key = f"{submission_results['id']}@{subjob_hostname}"
parent_worker.children[child_key] = submission_results parent_worker.children[child_key] = submission_results
else:
# truncate parent render_job
parent_worker.start_frame = max(subjob_data['frame_range'][0], parent_worker.start_frame)
parent_worker.end_frame = min(subjob_data['frame_range'][-1], parent_worker.end_frame)
logger.info(f"Local job now rendering from {parent_worker.start_frame} to {parent_worker.end_frame}")
# start subjobs # start subjobs
logger.debug(f"Created {len(all_subjob_server_data) - 1} subjobs successfully") logger.debug(f"Created {len(all_subjob_server_data)} subjobs successfully")
parent_worker.name = f"{parent_worker.name}[{parent_worker.start_frame}-{parent_worker.end_frame}]" parent_worker.name = f"{parent_worker.name} (Parent)"
parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts
except Exception as e: except Exception as e:
# cancel all the subjobs # cancel all the subjobs

View File

@@ -90,7 +90,7 @@ class BaseRenderWorker(Base):
self.end_time = None self.end_time = None
# History # History
self.status = RenderStatus.NOT_STARTED self.status = RenderStatus.CONFIGURING
self.warnings = [] self.warnings = []
self.errors = [] self.errors = []
@@ -158,7 +158,7 @@ class BaseRenderWorker(Base):
def start(self): def start(self):
if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED]: if self.status not in [RenderStatus.SCHEDULED, RenderStatus.NOT_STARTED, RenderStatus.CONFIGURING]:
logger.error(f"Trying to start job with status: {self.status}") logger.error(f"Trying to start job with status: {self.status}")
return return
@@ -176,90 +176,114 @@ class BaseRenderWorker(Base):
self.errors.append(msg) self.errors.append(msg)
return return
self.status = RenderStatus.RUNNING self.status = RenderStatus.RUNNING if not self.children else RenderStatus.WAITING_FOR_SUBJOBS
self.start_time = datetime.now() self.start_time = datetime.now()
self.__thread.start() self.__thread.start()
# handle multiple attempts at running subprocess
def __run__subprocess_cycle(self, log_file):
subprocess_cmds = self.generate_subprocess()
initial_file_count = len(self.file_list())
failed_attempts = 0
log_file.write(f"Running command: {subprocess_cmds}\n")
log_file.write('=' * 80 + '\n\n')
while True:
# Log attempt #
if failed_attempts:
if failed_attempts >= self.maximum_attempts:
err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
logger.error(err_msg)
self.status = RenderStatus.ERROR
self.errors.append(err_msg)
return
else:
log_file.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
self.status = RenderStatus.RUNNING
return_code = self.__setup_and_run_process(log_file, subprocess_cmds)
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \
f"after {self.time_elapsed()}\n\n"
log_file.write(message)
# don't try again if we've been cancelled
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
return
# if file output hasn't increased, return as error, otherwise restart process.
file_count_has_increased = len(self.file_list()) > initial_file_count
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code:
break
if return_code:
err_msg = f"{self.engine.name()} render failed with code {return_code}"
logger.error(err_msg)
self.errors.append(err_msg)
# handle instances where renderer exits ok but doesn't generate files
if not return_code and not file_count_has_increased:
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
f"Count is still {len(self.file_list())}")
log_file.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
# only count the attempt as failed if renderer creates no output - reset counter on successful output
failed_attempts = 0 if file_count_has_increased else failed_attempts + 1
def __run__wait_for_subjobs(self, logfile):
from src.distributed_job_manager import DistributedJobManager
DistributedJobManager.wait_for_subjobs(parent_job=self)
@staticmethod
def log_and_print(message, log_file):
logger.info(message)
log_file.write(f"{message}\n")
def __run(self): def __run(self):
logger.info(f'Starting {self.engine.name()} {self.renderer_version} Render for {self.input_path} | '
f'Frame Count: {self.total_frames}')
# Setup logging # Setup logging
log_dir = os.path.dirname(self.log_path()) log_dir = os.path.dirname(self.log_path())
os.makedirs(log_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True)
subprocess_cmds = self.generate_subprocess() with open(self.log_path(), "a") as log_file:
initial_file_count = len(self.file_list())
failed_attempts = 0
with open(self.log_path(), "a") as f: self.log_and_print(f"{self.start_time.isoformat()} - Starting "
f"{self.engine.name()} {self.renderer_version} render job for {self.name} "
f"({self.input_path})", log_file)
log_file.write(f"\n")
if not self.children:
self.__run__subprocess_cycle(log_file)
else:
self.__run__wait_for_subjobs(log_file)
f.write(f"{self.start_time.isoformat()} - Starting {self.engine.name()} {self.renderer_version} " if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]:
f"render for {self.input_path}\n\n")
f.write(f"Running command: {subprocess_cmds}\n")
f.write('=' * 80 + '\n\n')
while True:
# Log attempt #
if failed_attempts:
if failed_attempts >= self.maximum_attempts:
err_msg = f"Maximum attempts exceeded ({self.maximum_attempts})"
logger.error(err_msg)
self.status = RenderStatus.ERROR
self.errors.append(err_msg)
return
else:
f.write(f'\n{"=" * 20} Attempt #{failed_attempts + 1} {"=" * 20}\n\n')
logger.warning(f"Restarting render - Attempt #{failed_attempts + 1}")
self.status = RenderStatus.RUNNING
return_code = self.__setup_and_run_process(f, subprocess_cmds)
self.end_time = datetime.now() self.end_time = datetime.now()
message = f"{self.engine.name()} render ended with status '{self.status}' " \
f"after {self.time_elapsed()}"
self.log_and_print(message, log_file)
return
message = f"{'=' * 50}\n\n{self.engine.name()} render ended with code {return_code} " \ # Validate Output
f"after {self.time_elapsed()}\n\n" file_list_length = len(self.file_list())
f.write(message) expected_list_length = (self.end_frame - self.start_frame + 1) if self.end_frame else 1
# Teardown if file_list_length not in (expected_list_length, 1):
if self.status in [RenderStatus.CANCELLED, RenderStatus.ERROR]: logger.error(f"Expected length: {expected_list_length} | actual length: {len(self.file_list())}")
message = f"{self.engine.name()} render ended with status '{self.status}' " \ # todo: create new subjob
f"after {self.time_elapsed()}"
f.write(message)
return
# if file output hasn't increased, return as error, otherwise restart process. # Post Render Work
file_count_has_increased = len(self.file_list()) > initial_file_count if not self.parent:
if (self.status == RenderStatus.RUNNING) and file_count_has_increased and not return_code: logger.debug(f"Starting post-processing work for {self}")
message = (f"{'=' * 50}\n\n{self.engine.name()} render completed successfully in " self.post_processing()
f"{self.time_elapsed()}\n") logger.debug(f"Completed post-processing work for {self}")
f.write(message)
break
if return_code: self.status = RenderStatus.COMPLETED
err_msg = f"{self.engine.name()} render failed with code {return_code}" self.end_time = datetime.now()
logger.error(err_msg) message = f"Render {self.name} completed successfully after {self.time_elapsed()}"
self.errors.append(err_msg) self.log_and_print(message, log_file)
# handle instances where renderer exits ok but doesnt generate files
if not return_code and not file_count_has_increased:
err_msg = (f"{self.engine.name()} render exited ok, but file count has not increased. "
f"Count is still {len(self.file_list())}")
f.write(f'Error: {err_msg}\n\n')
self.errors.append(err_msg)
# only count the attempt as failed if renderer creates no output - ignore error codes for now
if not file_count_has_increased:
failed_attempts += 1
if self.children:
from src.distributed_job_manager import DistributedJobManager
DistributedJobManager.wait_for_subjobs(local_job=self)
# Post Render Work
logger.debug("Starting post-processing work")
self.post_processing()
self.status = RenderStatus.COMPLETED
logger.info(f"Render {self.id}-{self.name} completed successfully after {self.time_elapsed()}")
def __setup_and_run_process(self, f, subprocess_cmds): def __setup_and_run_process(self, f, subprocess_cmds):