mirror of
https://github.com/blw1138/Zordon.git
synced 2025-12-18 01:08:12 +00:00
Parent creates local subjobs instead of truncating original (#95)
* Parent worker now creates subjob on local host and waits for it
* Improve wait_for_subjobs logic
* Fix setting end_time for base_worker
* API cleanup
* Code refactoring
* Cleanup
This commit is contained in:
@@ -179,7 +179,7 @@ class DistributedJobManager:
|
||||
if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
|
||||
cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path)
|
||||
else:
|
||||
logger.debug("Not splitting into subjobs")
|
||||
worker.status = RenderStatus.NOT_STARTED
|
||||
|
||||
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
|
||||
PreviewManager.update_previews_for_job(worker)
|
||||
@@ -211,11 +211,13 @@ class DistributedJobManager:
|
||||
if old_status != subjob_status.value:
|
||||
logger.debug(f"Subjob status changed: {logname} -> {subjob_status.value}")
|
||||
|
||||
cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname)
|
||||
download_success = cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname)
|
||||
if subjob_data['status'] == 'completed' and download_success:
|
||||
local_job.children[subjob_key]['download_status'] = 'completed'
|
||||
|
||||
@staticmethod
|
||||
def download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname):
|
||||
|
||||
success = True
|
||||
try:
|
||||
local_files = [os.path.basename(x) for x in local_job.file_list()]
|
||||
subjob_proxy = RenderServerProxy(subjob_hostname)
|
||||
@@ -231,8 +233,11 @@ class DistributedJobManager:
|
||||
logger.debug(f'Downloaded successfully - {local_save_path}')
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading file '{subjob_filename}' from {subjob_hostname}: {e}")
|
||||
success = False
|
||||
except Exception as e:
|
||||
logger.exception(f'Uncaught exception while trying to download from subjob: {e}')
|
||||
success = False
|
||||
return success
|
||||
|
||||
@staticmethod
|
||||
def download_all_from_subjob(local_job, subjob_id, subjob_hostname):
|
||||
@@ -279,62 +284,67 @@ class DistributedJobManager:
|
||||
return local_job.children[child_key].get('download_status', None) == 'complete'
|
||||
|
||||
@classmethod
|
||||
def wait_for_subjobs(cls, local_job):
|
||||
# todo: rewrite this method
|
||||
logger.debug(f"Waiting for subjobs for job {local_job}")
|
||||
local_job.status = RenderStatus.WAITING_FOR_SUBJOBS
|
||||
def wait_for_subjobs(cls, parent_job):
|
||||
logger.debug(f"Waiting for subjobs for job {parent_job}")
|
||||
parent_job.status = RenderStatus.WAITING_FOR_SUBJOBS
|
||||
statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]
|
||||
|
||||
def subjobs_not_downloaded():
|
||||
return {k: v for k, v in local_job.children.items() if 'download_status' not in v or
|
||||
return {k: v for k, v in parent_job.children.items() if 'download_status' not in v or
|
||||
v['download_status'] == 'working' or v['download_status'] is None}
|
||||
|
||||
logger.info(f'Waiting on {len(subjobs_not_downloaded())} subjobs for {local_job.id}')
|
||||
logger.info(f'Waiting on {len(subjobs_not_downloaded())} subjobs for {parent_job.id}')
|
||||
|
||||
while len(subjobs_not_downloaded()):
|
||||
for child_key, subjob_cached_data in subjobs_not_downloaded().items():
|
||||
server_delay = 10
|
||||
sleep_counter = 0
|
||||
while parent_job.status == RenderStatus.WAITING_FOR_SUBJOBS:
|
||||
|
||||
subjob_id = child_key.split('@')[0]
|
||||
subjob_hostname = child_key.split('@')[-1]
|
||||
if sleep_counter % server_delay == 0: # only ping servers every x seconds
|
||||
for child_key, subjob_cached_data in subjobs_not_downloaded().items():
|
||||
|
||||
# Fetch info from server and handle failing case
|
||||
subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
|
||||
if not subjob_data:
|
||||
logger.warning(f"No response from: {subjob_hostname}")
|
||||
# todo: handle timeout / missing server situations
|
||||
continue
|
||||
subjob_id = child_key.split('@')[0]
|
||||
subjob_hostname = child_key.split('@')[-1]
|
||||
|
||||
# Update parent job cache but keep the download status
|
||||
download_status = local_job.children[child_key].get('download_status', None)
|
||||
local_job.children[child_key] = subjob_data
|
||||
local_job.children[child_key]['download_status'] = download_status
|
||||
# Fetch info from server and handle failing case
|
||||
subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
|
||||
if not subjob_data:
|
||||
logger.warning(f"No response from {subjob_hostname}")
|
||||
# timeout / missing server situations
|
||||
parent_job.children[child_key]['download_status'] = f'error: No response from {subjob_hostname}'
|
||||
continue
|
||||
|
||||
status = string_to_status(subjob_data.get('status', ''))
|
||||
status_msg = f"Subjob {child_key} | {status} | " \
|
||||
f"{float(subjob_data.get('percent_complete')) * 100.0}%"
|
||||
logger.debug(status_msg)
|
||||
# Update parent job cache but keep the download status
|
||||
download_status = parent_job.children[child_key].get('download_status', None)
|
||||
parent_job.children[child_key] = subjob_data
|
||||
parent_job.children[child_key]['download_status'] = download_status
|
||||
|
||||
# Still working in another thread - keep waiting
|
||||
if download_status == 'working':
|
||||
continue
|
||||
status = string_to_status(subjob_data.get('status', ''))
|
||||
status_msg = f"Subjob {child_key} | {status} | " \
|
||||
f"{float(subjob_data.get('percent_complete')) * 100.0}%"
|
||||
logger.debug(status_msg)
|
||||
|
||||
# Check if job is finished, but has not had its files copied over yet
|
||||
if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
|
||||
try:
|
||||
cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname)
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading missing frames from subjob: {e}")
|
||||
# Check if job is finished, but has not had its files copied over yet
|
||||
if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
|
||||
try:
|
||||
cls.download_missing_frames_from_subjob(parent_job, subjob_id, subjob_hostname)
|
||||
parent_job.children[child_key]['download_status'] = 'complete'
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading missing frames from subjob: {e}")
|
||||
parent_job.children[child_key]['download_status'] = 'error: {}'
|
||||
|
||||
# Any finished jobs not successfully downloaded at this point are skipped
|
||||
if local_job.children[child_key].get('download_status', None) is None and \
|
||||
status in statuses_to_download:
|
||||
logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
|
||||
local_job.children[child_key]['download_status'] = 'skipped'
|
||||
# Any finished jobs not successfully downloaded at this point are skipped
|
||||
if parent_job.children[child_key].get('download_status', None) is None and \
|
||||
status in statuses_to_download:
|
||||
logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
|
||||
parent_job.children[child_key]['download_status'] = 'skipped'
|
||||
|
||||
if subjobs_not_downloaded():
|
||||
logger.debug(f"Waiting on {len(subjobs_not_downloaded())} subjobs on "
|
||||
f"{', '.join(list(subjobs_not_downloaded().keys()))}")
|
||||
time.sleep(5)
|
||||
time.sleep(1)
|
||||
sleep_counter += 1
|
||||
else: # exit the loop
|
||||
parent_job.status = RenderStatus.RUNNING
|
||||
|
||||
# --------------------------------------------
|
||||
# Creating Subjobs
|
||||
@@ -366,9 +376,15 @@ class DistributedJobManager:
|
||||
"""
|
||||
|
||||
# Check availability
|
||||
parent_worker.status = RenderStatus.CONFIGURING
|
||||
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer, system_os)
|
||||
logger.debug(f"Splitting into subjobs - Available servers: {available_servers}")
|
||||
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer,
|
||||
system_os)
|
||||
# skip if there are no external servers found
|
||||
external_servers = [x for x in available_servers if x['hostname'] != parent_worker.hostname]
|
||||
if not external_servers:
|
||||
parent_worker.status = RenderStatus.NOT_STARTED
|
||||
return
|
||||
|
||||
logger.debug(f"Splitting into subjobs - Available servers: {[x['hostname'] for x in available_servers]}")
|
||||
all_subjob_server_data = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)
|
||||
|
||||
# Prep and submit these sub-jobs
|
||||
@@ -376,25 +392,19 @@ class DistributedJobManager:
|
||||
try:
|
||||
for subjob_data in all_subjob_server_data:
|
||||
subjob_hostname = subjob_data['hostname']
|
||||
if subjob_hostname != parent_worker.hostname:
|
||||
post_results = cls.__create_subjob(job_data, project_path, subjob_data, subjob_hostname,
|
||||
parent_worker)
|
||||
if not post_results.ok:
|
||||
ValueError(f"Failed to create subjob on {subjob_hostname}")
|
||||
post_results = cls.__create_subjob(job_data, project_path, subjob_data, subjob_hostname,
|
||||
parent_worker)
|
||||
if not post_results.ok:
|
||||
ValueError(f"Failed to create subjob on {subjob_hostname}")
|
||||
|
||||
# save child info
|
||||
submission_results = post_results.json()[0]
|
||||
child_key = f"{submission_results['id']}@{subjob_hostname}"
|
||||
parent_worker.children[child_key] = submission_results
|
||||
else:
|
||||
# truncate parent render_job
|
||||
parent_worker.start_frame = max(subjob_data['frame_range'][0], parent_worker.start_frame)
|
||||
parent_worker.end_frame = min(subjob_data['frame_range'][-1], parent_worker.end_frame)
|
||||
logger.info(f"Local job now rendering from {parent_worker.start_frame} to {parent_worker.end_frame}")
|
||||
# save child info
|
||||
submission_results = post_results.json()[0]
|
||||
child_key = f"{submission_results['id']}@{subjob_hostname}"
|
||||
parent_worker.children[child_key] = submission_results
|
||||
|
||||
# start subjobs
|
||||
logger.debug(f"Created {len(all_subjob_server_data) - 1} subjobs successfully")
|
||||
parent_worker.name = f"{parent_worker.name}[{parent_worker.start_frame}-{parent_worker.end_frame}]"
|
||||
logger.debug(f"Created {len(all_subjob_server_data)} subjobs successfully")
|
||||
parent_worker.name = f"{parent_worker.name} (Parent)"
|
||||
parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts
|
||||
except Exception as e:
|
||||
# cancel all the subjobs
|
||||
|
||||
Reference in New Issue
Block a user