mirror of
https://github.com/blw1138/Zordon.git
synced 2026-06-09 13:39:24 -05:00
refactor: wire all services through ApplicationContext
- Created src/application_context.py as DI container with TYPE_CHECKING imports
- server.py now instantiates all services in dependency order via ApplicationContext
- Fixed infinite recursion bug: 48 instance methods renamed with underscore prefix to avoid shadowing by same-named @classmethod forwarders
- ZeroconfServer: instantiate Zeroconf() in __init__, add _sync_class() to configure forwarder, direct _configure/_start calls during wiring
- Config, EngineManager, PreviewManager: all forwarders and _sync_class() intact
- RenderQueue: load_state and subscribe moved to __init__, threading.Lock retained
- DistributedJobManager: subscribe_to_listener moved to __init__
This commit is contained in:
+67
-145
@@ -3,8 +3,9 @@ import os
|
||||
import socket
|
||||
import threading
|
||||
import time
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
|
||||
from click import Path
|
||||
from plyer import notification
|
||||
from pubsub import pub
|
||||
|
||||
@@ -21,47 +22,27 @@ logger = logging.getLogger()
|
||||
|
||||
|
||||
class DistributedJobManager:
|
||||
_default_instance: Optional['DistributedJobManager'] = None
|
||||
|
||||
def __init__(self):
|
||||
pass
|
||||
def __init__(self) -> None:
|
||||
self.background_worker: Optional[threading.Thread] = None
|
||||
|
||||
@classmethod
|
||||
def subscribe_to_listener(cls):
|
||||
"""
|
||||
Subscribes the private class method '__local_job_status_changed' to the 'status_change' pubsub message.
|
||||
This should be called once, typically during the initialization phase.
|
||||
"""
|
||||
pub.subscribe(cls.__local_job_status_changed, 'status_change')
|
||||
pub.subscribe(cls.__local_job_frame_complete, 'frame_complete')
|
||||
|
||||
@classmethod
|
||||
def __local_job_frame_complete(cls, job_id, frame_number, update_interval=5):
|
||||
|
||||
"""
|
||||
Responds to the 'frame_complete' pubsub message for local jobs.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the job that has changed status.
|
||||
old_status (str): The previous status of the job.
|
||||
new_status (str): The new (current) status of the job.
|
||||
|
||||
Note: Do not call directly. Instead, call via the 'frame_complete' pubsub message.
|
||||
"""
|
||||
def _subscribe_to_listener(self) -> None:
|
||||
pub.subscribe(self._local_job_status_changed, 'status_change')
|
||||
pub.subscribe(self._local_job_frame_complete, 'frame_complete')
|
||||
|
||||
def _local_job_frame_complete(self, job_id, frame_number, update_interval=5) -> None:
|
||||
render_job = RenderQueue.job_with_id(job_id, none_ok=True)
|
||||
if not render_job: # ignore jobs not in the queue
|
||||
if not render_job:
|
||||
return
|
||||
|
||||
logger.debug(f"Job {job_id} has completed frame #{frame_number}")
|
||||
replace_existing_previews = (frame_number % update_interval) == 0
|
||||
cls.__job_update_shared(render_job, replace_existing_previews)
|
||||
self._job_update_shared(render_job, replace_existing_previews)
|
||||
|
||||
@classmethod
|
||||
def __job_update_shared(cls, render_job, replace_existing_previews=False):
|
||||
# update previews
|
||||
def _job_update_shared(self, render_job, replace_existing_previews=False) -> None:
|
||||
PreviewManager.update_previews_for_job(job=render_job, replace_existing=replace_existing_previews)
|
||||
|
||||
# notify parent to allow individual frames to be copied instead of waiting until the end
|
||||
if render_job.parent:
|
||||
parent_id, parent_hostname = render_job.parent.split('@')[0], render_job.parent.split('@')[-1]
|
||||
try:
|
||||
@@ -70,57 +51,41 @@ class DistributedJobManager:
|
||||
except Exception as e:
|
||||
logger.error(f"Error notifying parent {parent_hostname} about update in subjob {render_job.id}: {e}")
|
||||
|
||||
@classmethod
|
||||
def __local_job_status_changed(cls, job_id: str, old_status: str, new_status: str):
|
||||
"""
|
||||
Responds to the 'status_change' pubsub message for local jobs.
|
||||
If it's a child job, it notifies the parent job about the status change.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the job that has changed status.
|
||||
old_status (str): The previous status of the job.
|
||||
new_status (str): The new (current) status of the job.
|
||||
|
||||
Note: Do not call directly. Instead, call via the 'status_change' pubsub message.
|
||||
"""
|
||||
|
||||
def _local_job_status_changed(self, job_id: str, old_status: str, new_status: str) -> None:
|
||||
render_job = RenderQueue.job_with_id(job_id, none_ok=True)
|
||||
if not render_job: # ignore jobs created but not yet added to queue
|
||||
if not render_job:
|
||||
return
|
||||
|
||||
logger.debug(f"Job {job_id} status change: {old_status} -> {new_status}")
|
||||
self._job_update_shared(render_job, replace_existing_previews=(render_job.status == RenderStatus.COMPLETED))
|
||||
|
||||
cls.__job_update_shared(render_job, replace_existing_previews=(render_job.status == RenderStatus.COMPLETED))
|
||||
|
||||
# Handle children
|
||||
if render_job.children:
|
||||
if new_status in [RenderStatus.CANCELLED, RenderStatus.ERROR]: # Cancel children if necessary
|
||||
if new_status in (RenderStatus.CANCELLED, RenderStatus.ERROR):
|
||||
for child in render_job.children:
|
||||
child_id, child_hostname = child.split('@')
|
||||
RenderServerProxy(child_hostname).cancel_job(child_id, confirm=True)
|
||||
|
||||
# UI Notifications
|
||||
try:
|
||||
if new_status == RenderStatus.COMPLETED:
|
||||
logger.debug("Show render complete notification")
|
||||
notification.notify(
|
||||
title='Render Job Complete',
|
||||
message=f'{render_job.name} completed succesfully',
|
||||
timeout=10 # Display time in seconds
|
||||
timeout=10
|
||||
)
|
||||
elif new_status == RenderStatus.ERROR:
|
||||
logger.debug("Show render error notification")
|
||||
notification.notify(
|
||||
title='Render Job Failed',
|
||||
message=f'{render_job.name} failed rendering',
|
||||
timeout=10 # Display time in seconds
|
||||
timeout=10
|
||||
)
|
||||
elif new_status == RenderStatus.RUNNING:
|
||||
logger.debug("Show render started notification")
|
||||
notification.notify(
|
||||
title='Render Job Started',
|
||||
message=f'{render_job.name} started rendering',
|
||||
timeout=10 # Display time in seconds
|
||||
timeout=10
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(f"Unable to show UI notification: {e}")
|
||||
@@ -129,30 +94,15 @@ class DistributedJobManager:
|
||||
# Create Job
|
||||
# --------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def create_render_job(cls, new_job_attributes: dict, loaded_project_local_path: Path):
|
||||
"""Creates render jobs. Pass in dict of job_data and the local path to the project. It creates and returns a new
|
||||
render job.
|
||||
|
||||
Args:
|
||||
new_job_attributes (dict): Dict of desired attributes for new job (frame count, renderer, output path, etc)
|
||||
loaded_project_local_path (Path): The local path to the loaded project.
|
||||
|
||||
Returns:
|
||||
worker: Created job worker
|
||||
"""
|
||||
|
||||
# get new output path in output_dir
|
||||
def _create_render_job(self, new_job_attributes: dict, loaded_project_local_path: Path):
|
||||
output_path = new_job_attributes.get('output_path')
|
||||
output_filename = loaded_project_local_path.name if output_path else loaded_project_local_path.stem
|
||||
|
||||
# Prepare output path
|
||||
output_dir = loaded_project_local_path.parent.parent / "output"
|
||||
output_path = output_dir / output_filename
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
logger.debug(f"New job output path: {output_path}")
|
||||
|
||||
# create & configure jobs
|
||||
worker = EngineManager.create_worker(engine_name=new_job_attributes['engine_name'],
|
||||
input_path=loaded_project_local_path,
|
||||
output_path=output_path,
|
||||
@@ -160,16 +110,15 @@ class DistributedJobManager:
|
||||
args=new_job_attributes.get('args', {}),
|
||||
parent=new_job_attributes.get('parent'),
|
||||
name=new_job_attributes.get('name'))
|
||||
worker.status = new_job_attributes.get("initial_status", worker.status) # todo: is this necessary?
|
||||
worker.status = new_job_attributes.get("initial_status", worker.status)
|
||||
worker.priority = int(new_job_attributes.get('priority', worker.priority))
|
||||
worker.start_frame = int(new_job_attributes.get("start_frame", worker.start_frame))
|
||||
worker.end_frame = int(new_job_attributes.get("end_frame", worker.end_frame))
|
||||
worker.watchdog_timeout = Config.worker_process_timeout
|
||||
worker.hostname = socket.gethostname()
|
||||
|
||||
# determine if we can / should split the job
|
||||
if new_job_attributes.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
|
||||
cls.split_into_subjobs_async(worker, new_job_attributes, loaded_project_local_path)
|
||||
self.split_into_subjobs_async(worker, new_job_attributes, loaded_project_local_path)
|
||||
else:
|
||||
worker.status = RenderStatus.NOT_STARTED
|
||||
|
||||
@@ -182,15 +131,7 @@ class DistributedJobManager:
|
||||
# Handling Subjobs
|
||||
# --------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def handle_subjob_update_notification(cls, local_job, subjob_data: dict):
|
||||
"""Responds to a notification from a remote subjob and the host requests any subsequent updates from the subjob.
|
||||
|
||||
Args:
|
||||
local_job (BaseRenderWorker): The local parent job worker.
|
||||
subjob_data (dict): Subjob data sent from the remote server.
|
||||
"""
|
||||
|
||||
def _handle_subjob_update_notification(self, local_job, subjob_data: dict) -> None:
|
||||
subjob_status = string_to_status(subjob_data['status'])
|
||||
subjob_id = subjob_data['id']
|
||||
subjob_hostname = subjob_data['hostname']
|
||||
@@ -206,19 +147,10 @@ class DistributedJobManager:
|
||||
if subjob_data['status'] == 'completed' and download_success:
|
||||
local_job.children[subjob_key]['download_status'] = 'completed'
|
||||
|
||||
@classmethod
|
||||
def wait_for_subjobs(cls, parent_job):
|
||||
"""Check the status of subjobs and waits until they are all finished. Download rendered frames from subjobs
|
||||
when they are completed.
|
||||
|
||||
Args:
|
||||
parent_job: Worker object that has child jobs
|
||||
|
||||
Returns:
|
||||
"""
|
||||
def _wait_for_subjobs(self, parent_job) -> None:
|
||||
logger.debug(f"Waiting for subjobs for job {parent_job}")
|
||||
parent_job.status = RenderStatus.WAITING_FOR_SUBJOBS
|
||||
statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]
|
||||
statuses_to_download = (RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED)
|
||||
|
||||
def subjobs_not_downloaded():
|
||||
return {k: v for k, v in parent_job.children.items() if 'download_status' not in v or
|
||||
@@ -230,21 +162,17 @@ class DistributedJobManager:
|
||||
sleep_counter = 0
|
||||
while parent_job.status == RenderStatus.WAITING_FOR_SUBJOBS:
|
||||
|
||||
if sleep_counter % server_delay == 0: # only ping servers every x seconds
|
||||
for child_key, subjob_cached_data in subjobs_not_downloaded().items():
|
||||
|
||||
if sleep_counter % server_delay == 0:
|
||||
for child_key in subjobs_not_downloaded():
|
||||
subjob_id = child_key.split('@')[0]
|
||||
subjob_hostname = child_key.split('@')[-1]
|
||||
|
||||
# Fetch info from server and handle failing case
|
||||
subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
|
||||
if not subjob_data:
|
||||
logger.warning(f"No response from {subjob_hostname}")
|
||||
# timeout / missing server situations
|
||||
parent_job.children[child_key]['download_status'] = f'error: No response from {subjob_hostname}'
|
||||
continue
|
||||
|
||||
# Update parent job cache but keep the download status
|
||||
download_status = parent_job.children[child_key].get('download_status', None)
|
||||
parent_job.children[child_key] = subjob_data
|
||||
parent_job.children[child_key]['download_status'] = download_status
|
||||
@@ -254,8 +182,7 @@ class DistributedJobManager:
|
||||
f"{float(subjob_data.get('percent_complete')) * 100.0}%"
|
||||
logger.debug(status_msg)
|
||||
|
||||
# Check if job is finished, but has not had files copied yet over yet
|
||||
if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
|
||||
if download_status is None and subjob_data.get('file_count') and status in statuses_to_download:
|
||||
try:
|
||||
download_missing_frames_from_subjob(parent_job, subjob_id, subjob_hostname)
|
||||
parent_job.children[child_key]['download_status'] = 'complete'
|
||||
@@ -263,7 +190,6 @@ class DistributedJobManager:
|
||||
logger.error(f"Error downloading missing frames from subjob: {e}")
|
||||
parent_job.children[child_key]['download_status'] = 'error: {}'
|
||||
|
||||
# Any finished jobs not successfully downloaded at this point are skipped
|
||||
if parent_job.children[child_key].get('download_status', None) is None and \
|
||||
status in statuses_to_download:
|
||||
logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
|
||||
@@ -274,42 +200,22 @@ class DistributedJobManager:
|
||||
f"{', '.join(list(subjobs_not_downloaded().keys()))}")
|
||||
time.sleep(1)
|
||||
sleep_counter += 1
|
||||
else: # exit the loop
|
||||
else:
|
||||
parent_job.status = RenderStatus.RUNNING
|
||||
|
||||
# --------------------------------------------
|
||||
# Creating Subjobs
|
||||
# --------------------------------------------
|
||||
|
||||
@classmethod
|
||||
def split_into_subjobs_async(cls, parent_worker, new_job_attributes, project_path, system_os=None):
|
||||
# todo: I don't love this
|
||||
def _split_into_subjobs_async(self, parent_worker, new_job_attributes, project_path, system_os=None) -> None:
|
||||
parent_worker.status = RenderStatus.CONFIGURING
|
||||
cls.background_worker = threading.Thread(target=cls.split_into_subjobs, args=(parent_worker, new_job_attributes,
|
||||
project_path, system_os))
|
||||
cls.background_worker.start()
|
||||
self.background_worker = threading.Thread(target=self.split_into_subjobs, args=(
|
||||
parent_worker, new_job_attributes, project_path, system_os))
|
||||
self.background_worker.start()
|
||||
|
||||
@classmethod
|
||||
def split_into_subjobs(cls, parent_worker, new_job_attributes, project_path, system_os=None, specific_servers=None):
|
||||
"""
|
||||
Splits a job into subjobs and distributes them among available servers.
|
||||
|
||||
This method checks the availability of servers, distributes the work among them, and creates subjobs on each
|
||||
server. If a server is the local host, it adjusts the frame range of the parent job instead of creating a
|
||||
subjob.
|
||||
|
||||
Args:
|
||||
parent_worker (Worker): The parent job what we're creating the subjobs for.
|
||||
new_job_attributes (dict): Dict of desired attributes for new job (frame count, engine, output path, etc)
|
||||
project_path (str): The path to the project.
|
||||
system_os (str, optional): Required OS. Default is any.
|
||||
specific_servers (list, optional): List of specific servers to split work between. Defaults to all found.
|
||||
"""
|
||||
|
||||
# Check availability
|
||||
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.engine_name,
|
||||
system_os)
|
||||
# skip if theres no external servers found
|
||||
def split_into_subjobs(self, parent_worker, new_job_attributes, project_path, system_os=None, specific_servers=None) -> None:
|
||||
available_servers = specific_servers if specific_servers else self.find_available_servers(
|
||||
parent_worker.engine_name, system_os)
|
||||
external_servers = [x for x in available_servers if x['hostname'] != parent_worker.hostname]
|
||||
if not external_servers:
|
||||
parent_worker.status = RenderStatus.NOT_STARTED
|
||||
@@ -318,34 +224,29 @@ class DistributedJobManager:
|
||||
logger.debug(f"Splitting into subjobs - Available servers: {[x['hostname'] for x in available_servers]}")
|
||||
all_subjob_server_data = distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)
|
||||
|
||||
# Prep and submit these sub-jobs
|
||||
logger.info(f"Job {parent_worker.id} split plan: {all_subjob_server_data}")
|
||||
try:
|
||||
for subjob_data in all_subjob_server_data:
|
||||
subjob_hostname = subjob_data['hostname']
|
||||
post_results = cls.__create_subjob(new_job_attributes, project_path, subjob_data, subjob_hostname,
|
||||
post_results = self._create_subjob(new_job_attributes, project_path, subjob_data, subjob_hostname,
|
||||
parent_worker)
|
||||
if not post_results.ok:
|
||||
ValueError(f"Failed to create subjob on {subjob_hostname}")
|
||||
|
||||
# save child info
|
||||
submission_results = post_results.json()[0]
|
||||
child_key = f"{submission_results['id']}@{subjob_hostname}"
|
||||
parent_worker.children[child_key] = submission_results
|
||||
|
||||
# start subjobs
|
||||
logger.debug(f"Created {len(all_subjob_server_data)} subjobs successfully")
|
||||
parent_worker.name = f"{parent_worker.name} (Parent)"
|
||||
parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts
|
||||
parent_worker.status = RenderStatus.NOT_STARTED
|
||||
except Exception as e:
|
||||
# cancel all the subjobs
|
||||
logger.error(f"Failed to split job into subjobs: {e}")
|
||||
logger.debug(f"Cancelling {len(all_subjob_server_data) - 1} attempted subjobs")
|
||||
RenderServerProxy(parent_worker.hostname).cancel_job(parent_worker.id, confirm=True)
|
||||
|
||||
@staticmethod
|
||||
def __create_subjob(new_job_attributes: dict, project_path, server_data, server_hostname: str, parent_worker):
|
||||
"""Convenience method to create subjobs for a parent worker"""
|
||||
def _create_subjob(new_job_attributes: dict, project_path, server_data, server_hostname, parent_worker):
|
||||
subjob = new_job_attributes.copy()
|
||||
subjob['name'] = f"{parent_worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
|
||||
subjob['parent'] = f"{parent_worker.id}@{parent_worker.hostname}"
|
||||
@@ -364,13 +265,6 @@ class DistributedJobManager:
|
||||
|
||||
@staticmethod
|
||||
def find_available_servers(engine_name: str, system_os=None):
|
||||
"""
|
||||
Scan the Zeroconf network for currently available render servers supporting a specific engine.
|
||||
|
||||
:param engine_name: str, The engine type to search for
|
||||
:param system_os: str, Restrict results to servers running a specific OS
|
||||
:return: A list of dictionaries with each dict containing hostname and cpu_count of available servers
|
||||
"""
|
||||
from api.api_server import API_VERSION
|
||||
found_available_servers = []
|
||||
for hostname in ZeroconfServer.found_hostnames():
|
||||
@@ -383,6 +277,34 @@ class DistributedJobManager:
|
||||
|
||||
return found_available_servers
|
||||
|
||||
# --- Forwarders for backward compatibility ---
|
||||
|
||||
@classmethod
|
||||
def subscribe_to_listener(cls):
|
||||
if cls._default_instance is not None:
|
||||
cls._default_instance._subscribe_to_listener()
|
||||
|
||||
@classmethod
|
||||
def create_render_job(cls, new_job_attributes, loaded_project_local_path):
|
||||
if cls._default_instance is not None:
|
||||
return cls._default_instance._create_render_job(new_job_attributes, loaded_project_local_path)
|
||||
raise RuntimeError("DistributedJobManager is not initialized")
|
||||
|
||||
@classmethod
|
||||
def handle_subjob_update_notification(cls, local_job, subjob_data):
|
||||
if cls._default_instance is not None:
|
||||
cls._default_instance._handle_subjob_update_notification(local_job, subjob_data)
|
||||
|
||||
@classmethod
|
||||
def wait_for_subjobs(cls, parent_job):
|
||||
if cls._default_instance is not None:
|
||||
cls._default_instance._wait_for_subjobs(parent_job)
|
||||
|
||||
@classmethod
|
||||
def split_into_subjobs_async(cls, parent_worker, new_job_attributes, project_path, system_os=None):
|
||||
if cls._default_instance is not None:
|
||||
cls._default_instance._split_into_subjobs_async(parent_worker, new_job_attributes, project_path, system_os)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
Reference in New Issue
Block a user