Mirror of https://github.com/blw1138/Zordon.git (synced 2025-12-17 16:58:12 +00:00)
Refactor: DistributedJobManager with pub/sub status change notifications (#25)
* Add pubsub to render_queue and base_worker
* Refactor: Convert ZeroconfServer to a singleton with class methods
* New API for subjob servers to notify parent job servers of status changes
* Refactor: Move all subjob-related methods to distributed_job_manager.py
* Rewrite wait_for_subjobs
* Fix: DistributedJobManager.find_available_servers() takes 1 positional argument but 3 were given
* DistributedJobManager now notifies and is notified about background job changes
* Fix the make_ready API; change the children key name to id@hostname so it stays unique (see the sketch below)
* Fixes
* Image-sequence-to-movie conversion needs to find the actual start frame
* Fix: subjob_status_change did not return a valid response
* Fix client renderer selection
* Small fix for subjob status checking
* Fix issue with divide_frames_equally
* Fix issue where downloads were not occurring
* Fix issue where old status was being reported
* Add docstrings and code cleanup
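The id@hostname child key mentioned above can be pictured with a small sketch. The helper names make_child_key and split_child_key are hypothetical and not from the repository; only the "id@hostname" format itself comes from the commit message. The point is that two hosts running subjobs with the same local id no longer collide as dictionary keys.

# Hypothetical helpers; only the "id@hostname" format is taken from the commit message
def make_child_key(job_id, hostname):
    return f"{job_id}@{hostname}"

def split_child_key(key):
    job_id, hostname = key.split('@', 1)
    return job_id, hostname

# Two subjobs that share id 7 on different hosts stay distinct
children = {make_child_key(7, 'render01.local'): 'scheduled',
            make_child_key(7, 'render02.local'): 'scheduled'}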
@@ -5,12 +5,11 @@ import logging
import os
import subprocess
import threading
import time
import zipfile
from datetime import datetime
from enum import Enum

import psutil
from pubsub import pub
from sqlalchemy import Column, Integer, String, DateTime, JSON
from sqlalchemy.ext.declarative import declarative_base

@@ -27,8 +26,8 @@ class RenderStatus(Enum):
    CANCELLED = "cancelled"
    ERROR = "error"
    SCHEDULED = "scheduled"
    WAITING = "waiting"
    NOT_READY = "not_ready"
    WAITING_FOR_SUBJOBS = "waiting_for_subjobs"
    CONFIGURING = "configuring"
    UNDEFINED = "undefined"

@@ -101,7 +100,7 @@ class BaseRenderWorker(Base):
        self.end_time = None

        # History
        self.status = RenderStatus.NOT_READY
        self.status = RenderStatus.CONFIGURING
        self.warnings = []
        self.errors = []

@@ -120,8 +119,11 @@ class BaseRenderWorker(Base):
        return self._status

    @status.setter
    def status(self, value):
        self._status = value.value
    def status(self, new_status):
        if self._status != new_status.value:
            old_status = self._status
            self._status = new_status.value
            pub.sendMessage('status_change', job_id=self.id, old_status=old_status, new_status=new_status)

    @status.getter
    def status(self):
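The new setter above publishes a PyPubSub message on the 'status_change' topic only when the status actually changes. Any interested component (the commit message suggests the render queue and the DistributedJobManager) can receive it with pub.subscribe. The listener below is an illustrative sketch, not code from the repository; note that old_status arrives as the stored string while new_status is a RenderStatus member.

from pubsub import pub

def on_status_change(job_id, old_status, new_status):
    # Keyword names must match those passed to pub.sendMessage in the setter above
    print(f"Job {job_id}: {old_status} -> {new_status.value}")

pub.subscribe(on_status_change, 'status_change')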
@@ -230,45 +232,9 @@ class BaseRenderWorker(Base):
            logger.info(message)
            f.write(message)

        from lib.server.server_proxy import RenderServerProxy

        # Wait on children jobs, if necessary
        if self.children:
            self.status = RenderStatus.WAITING
            subjobs_still_running = self.children.copy()
            while len(subjobs_still_running):
                for hostname, job_id in subjobs_still_running.copy().items():
                    proxy = RenderServerProxy(hostname)
                    response = proxy.get_job_info(job_id)
                    if not response:
                        logger.warning(f"No response from: {hostname}")
                    else:
                        status = string_to_status(response.get('status', ''))
                        status_msg = f"Subjob {job_id}@{hostname} | Status: {status} | {response.get('percent_complete')}%"

                        if status in [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]:
                            logger.info(f"Downloading completed subjob files from {hostname} to localhost")
                            try:
                                zip_file_path = self.output_path + f'_{hostname}_{job_id}.zip'
                                proxy.get_job_files(job_id, zip_file_path)
                                logger.debug("Zip file download successfully - Preparing to unzip.")
                                extract_path = os.path.dirname(zip_file_path)
                                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                                    zip_ref.extractall(extract_path)
                                logger.info(f"Successfully extracted zip to: {extract_path}")
                                os.remove(zip_file_path)
                            except Exception as e:
                                err_msg = f"Error transferring output from subjob {job_id}@{hostname}: {e}"
                                logger.exception(err_msg)
                                self.errors.append(err_msg)
                            finally:
                                subjobs_still_running.pop(hostname)
                        else:
                            logger.debug(status_msg)
                logger.debug(f"Waiting on {len(subjobs_still_running)} subjobs on {', '.join(list(subjobs_still_running.keys()))}")
                time.sleep(5)

            logger.info("All subjobs complete")
        from lib.distributed_job_manager import DistributedJobManager
        DistributedJobManager.wait_for_subjobs(local_job=self)

        # Post Render Work
        logger.debug("Starting post-processing work")
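The inline loop removed above now lives behind DistributedJobManager.wait_for_subjobs, whose implementation is not part of this hunk. The following is a minimal sketch only, assuming it keeps the same poll-download-extract behavior as the removed code, with children now keyed by id@hostname as described in the commit message. RenderServerProxy.get_job_info and get_job_files are taken from the removed code; everything else here is an assumption, not the repository's actual implementation.

import os
import time
import zipfile

from lib.server.server_proxy import RenderServerProxy


class DistributedJobManager:

    @staticmethod
    def wait_for_subjobs(local_job):
        # Assumed shape: local_job.children maps "<id>@<hostname>" keys to subjob info
        pending = dict(local_job.children)
        while pending:
            for key in list(pending):
                job_id, hostname = key.split('@', 1)
                proxy = RenderServerProxy(hostname)
                info = proxy.get_job_info(job_id)
                if not info:
                    continue  # unreachable host; try again on the next pass
                if info.get('status') in ('completed', 'error', 'cancelled'):
                    # Pull the subjob's output back to the parent, then stop tracking it
                    zip_path = f"{local_job.output_path}_{hostname}_{job_id}.zip"
                    proxy.get_job_files(job_id, zip_path)
                    with zipfile.ZipFile(zip_path) as zf:
                        zf.extractall(os.path.dirname(zip_path))
                    os.remove(zip_path)
                    pending.pop(key)
            if pending:
                time.sleep(5)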