Refactor: DistributedJobManager with pub/sub status change notifications (#25)

* Add pubsub to render_queue and base_worker

* Refactor: Convert ZeroconfServer to Singleton with Class Methods

* New API for subjob servers to notify parent job servers of status changes

* Refactor: Move all subjob related methods to distributed_job_manager.py

* Rewrite for wait_for_subjobs

* Fix: DistributedJobManager.find_available_servers() takes 1 positional argument but 3 were given

* DistributedJobManager should now notify / be notified about background job changes

* Fix the make_ready API. Change the children key name to id@hostname so that it is unique

* Fixes

* Image sequence to movie needs to find the actual start frame

* Fix: subjob_status_change did not return a valid response

* Fix client renderer selection

* Small fix for subjob status checking

* Fix issue with divide_frames_equally

* Fix issue where downloads were not occurring

* Fix issue where old status was being reported

* Add docstrings and code cleanup
This commit is contained in:
2023-06-30 19:49:57 -05:00
committed by GitHub
parent 0b0b410e76
commit 34fbdaa4d9
12 changed files with 503 additions and 255 deletions

View File

@@ -5,12 +5,11 @@ import logging
import os
import subprocess
import threading
import time
import zipfile
from datetime import datetime
from enum import Enum
import psutil
from pubsub import pub
from sqlalchemy import Column, Integer, String, DateTime, JSON
from sqlalchemy.ext.declarative import declarative_base
@@ -27,8 +26,8 @@ class RenderStatus(Enum):
CANCELLED = "cancelled"
ERROR = "error"
SCHEDULED = "scheduled"
WAITING = "waiting"
NOT_READY = "not_ready"
WAITING_FOR_SUBJOBS = "waiting_for_subjobs"
CONFIGURING = "configuring"
UNDEFINED = "undefined"
@@ -101,7 +100,7 @@ class BaseRenderWorker(Base):
self.end_time = None
# History
self.status = RenderStatus.NOT_READY
self.status = RenderStatus.CONFIGURING
self.warnings = []
self.errors = []
@@ -120,8 +119,11 @@ class BaseRenderWorker(Base):
return self._status
@status.setter
def status(self, value):
self._status = value.value
def status(self, new_status):
if self._status != new_status.value:
old_status = self._status
self._status = new_status.value
pub.sendMessage('status_change', job_id=self.id, old_status=old_status, new_status=new_status)
@status.getter
def status(self):
@@ -230,45 +232,9 @@ class BaseRenderWorker(Base):
logger.info(message)
f.write(message)
from lib.server.server_proxy import RenderServerProxy
# Wait on children jobs, if necessary
if self.children:
self.status = RenderStatus.WAITING
subjobs_still_running = self.children.copy()
while len(subjobs_still_running):
for hostname, job_id in subjobs_still_running.copy().items():
proxy = RenderServerProxy(hostname)
response = proxy.get_job_info(job_id)
if not response:
logger.warning(f"No response from: {hostname}")
else:
status = string_to_status(response.get('status', ''))
status_msg = f"Subjob {job_id}@{hostname} | Status: {status} | {response.get('percent_complete')}%"
if status in [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]:
logger.info(f"Downloading completed subjob files from {hostname} to localhost")
try:
zip_file_path = self.output_path + f'_{hostname}_{job_id}.zip'
proxy.get_job_files(job_id, zip_file_path)
logger.debug("Zip file download successfully - Preparing to unzip.")
extract_path = os.path.dirname(zip_file_path)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
logger.info(f"Successfully extracted zip to: {extract_path}")
os.remove(zip_file_path)
except Exception as e:
err_msg = f"Error transferring output from subjob {job_id}@{hostname}: {e}"
logger.exception(err_msg)
self.errors.append(err_msg)
finally:
subjobs_still_running.pop(hostname)
else:
logger.debug(status_msg)
logger.debug(f"Waiting on {len(subjobs_still_running)} subjobs on {', '.join(list(subjobs_still_running.keys()))}")
time.sleep(5)
logger.info("All subjobs complete")
from lib.distributed_job_manager import DistributedJobManager
DistributedJobManager.wait_for_subjobs(local_job=self)
# Post Render Work
logger.debug("Starting post-processing work")

View File

@@ -124,12 +124,20 @@ class BlenderRenderWorker(BaseRenderWorker):
output_dir_files = os.listdir(os.path.dirname(self.output_path))
if self.total_frames > 1 and len(output_dir_files) > 1 and not self.parent:
logger.info("Generating preview for image sequence")
# Calculate what the real start frame # is if we have child objects
start_frame = self.start_frame
if self.children:
min_child_frame = min(int(child["start_frame"]) for child in self.children.values())
start_frame = min(min_child_frame, self.start_frame)
logger.debug(f"Post processing start frame #{start_frame}")
try:
pattern = os.path.splitext(self.output_path)[0] + "_%04d" + most_common_extension(output_dir_files)
image_sequence_to_video(source_glob_pattern=pattern,
output_path=self.output_path + '.mov',
framerate=self.scene_info['fps'],
start_frame=self.start_frame)
start_frame=start_frame)
logger.info('Successfully generated preview video from image sequence')
except Exception as e:
logger.error(f'Error generating video from image sequence: {e}')