Refactor: DistributedJobManager with pub/sub status change notifications (#25)

* Add pubsub to render_queue and base_worker

* Refactor: Convert ZeroconfServer to Singleton with Class Methods

* New API for subjob servers to notify parent job servers of status changes

* Refactor: Move all subjob related methods to distributed_job_manager.py

* Rewrite wait_for_subjobs

* Fix: DistributedJobManager.find_available_servers() takes 1 positional argument but 3 were given

* DistributedJobManager should now notify and be notified about background job changes (see the pub/sub sketch below)

* Fix the make_ready API. Change the children key name to id@hostname so each key is unique

* Fixes

* Image-sequence-to-movie conversion now finds the actual start frame

* Fix: subjob_status_change did not return a valid response

* Fix client renderer selection

* Small fix for subjob status checking

* Fix issue with divide_frames_equally

* Fix issue where downloads were not occurring

* Fix issue where old status was being reported

* Add docstrings and code cleanup
2023-06-30 19:49:57 -05:00
committed by GitHub
parent 0b0b410e76
commit 34fbdaa4d9
12 changed files with 503 additions and 255 deletions
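
The heart of this change is the pub/sub hand-off: BaseRenderWorker's status setter publishes a 'status_change' message and DistributedJobManager.start() subscribes to it (see the base_worker and distributed_job_manager diffs below). A minimal, runnable sketch of that flow using pypubsub; the job id and status strings are invented for illustration:

from pubsub import pub

# Listener wired up once at startup, as DistributedJobManager.start() does.
def on_status_change(job_id, old_status, new_status):
    print(f"Job {job_id}: {old_status} -> {new_status}")

pub.subscribe(on_status_change, 'status_change')

# A worker's status setter fires this whenever the value actually changes;
# the manager then relays the change to the parent job's server, if one exists.
pub.sendMessage('status_change', job_id='j-123', old_status='running', new_status='completed')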


@@ -82,9 +82,9 @@ def create_node_tree(all_server_data) -> Tree:
         node_tree.add(Tree(stats_text))
-        running_jobs = [job for job in server_data['jobs'] if job['status'] == 'running']
-        not_started = [job for job in server_data['jobs'] if job['status'] == 'not_started']
-        scheduled = [job for job in server_data['jobs'] if job['status'] == 'scheduled']
+        running_jobs = [job for job in server_data['jobs'] if job['status'] == RenderStatus.RUNNING.value]
+        not_started = [job for job in server_data['jobs'] if job['status'] == RenderStatus.NOT_STARTED.value]
+        scheduled = [job for job in server_data['jobs'] if job['status'] == RenderStatus.SCHEDULED.value]
         jobs_to_display = running_jobs + not_started + scheduled
         jobs_tree = Tree(f"Running: [green]{len(running_jobs)} [default]| Queued: [cyan]{len(not_started)}"
@@ -93,7 +93,7 @@ def create_node_tree(all_server_data) -> Tree:
         for job in jobs_to_display:
             renderer = f"{renderer_colors[job['renderer']]}{job['renderer']}[default]"
             filename = os.path.basename(job['input_path']).split('.')[0]
-            if job['status'] == 'running':
+            if job['status'] == RenderStatus.RUNNING.value:
                 jobs_tree.add(f"[bold]{renderer} {filename} ({job['id']}) - {status_string_to_color(job['status'])}{(float(job['percent_complete']) * 100):.1f}%")
             else:
                 jobs_tree.add(f"{filename} ({job['id']}) - {status_string_to_color(job['status'])}{job['status'].title()}")


@@ -44,8 +44,8 @@ class DashboardWindow:
         self.added_hostnames = []
         # Setup zeroconf
-        self.zeroconf = ZeroconfServer("_zordon._tcp.local.", socket.gethostname(), 8080)
-        self.zeroconf.start(listen_only=True)
+        ZeroconfServer.configure("_zordon._tcp.local.", socket.gethostname(), 8080)
+        ZeroconfServer.start(listen_only=True)
         # Setup photo preview
         photo_pad = tk.Frame(self.root, background="gray")
@@ -299,7 +299,7 @@ class DashboardWindow:
                 tree.item(item, values=new_values)
                 break
-        current_servers = list(set(self.zeroconf.found_clients() + self.added_hostnames))
+        current_servers = list(set(ZeroconfServer.found_clients() + self.added_hostnames))
         for hostname in current_servers:
             if not self.server_proxies.get(hostname, None):
                 new_proxy = RenderServerProxy(hostname=hostname)


@@ -175,9 +175,8 @@ class NewJobWindow(Frame):
         self.presets = self.server_proxy.request_data('presets', timeout=3) or {}
         # update available renderers
-        available_renderers = [x for x in self.renderer_info.keys() if self.renderer_info[x].get('available', False)]
-        self.renderer_combo['values'] = available_renderers
-        if available_renderers:
+        self.renderer_combo['values'] = list(self.renderer_info.keys())
+        if self.renderer_info.keys():
             self.renderer_combo.current(0)
         self.refresh_renderer_settings()


@@ -0,0 +1,338 @@
+import logging
+import os
+import socket
+import time
+import zipfile
+from pubsub import pub
+from lib.render_queue import RenderQueue
+from lib.server.server_proxy import RenderServerProxy
+from lib.server.zeroconf_server import ZeroconfServer
+from lib.utilities.misc_helper import get_file_size_human
+from lib.workers.base_worker import RenderStatus, string_to_status
+
+logger = logging.getLogger()
+
+
+class DistributedJobManager:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def start(cls):
+        """
+        Subscribes the private class method '__local_job_status_changed' to the 'status_change' pubsub message.
+        This should be called once, typically during the initialization phase.
+        """
+        pub.subscribe(cls.__local_job_status_changed, 'status_change')
+
+    @classmethod
+    def __local_job_status_changed(cls, job_id, old_status, new_status):
+        """
+        Responds to the 'status_change' pubsub message for local jobs.
+        If it's a child job, it notifies the parent job about the status change.
+
+        Parameters:
+            job_id (str): The ID of the job that has changed status.
+            old_status (str): The previous status of the job.
+            new_status (str): The new (current) status of the job.
+
+        Note: Do not call directly. Instead, call via the 'status_change' pubsub message.
+        """
+        render_job = RenderQueue.job_with_id(job_id, none_ok=True)
+        if not render_job:  # ignore jobs created but not yet added to queue
+            return
+        logger.debug(f"Job {job_id} status change: {old_status} -> {new_status}")
+        if render_job.parent:  # If local job is a subjob from a remote server
+            parent_id, hostname = render_job.parent.split('@')[0], render_job.parent.split('@')[-1]
+            RenderServerProxy(hostname).notify_parent_of_status_change(parent_id=parent_id, subjob=render_job)
+        elif render_job.children and new_status == RenderStatus.CANCELLED:
+            # todo: handle cancelling all the children
+            pass
+
+    @classmethod
+    def handle_subjob_status_change(cls, local_job, subjob_data):
+        """
+        Responds to a status change from a remote subjob and triggers the creation or modification of subjobs as needed.
+
+        Parameters:
+            local_job (BaseRenderWorker): The local parent job worker.
+            subjob_data (dict): Subjob data sent from the remote server.
+
+        Returns:
+            None
+        """
+        subjob_status = string_to_status(subjob_data['status'])
+        subjob_id = subjob_data['id']
+        subjob_hostname = next((hostname.split('@')[1] for hostname in local_job.children if
+                                hostname.split('@')[0] == subjob_id), None)
+        local_job.children[f'{subjob_id}@{subjob_hostname}'] = subjob_data
+        logname = f"{local_job.id}:{subjob_id}@{subjob_hostname}"
+        logger.debug(f"Subjob status changed: {logname} -> {subjob_status.value}")
+        # Download complete or partial render jobs
+        if subjob_status in [RenderStatus.COMPLETED, RenderStatus.CANCELLED, RenderStatus.ERROR] and \
+                subjob_data['file_count']:
+            download_result = cls.download_from_subjob(local_job, subjob_id, subjob_hostname)
+            if not download_result:
+                # todo: handle error
+                logger.error(f"Unable to download subjob files from {logname} with status {subjob_status.value}")
+        if subjob_status == RenderStatus.CANCELLED or subjob_status == RenderStatus.ERROR:
+            # todo: determine missing frames and schedule new job
+            pass
+
+    @staticmethod
+    def download_from_subjob(local_job, subjob_id, subjob_hostname):
+        """
+        Downloads and extracts files from a completed subjob on a remote server.
+
+        Parameters:
+            local_job (BaseRenderWorker): The local parent job worker.
+            subjob_id (str or int): The ID of the subjob.
+            subjob_hostname (str): The hostname of the remote server where the subjob is located.
+
+        Returns:
+            bool: True if the files have been downloaded and extracted successfully, False otherwise.
+        """
+        child_key = f'{subjob_id}@{subjob_hostname}'
+        logname = f"{local_job.id}:{child_key}"
+        zip_file_path = local_job.output_path + f'_{subjob_hostname}_{subjob_id}.zip'
+        # download zip file from server
+        try:
+            local_job.children[child_key]['download_status'] = 'working'
+            logger.info(f"Downloading completed subjob files from {subjob_hostname} to localhost")
+            RenderServerProxy(subjob_hostname).get_job_files(subjob_id, zip_file_path)
+            logger.info(f"File transfer complete for {logname} - Transferred {get_file_size_human(zip_file_path)}")
+        except Exception as e:
+            logger.exception(f"Exception downloading files from remote server: {e}")
+            local_job.children[child_key]['download_status'] = 'failed'
+            return False
+        # extract zip
+        try:
+            logger.debug(f"Extracting zip file: {zip_file_path}")
+            extract_path = os.path.dirname(zip_file_path)
+            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+                zip_ref.extractall(extract_path)
+            logger.info(f"Successfully extracted zip to: {extract_path}")
+            os.remove(zip_file_path)
+            local_job.children[child_key]['download_status'] = 'complete'
+        except Exception as e:
+            logger.exception(f"Exception extracting zip file: {e}")
+            local_job.children[child_key]['download_status'] = 'failed'
+        return local_job.children[child_key].get('download_status', None) == 'complete'
+
+    @classmethod
+    def wait_for_subjobs(cls, local_job):
+        logger.debug(f"Waiting for subjobs for job {local_job}")
+        local_job.status = RenderStatus.WAITING_FOR_SUBJOBS
+        statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]
+
+        def subjobs_not_downloaded():
+            return {k: v for k, v in local_job.children.items() if 'download_status' not in v or
+                    v['download_status'] == 'working' or v['download_status'] is None}
+
+        logger.debug(f'subjobs_not_downloaded: {subjobs_not_downloaded()}')
+        while len(subjobs_not_downloaded()):
+            for child_key, subjob_cached_data in subjobs_not_downloaded().items():
+                subjob_id = child_key.split('@')[0]
+                subjob_hostname = child_key.split('@')[-1]
+                # Fetch info from server and handle failing case
+                subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
+                if not subjob_data:
+                    logger.warning(f"No response from: {subjob_hostname}")
+                    # todo: handle timeout / missing server situations
+                    continue
+                # Update parent job cache but keep the download status
+                download_status = local_job.children[child_key].get('download_status', None)
+                local_job.children[child_key] = subjob_data
+                local_job.children[child_key]['download_status'] = download_status
+                status = string_to_status(subjob_data.get('status', ''))
+                status_msg = f"Subjob {child_key} | {status} | " \
+                             f"{float(subjob_data.get('percent_complete')) * 100.0}%"
+                logger.debug(status_msg)
+                # Still working in another thread - keep waiting
+                if download_status == 'working':
+                    continue
+                # Check if the job is finished but has not yet had its files copied over
+                if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
+                    download_result = cls.download_from_subjob(local_job, subjob_id, subjob_hostname)
+                    if not download_result:
+                        logger.error("Failed to download from subjob")
+                        # todo: error handling here
+                # Any finished jobs not successfully downloaded at this point are skipped
+                if local_job.children[child_key].get('download_status', None) is None and \
+                        status in statuses_to_download:
+                    logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
+                    local_job.children[child_key]['download_status'] = 'skipped'
+            if subjobs_not_downloaded():
+                logger.debug(f"Waiting on {len(subjobs_not_downloaded())} subjobs on "
+                             f"{', '.join(list(subjobs_not_downloaded().keys()))}")
+                time.sleep(5)
+
+    @classmethod
+    def split_into_subjobs(cls, worker, job_data, project_path):
+        # Check availability
+        available_servers = cls.find_available_servers(worker.renderer)
+        subjob_servers = cls.distribute_server_work(worker.start_frame, worker.end_frame, available_servers)
+        local_hostname = socket.gethostname()
+        # Prep and submit these sub-jobs
+        logger.info(f"Job {worker.id} split plan: {subjob_servers}")
+        try:
+            for server_data in subjob_servers:
+                server_hostname = server_data['hostname']
+                if server_hostname != local_hostname:
+                    post_results = cls.__create_subjob(job_data, local_hostname, project_path, server_data,
+                                                       server_hostname, worker)
+                    if post_results.ok:
+                        server_data['submission_results'] = post_results.json()[0]
+                    else:
+                        logger.error(f"Failed to create subjob on {server_hostname}")
+                        break
+                else:
+                    # truncate parent render_job
+                    worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
+                    worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
+                    logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
+                    server_data['submission_results'] = worker.json()
+            # check that job posts were all successful.
+            if not all(d.get('submission_results') is not None for d in subjob_servers):
+                raise ValueError("Failed to create all subjobs")  # look into recalculating job numbers and using existing jobs
+            # start subjobs
+            logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
+            for server_data in subjob_servers:
+                if server_data['hostname'] != local_hostname:
+                    child_key = f"{server_data['submission_results']['id']}@{server_data['hostname']}"
+                    worker.children[child_key] = server_data['submission_results']
+            worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"
+        except Exception as e:
+            # cancel all the subjobs
+            logger.error(f"Failed to split job into subjobs: {e}")
+            logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
+            # [RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in
+            #  submission_results.items()]  # todo: fix this
+
+    @staticmethod
+    def __create_subjob(job_data, local_hostname, project_path, server_data, server_hostname, worker):
+        subjob = job_data.copy()
+        subjob['name'] = f"{worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
+        subjob['parent'] = f"{worker.id}@{local_hostname}"
+        subjob['start_frame'] = server_data['frame_range'][0]
+        subjob['end_frame'] = server_data['frame_range'][-1]
+        logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
+                     f"{subjob['end_frame']} to {server_hostname}")
+        post_results = RenderServerProxy(server_hostname).post_job_to_server(
+            file_path=project_path, job_list=[subjob])
+        return post_results
+
+    @staticmethod
+    def distribute_server_work(start_frame, end_frame, available_servers, method='cpu_count'):
+        """
+        Splits the frame range among available servers proportionally based on their performance (CPU count).
+
+        :param start_frame: int, The start frame number of the animation to be rendered.
+        :param end_frame: int, The end frame number of the animation to be rendered.
+        :param available_servers: list, A list of available server dictionaries. Each server dictionary should include
+            'hostname' and 'cpu_count' keys (see find_available_servers)
+        :param method: str, Optional. Specifies the distribution method. Possible values are 'cpu_count' and 'equally'
+        :return: A list of server dictionaries where each dictionary includes the frame range and total number of frames
+            to be rendered by the server.
+        """
+        # Calculate respective frames for each server
+        def divide_frames_by_cpu_count(frame_start, frame_end, servers):
+            total_frames = frame_end - frame_start + 1
+            total_performance = sum(server['cpu_count'] for server in servers)
+            frame_ranges = {}
+            current_frame = frame_start
+            allocated_frames = 0
+            for i, server in enumerate(servers):
+                if i == len(servers) - 1:  # if it's the last server
+                    # Give all remaining frames to the last server
+                    num_frames = total_frames - allocated_frames
+                else:
+                    num_frames = round((server['cpu_count'] / total_performance) * total_frames)
+                allocated_frames += num_frames
+                frame_end_for_server = current_frame + num_frames - 1
+                if current_frame <= frame_end_for_server:
+                    frame_ranges[server['hostname']] = (current_frame, frame_end_for_server)
+                current_frame = frame_end_for_server + 1
+            return frame_ranges
+
+        def divide_frames_equally(frame_start, frame_end, servers):
+            frame_range = frame_end - frame_start + 1
+            frames_per_server = frame_range // len(servers)
+            leftover_frames = frame_range % len(servers)
+            frame_ranges = {}
+            current_start = frame_start
+            for i, server in enumerate(servers):
+                current_end = current_start + frames_per_server - 1
+                if leftover_frames > 0:
+                    current_end += 1
+                    leftover_frames -= 1
+                if current_start <= current_end:
+                    frame_ranges[server['hostname']] = (current_start, current_end)
+                current_start = current_end + 1
+            return frame_ranges
+
+        if method == 'equally':
+            breakdown = divide_frames_equally(start_frame, end_frame, available_servers)
+        # elif method == 'benchmark_score':  # todo: implement benchmark score
+        #     pass
+        else:
+            breakdown = divide_frames_by_cpu_count(start_frame, end_frame, available_servers)
+        server_breakdown = [server for server in available_servers if breakdown.get(server['hostname']) is not None]
+        for server in server_breakdown:
+            server['frame_range'] = breakdown[server['hostname']]
+            server['total_frames'] = breakdown[server['hostname']][-1] - breakdown[server['hostname']][0] + 1
+        return server_breakdown
+
+    @staticmethod
+    def find_available_servers(renderer):
+        """
+        Scan the Zeroconf network for currently available render servers supporting a specific renderer.
+
+        :param renderer: str, The renderer type to search for
+        :return: A list of dictionaries with each dict containing hostname and cpu_count of available servers
+        """
+        available_servers = []
+        for hostname in ZeroconfServer.found_clients():
+            response = RenderServerProxy(hostname).get_status()
+            if response and response.get('renderers', {}).get(renderer, {}).get('is_available', False):
+                available_servers.append({'hostname': hostname, 'cpu_count': int(response['cpu_count'])})
+        return available_servers
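
To make the split plan concrete, here is a hedged usage sketch of distribute_server_work. The hostnames and core counts are invented; the expected plan follows from the cpu_count weighting above:

servers = [{'hostname': 'kamino.local', 'cpu_count': 8},
           {'hostname': 'deathstar.local', 'cpu_count': 24}]
plan = DistributedJobManager.distribute_server_work(1, 100, servers)
# round(8/32 * 100) = 25 frames go to kamino.local; the last server takes the
# remaining 75, so the plan works out to:
#   kamino.local    -> 'frame_range': (1, 25),   'total_frames': 25
#   deathstar.local -> 'frame_range': (26, 100), 'total_frames': 75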


@@ -4,8 +4,8 @@ from datetime import datetime
 from sqlalchemy import create_engine
 from sqlalchemy.orm import sessionmaker
-from .workers.base_worker import RenderStatus, BaseRenderWorker, Base
-from .workers.worker_factory import RenderWorkerFactory
+from lib.workers.base_worker import RenderStatus, BaseRenderWorker, Base
+from lib.workers.worker_factory import RenderWorkerFactory
 logger = logging.getLogger()
@@ -28,6 +28,14 @@ class RenderQueue:
     def __init__(self):
         pass
+    @classmethod
+    def start_queue(cls):
+        cls.load_state()
+
+    @classmethod
+    def job_status_change(cls, job_id, status):
+        logger.debug(f"Job status changed: {job_id} -> {status}")
     @classmethod
     def add_to_render_queue(cls, render_job, force_start=False):
         logger.debug('Adding priority {} job to render queue: {}'.format(render_job.priority, render_job))


@@ -1,9 +1,13 @@
 #!/usr/bin/env python3
 import json
+import logging
+import os
 import pathlib
 import platform
 import shutil
+import socket
 import ssl
+import threading
 import time
 import zipfile
 from datetime import datetime
@@ -11,13 +15,16 @@ from urllib.request import urlretrieve
 from zipfile import ZipFile
 import json2html
-import psutil
 import yaml
 from flask import Flask, request, render_template, send_file, after_this_request, Response, redirect, url_for, abort
 from werkzeug.utils import secure_filename
+from lib.distributed_job_manager import DistributedJobManager
 from lib.render_queue import RenderQueue, JobNotFoundError
+from lib.server.server_proxy import RenderServerProxy
 from lib.server.zeroconf_server import ZeroconfServer
-from lib.utilities.server_helper import *
+from lib.utilities.server_helper import generate_thumbnail_for_job
 from lib.workers.base_worker import string_to_status, RenderStatus
 from lib.workers.worker_factory import RenderWorkerFactory
@@ -157,6 +164,17 @@ def filtered_jobs_json(status_val):
     return f'Cannot find jobs with status {status_val}', 400
+@server.post('/api/job/<job_id>/notify_parent_of_status_change')
+def subjob_status_change(job_id):
+    try:
+        subjob_details = request.json
+        logger.info(f"Subjob to job id: {job_id} is now {subjob_details['status']}")
+        DistributedJobManager.handle_subjob_status_change(RenderQueue.job_with_id(job_id), subjob_data=subjob_details)
+        return Response(status=200)
+    except JobNotFoundError:
+        return "Job not found", 404
 @server.errorhandler(JobNotFoundError)
 def handle_job_not_found(job_error):
     return f'Cannot find job with ID {job_error.job_id}', 400
@@ -187,10 +205,12 @@ def get_file_list(job_id):
 def make_job_ready(job_id):
     try:
         found_job = RenderQueue.job_with_id(job_id)
-        if found_job.status in [RenderStatus.NOT_READY, RenderStatus.NOT_STARTED]:
+        if found_job.status in [RenderStatus.CONFIGURING, RenderStatus.NOT_STARTED]:
             if found_job.children:
-                for hostname, child_id in found_job.children.items():
-                    RenderServerProxy(hostname).request_data(f'/api/job/<child_id>/make_ready')
+                for child_key in found_job.children.keys():
+                    child_id = child_key.split('@')[0]
+                    hostname = child_key.split('@')[-1]
+                    RenderServerProxy(hostname).request_data(f'job/{child_id}/make_ready')
             found_job.status = RenderStatus.NOT_STARTED
             RenderQueue.save_state()
         return found_job.json(), 200
@@ -255,7 +275,7 @@ def snapshot():
 @server.get('/api/_detected_clients')
 def detected_clients():
     # todo: dev/debug only. Should not ship this - probably.
-    return server.config['ZEROCONF_SERVER'].found_clients()
+    return ZeroconfServer.found_clients()
 @server.post('/api/add_job')
@@ -312,8 +332,9 @@ def add_job_handler():
         return "Cannot find any valid project paths", 400
     # prep local filepath
-    job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '_'.join(
-        [datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, os.path.splitext(referred_name)[0]]))
+    cleaned_path_name = os.path.splitext(referred_name)[0].replace(' ', '_')
+    job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '-'.join(
+        [datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, cleaned_path_name]))
     os.makedirs(job_dir, exist_ok=True)
     upload_dir = os.path.join(job_dir, 'source')
     os.makedirs(upload_dir, exist_ok=True)
@@ -387,14 +408,15 @@ def add_job_handler():
             # determine if we can / should split the job
             if server.config.get('enable_split_jobs', False) and (worker.total_frames > 1) and not worker.parent:
-                create_subjobs(worker, job_data, zip_path or loaded_project_local_path)
+                DistributedJobManager.split_into_subjobs(worker, job_data, zip_path or loaded_project_local_path)
             RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
-            make_job_ready(worker.id)
+            if not worker.parent:
+                make_job_ready(worker.id)
             results.append(worker.json())
         except Exception as e:
-            err_msg = f"Error creating render job: {e}"
-            logger.error(err_msg)
+            err_msg = f"Exception creating render job: {e}"
+            logger.exception(err_msg)
             results.append({'error': err_msg})
     # return any errors from results list
@@ -413,61 +435,6 @@ def add_job_handler():
     return 'unknown error', 500
-def create_subjobs(worker, job_data, project_path):
-    # Check availablity
-    local_hostname = server.config['HOSTNAME']
-    found_servers = [x for x in server.config['ZEROCONF_SERVER'].found_clients() if local_hostname not in x]
-    subjob_servers = find_available_servers(found_servers, worker.renderer, worker.start_frame, worker.end_frame)
-    # Prep and submit these sub-jobs
-    logger.info(f"Job {worker.id} split plan: {subjob_servers}")
-    submission_results = {}
-    try:
-        for server_data in subjob_servers:
-            server_hostname = server_data['hostname']
-            if server_hostname != local_hostname:
-                subjob = job_data.copy()
-                subjob['name'] = f"{worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
-                subjob['parent'] = f"{worker.id}@{local_hostname}"
-                subjob['start_frame'] = server_data['frame_range'][0]
-                subjob['end_frame'] = server_data['frame_range'][-1]
-                logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
-                             f"{subjob['end_frame']} to {server_hostname}")
-                post_results = RenderServerProxy(server_hostname).post_job_to_server(
-                    file_path=project_path, job_list=[subjob])
-                if post_results.ok:
-                    server_data['submission_results'] = post_results.json()[0]
-                else:
-                    logger.error(f"Failed to create subjob on {server_hostname}")
-                    break
-            else:
-                # truncate parent render_job
-                worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
-                worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
-                logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
-                server_data['submission_results'] = worker.json()
-        # check that job posts were all successful.
-        if not all(d.get('submission_results') is not None for d in subjob_servers):
-            raise ValueError("Failed to create all subjobs")  # look into recalculating job numbers and use exising jobs
-        # start subjobs
-        logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
-        for server_data in subjob_servers:
-            if server_data['hostname'] != local_hostname:
-                worker.children[server_data['hostname']] = server_data['submission_results']['id']
-        worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"
-    except Exception as e:
-        # cancel all the subjobs
-        logger.error(f"Failed to split job into subjobs: {e}")
-        logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
-        [RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in submission_results.items()]
 @server.get('/api/job/<job_id>/cancel')
 def cancel_job(job_id):
     if not request.args.get('confirm', False):
@@ -534,9 +501,10 @@ def status():
     renderer_data = {}
     for render_class in RenderWorkerFactory.supported_classes():
-        renderer_data[render_class.engine.name()] = \
-            {'version': render_class.engine.version(),
-             'is_ready': RenderQueue.is_available_for_job(render_class.engine.name())
-             }
+        if render_class.engine.renderer_path():  # only return renderers installed on host
+            renderer_data[render_class.engine.name()] = \
+                {'version': render_class.engine.version(),
+                 'is_available': RenderQueue.is_available_for_job(render_class.engine.name())
+                 }
     return {"timestamp": datetime.now().isoformat(),
@@ -559,11 +527,11 @@ def renderer_info():
     renderer_data = {}
     for r in RenderWorkerFactory.supported_renderers():
         engine = RenderWorkerFactory.class_for_name(r).engine
-        engine_available = engine.renderer_path() is not None
-        renderer_data[r] = {'available': engine_available,
-                            'version': engine.version() if engine_available else None,
-                            'supported_extensions': engine.supported_extensions,
-                            'supported_export_formats': engine.get_output_formats() if engine_available else None,
-                            'path': engine.renderer_path()}
+        if engine.renderer_path():
+            renderer_data[r] = {'is_available': RenderQueue.is_available_for_job(engine.name()),
+                                'version': engine.version(),
+                                'supported_extensions': engine.supported_extensions,
+                                'supported_export_formats': engine.get_output_formats(),
+                                'path': engine.renderer_path()}
     return renderer_data
@@ -602,15 +570,15 @@ def start_server(background_thread=False):
     flask_log.setLevel(config.get('flask_log_level', 'ERROR').upper())
     # Set up the RenderQueue object
-    RenderQueue.load_state()
+    RenderQueue.start_queue()
+    DistributedJobManager.start()
     thread = threading.Thread(target=eval_loop, kwargs={'delay_sec': config.get('queue_eval_seconds', 1)}, daemon=True)
     thread.start()
     logging.info(f"Starting Zordon Render Server - Hostname: '{server.config['HOSTNAME']}:'")
-    zeroconf_server = ZeroconfServer("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
-    zeroconf_server.start()
-    server.config['ZEROCONF_SERVER'] = zeroconf_server
+    ZeroconfServer.configure("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
+    ZeroconfServer.start()
     try:
         if background_thread:
@@ -623,4 +591,4 @@
                        use_reloader=False, threaded=True)
     finally:
         RenderQueue.save_state()
-        zeroconf_server.stop()
+        ZeroconfServer.stop()


@@ -10,9 +10,9 @@ from requests_toolbelt.multipart import MultipartEncoder, MultipartEncoderMonito
 status_colors = {RenderStatus.ERROR: "red", RenderStatus.CANCELLED: 'orange1', RenderStatus.COMPLETED: 'green',
                  RenderStatus.NOT_STARTED: "yellow", RenderStatus.SCHEDULED: 'purple',
-                 RenderStatus.RUNNING: 'cyan'}
-categories = [RenderStatus.RUNNING, RenderStatus.WAITING, RenderStatus.ERROR, RenderStatus.NOT_STARTED, RenderStatus.SCHEDULED,
+                 RenderStatus.RUNNING: 'cyan', RenderStatus.WAITING_FOR_SUBJOBS: 'blue'}
+categories = [RenderStatus.RUNNING, RenderStatus.WAITING_FOR_SUBJOBS, RenderStatus.ERROR, RenderStatus.NOT_STARTED, RenderStatus.SCHEDULED,
               RenderStatus.COMPLETED, RenderStatus.CANCELLED, RenderStatus.UNDEFINED]
 logger = logging.getLogger()
@@ -114,6 +114,10 @@ class RenderServerProxy:
     def get_status(self):
         return self.request_data('status')
+    def notify_parent_of_status_change(self, parent_id, subjob):
+        return requests.post(f'http://{self.hostname}:{self.port}/api/job/{parent_id}/notify_parent_of_status_change',
+                             json=subjob.json())
     def post_job_to_server(self, file_path, job_list, callback=None):
         # bypass uploading file if posting to localhost


@@ -6,68 +6,80 @@ from zeroconf import Zeroconf, ServiceInfo, ServiceBrowser, ServiceStateChange
 logger = logging.getLogger()
 
-class ZeroconfServer():
-    def __init__(self, service_type, server_name, server_port):
-        self.service_type = service_type
-        self.server_name = server_name
-        self.server_port = server_port
-        self.server_ip = None
-        self.zeroconf = Zeroconf()
-        self.service_info = None
-        self.client_cache = {}
-        self.properties = {}
+class ZeroconfServer:
+    service_type = None
+    server_name = None
+    server_port = None
+    server_ip = None
+    zeroconf = Zeroconf()
+    service_info = None
+    client_cache = {}
+    properties = {}
 
+    @classmethod
+    def configure(cls, service_type, server_name, server_port):
+        cls.service_type = service_type
+        cls.server_name = server_name
+        cls.server_port = server_port
+
-    def start(self, listen_only=False):
+    @classmethod
+    def start(cls, listen_only=False):
         if not listen_only:
-            self._register_service()
-        self._browse_services()
+            cls._register_service()
+        cls._browse_services()
 
-    def stop(self):
-        self._unregister_service()
-        self.zeroconf.close()
+    @classmethod
+    def stop(cls):
+        cls._unregister_service()
+        cls.zeroconf.close()
 
-    def _register_service(self):
-        self.server_ip = socket.gethostbyname(socket.gethostname())
+    @classmethod
+    def _register_service(cls):
+        cls.server_ip = socket.gethostbyname(socket.gethostname())
         info = ServiceInfo(
-            self.service_type,
-            f"{self.server_name}.{self.service_type}",
-            addresses=[socket.inet_aton(self.server_ip)],
-            port=self.server_port,
-            properties=self.properties,
+            cls.service_type,
+            f"{cls.server_name}.{cls.service_type}",
+            addresses=[socket.inet_aton(cls.server_ip)],
+            port=cls.server_port,
+            properties=cls.properties,
         )
-        self.service_info = info
-        self.zeroconf.register_service(info)
-        logger.info(f"Registered zeroconf service: {self.service_info.name}")
+        cls.service_info = info
+        cls.zeroconf.register_service(info)
+        logger.info(f"Registered zeroconf service: {cls.service_info.name}")
 
-    def _unregister_service(self):
-        if self.service_info:
-            self.zeroconf.unregister_service(self.service_info)
-            logger.info(f"Unregistered zeroconf service: {self.service_info.name}")
-            self.service_info = None
+    @classmethod
+    def _unregister_service(cls):
+        if cls.service_info:
+            cls.zeroconf.unregister_service(cls.service_info)
+            logger.info(f"Unregistered zeroconf service: {cls.service_info.name}")
+            cls.service_info = None
 
-    def _browse_services(self):
-        browser = ServiceBrowser(self.zeroconf, self.service_type, [self._on_service_discovered])
+    @classmethod
+    def _browse_services(cls):
+        browser = ServiceBrowser(cls.zeroconf, cls.service_type, [cls._on_service_discovered])
 
-    def _on_service_discovered(self, zeroconf, service_type, name, state_change):
+    @classmethod
+    def _on_service_discovered(cls, zeroconf, service_type, name, state_change):
         info = zeroconf.get_service_info(service_type, name)
         logger.debug(f"Zeroconf: {name} {state_change}")
-        if service_type == self.service_type:
+        if service_type == cls.service_type:
             if state_change == ServiceStateChange.Added or state_change == ServiceStateChange.Updated:
-                self.client_cache[name] = info
+                cls.client_cache[name] = info
             else:
-                self.client_cache.pop(name)
+                cls.client_cache.pop(name)
 
-    def found_clients(self):
-        return [x.split(f'.{self.service_type}')[0] for x in self.client_cache.keys()]
+    @classmethod
+    def found_clients(cls):
+        return [x.split(f'.{cls.service_type}')[0] for x in cls.client_cache.keys()]
 
 # Example usage:
 if __name__ == "__main__":
-    server = ZeroconfServer("_zordon._tcp.local.", "foobar.local", 8080)
+    ZeroconfServer.configure("_zordon._tcp.local.", "foobar.local", 8080)
     try:
-        server.start()
+        ZeroconfServer.start()
         input("Server running - Press enter to end")
     finally:
-        server.stop()
+        ZeroconfServer.stop()


@@ -86,3 +86,20 @@ def get_time_elapsed(start_time=None, end_time=None):
     elapsed_time_string = strfdelta(elapsed_time) if elapsed_time else None
     return elapsed_time_string
+
+
+def get_file_size_human(file_path):
+    size_in_bytes = os.path.getsize(file_path)
+    # Convert size to a human readable format
+    if size_in_bytes < 1024:
+        return f"{size_in_bytes} B"
+    elif size_in_bytes < 1024 ** 2:
+        return f"{size_in_bytes / 1024:.2f} KB"
+    elif size_in_bytes < 1024 ** 3:
+        return f"{size_in_bytes / 1024 ** 2:.2f} MB"
+    elif size_in_bytes < 1024 ** 4:
+        return f"{size_in_bytes / 1024 ** 3:.2f} GB"
+    else:
+        return f"{size_in_bytes / 1024 ** 4:.2f} TB"


@@ -49,75 +49,3 @@ def generate_thumbnail_for_job(job, thumb_video_path, thumb_image_path, max_widt
     if video_files and not os.path.exists(thumb_video_path):
         x = threading.Thread(target=generate_thumb_thread, args=(video_files[0],))
         x.start()
-
-
-def divide_frames_evenly(start_frame, end_frame, num_servers):
-    frame_range = end_frame - start_frame + 1
-    frames_per_server = frame_range // num_servers
-    leftover_frames = frame_range % num_servers
-    ranges = []
-    current_start = start_frame
-    for i in range(num_servers):
-        current_end = current_start + frames_per_server - 1
-        if leftover_frames > 0:
-            current_end += 1
-            leftover_frames -= 1
-        if current_start <= current_end:
-            ranges.append((current_start, current_end))
-        current_start = current_end + 1
-    return ranges
-
-
-def divide_frames_by_cpu_count(frame_start, frame_end, servers):
-    total_frames = frame_end - frame_start + 1
-    total_performance = sum(server['cpu_count'] for server in servers)
-    frame_ranges = {}
-    current_frame = frame_start
-    allocated_frames = 0
-    for i, server in enumerate(servers):
-        if i == len(servers) - 1:  # if it's the last server
-            # Give all remaining frames to the last server
-            num_frames = total_frames - allocated_frames
-        else:
-            num_frames = round((server['cpu_count'] / total_performance) * total_frames)
-        allocated_frames += num_frames
-        frame_end_for_server = current_frame + num_frames - 1
-        if current_frame <= frame_end_for_server:
-            frame_ranges[server['hostname']] = (current_frame, frame_end_for_server)
-        current_frame = frame_end_for_server + 1
-    return frame_ranges
-
-
-def find_available_servers(server_list, renderer, start_frame, end_frame):
-    local_hostname = socket.gethostname()
-    subjob_servers = [{'hostname': local_hostname, 'cpu_count': psutil.cpu_count(logical=False)}]
-    for hostname in server_list:
-        if hostname != local_hostname:
-            response = RenderServerProxy(hostname).get_status()
-            if response and response.get('renderers', {}).get(renderer, {}).get('is_ready', False):
-                subjob_servers.append({'hostname': hostname, 'cpu_count': int(response['cpu_count'])})
-    if len(subjob_servers) == 1:
-        logger.debug("No available servers to split job with. Skipping subjob creation.")
-        return subjob_servers
-    # Calculate respective frames for each server
-    breakdown = divide_frames_by_cpu_count(start_frame, end_frame, subjob_servers)
-    subjob_servers = [server for server in subjob_servers if breakdown.get(server['hostname']) is not None]
-    for server in subjob_servers:
-        server['frame_range'] = breakdown[server['hostname']]
-        server['total_frames'] = breakdown[server['hostname']][-1] - breakdown[server['hostname']][0] + 1
-    return subjob_servers
-
-
-if __name__ == "__main__":
-    found_servers = ['kamino.local', 'deathstar.local']
-    print(find_available_servers(found_servers, 'blender', 1, 5))


@@ -5,12 +5,11 @@ import logging
 import os
 import subprocess
 import threading
-import time
-import zipfile
 from datetime import datetime
 from enum import Enum
 import psutil
+from pubsub import pub
 from sqlalchemy import Column, Integer, String, DateTime, JSON
 from sqlalchemy.ext.declarative import declarative_base
@@ -27,8 +26,8 @@ class RenderStatus(Enum):
     CANCELLED = "cancelled"
     ERROR = "error"
     SCHEDULED = "scheduled"
-    WAITING = "waiting"
-    NOT_READY = "not_ready"
+    WAITING_FOR_SUBJOBS = "waiting_for_subjobs"
+    CONFIGURING = "configuring"
     UNDEFINED = "undefined"
@@ -101,7 +100,7 @@ class BaseRenderWorker(Base):
         self.end_time = None
         # History
-        self.status = RenderStatus.NOT_READY
+        self.status = RenderStatus.CONFIGURING
         self.warnings = []
         self.errors = []
@@ -120,8 +119,11 @@ class BaseRenderWorker(Base):
         return self._status
     @status.setter
-    def status(self, value):
-        self._status = value.value
+    def status(self, new_status):
+        if self._status != new_status.value:
+            old_status = self._status
+            self._status = new_status.value
+            pub.sendMessage('status_change', job_id=self.id, old_status=old_status, new_status=new_status)
     @status.getter
     def status(self):
@@ -230,45 +232,9 @@
             logger.info(message)
             f.write(message)
-        from lib.server.server_proxy import RenderServerProxy
+        # Wait on children jobs, if necessary
         if self.children:
-            self.status = RenderStatus.WAITING
-            subjobs_still_running = self.children.copy()
-            while len(subjobs_still_running):
-                for hostname, job_id in subjobs_still_running.copy().items():
-                    proxy = RenderServerProxy(hostname)
-                    response = proxy.get_job_info(job_id)
-                    if not response:
-                        logger.warning(f"No response from: {hostname}")
-                    else:
-                        status = string_to_status(response.get('status', ''))
-                        status_msg = f"Subjob {job_id}@{hostname} | Status: {status} | {response.get('percent_complete')}%"
-                        if status in [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]:
-                            logger.info(f"Downloading completed subjob files from {hostname} to localhost")
-                            try:
-                                zip_file_path = self.output_path + f'_{hostname}_{job_id}.zip'
-                                proxy.get_job_files(job_id, zip_file_path)
-                                logger.debug("Zip file download successfully - Preparing to unzip.")
-                                extract_path = os.path.dirname(zip_file_path)
-                                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
-                                    zip_ref.extractall(extract_path)
-                                logger.info(f"Successfully extracted zip to: {extract_path}")
-                                os.remove(zip_file_path)
-                            except Exception as e:
-                                err_msg = f"Error transferring output from subjob {job_id}@{hostname}: {e}"
-                                logger.exception(err_msg)
-                                self.errors.append(err_msg)
-                            finally:
-                                subjobs_still_running.pop(hostname)
-                        else:
-                            logger.debug(status_msg)
-                logger.debug(f"Waiting on {len(subjobs_still_running)} subjobs on {', '.join(list(subjobs_still_running.keys()))}")
-                time.sleep(5)
-            logger.info("All subjobs complete")
+            from lib.distributed_job_manager import DistributedJobManager
+            DistributedJobManager.wait_for_subjobs(local_job=self)
         # Post Render Work
         logger.debug("Starting post-processing work")


@@ -124,12 +124,20 @@ class BlenderRenderWorker(BaseRenderWorker):
         output_dir_files = os.listdir(os.path.dirname(self.output_path))
         if self.total_frames > 1 and len(output_dir_files) > 1 and not self.parent:
             logger.info("Generating preview for image sequence")
+            # Calculate what the real start frame # is if we have child objects
+            start_frame = self.start_frame
+            if self.children:
+                min_child_frame = min(int(child["start_frame"]) for child in self.children.values())
+                start_frame = min(min_child_frame, self.start_frame)
+            logger.debug(f"Post processing start frame #{start_frame}")
             try:
                 pattern = os.path.splitext(self.output_path)[0] + "_%04d" + most_common_extension(output_dir_files)
                 image_sequence_to_video(source_glob_pattern=pattern,
                                         output_path=self.output_path + '.mov',
                                         framerate=self.scene_info['fps'],
-                                        start_frame=self.start_frame)
+                                        start_frame=start_frame)
                 logger.info('Successfully generated preview video from image sequence')
             except Exception as e:
                 logger.error(f'Error generating video from image sequence: {e}')
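
The start-frame lookup above matters because, after a split, the parent may keep only the tail of the frame range while the first frames on disk come from a child. A small worked sketch with invented child data mirrors that logic:

# Parent kept frames 76-100; children rendered 1-40 and 41-75.
parent_start_frame = 76
children = {'a@kamino.local': {'start_frame': 1},
            'b@deathstar.local': {'start_frame': 41}}
min_child_frame = min(int(child['start_frame']) for child in children.values())
print(min(min_child_frame, parent_start_frame))  # -> 1, the sequence's true first frame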