Mirror of https://github.com/blw1138/Zordon.git (synced 2025-12-17 16:58:12 +00:00)
Refactor: DistributedJobManager with pub/sub status change notifications (#25)
* Add pubsub to render_queue and base_worker
* Refactor: Convert ZeroconfServer to a singleton with class methods
* New API for subjob servers to notify parent job servers of status changes (see the sketch below)
* Refactor: Move all subjob-related methods to distributed_job_manager.py
* Rewrite of wait_for_subjobs
* Fix: DistributedJobManager.find_available_servers() takes 1 positional argument but 3 were given
* DistributedJobManager now notifies, and is notified about, background job changes
* Fix the make_ready API; change the children key name to id@hostname so it is unique
* Fixes
* Image sequence to movie now finds the actual start frame
* Fix: subjob_status_change did not return a valid response
* Fix client renderer selection
* Small fix for subjob status checking
* Fix issue with divide_frames_equally
* Fix issue where downloads were not occurring
* Fix issue where the old status was being reported
* Add docstrings and code cleanup
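The notify-parent flow added here pairs the new `/api/job/<job_id>/notify_parent_of_status_change` endpoint with the `id@hostname` convention used for a subjob's `parent` field and the parent's `children` keys. Below is a minimal sketch of the calling side only, not code from the repository; the HTTP port, the use of the `requests` library, and the helper name are assumptions for illustration.

```python
# Hypothetical caller-side sketch of the new notify-parent endpoint.
# Assumptions: plain HTTP, port 8080, `requests` available; only the 'status'
# field of the payload is read by the handler shown in this diff.
import requests


def notify_parent_of_status_change(parent_ref: str, subjob_json: dict, port: int = 8080) -> bool:
    """POST this subjob's current state to the server that owns its parent job.

    `parent_ref` is the subjob's `parent` field, formatted as `id@hostname`,
    the same convention the parent uses for its `children` keys.
    """
    parent_id = parent_ref.split('@')[0]
    parent_host = parent_ref.split('@')[-1]
    url = f"http://{parent_host}:{port}/api/job/{parent_id}/notify_parent_of_status_change"
    # The Flask handler reads request.json and logs subjob_details['status'],
    # so the body should at least carry the subjob's id and status.
    response = requests.post(url, json=subjob_json, timeout=10)
    return response.ok  # 200 on success, 404 if the parent job is unknown


# Example: notify_parent_of_status_change("42@parent-host.local", subjob_worker.json())
```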
@@ -1,9 +1,13 @@
#!/usr/bin/env python3
import json
import logging
import os
import pathlib
import platform
import shutil
import socket
import ssl
import threading
import time
import zipfile
from datetime import datetime
@@ -11,13 +15,16 @@ from urllib.request import urlretrieve
from zipfile import ZipFile

import json2html
import psutil
import yaml
from flask import Flask, request, render_template, send_file, after_this_request, Response, redirect, url_for, abort
from werkzeug.utils import secure_filename

from lib.distributed_job_manager import DistributedJobManager
from lib.render_queue import RenderQueue, JobNotFoundError
from lib.server.server_proxy import RenderServerProxy
from lib.server.zeroconf_server import ZeroconfServer
from lib.utilities.server_helper import *
from lib.utilities.server_helper import generate_thumbnail_for_job
from lib.workers.base_worker import string_to_status, RenderStatus
from lib.workers.worker_factory import RenderWorkerFactory

@@ -157,6 +164,17 @@ def filtered_jobs_json(status_val):
    return f'Cannot find jobs with status {status_val}', 400


@server.post('/api/job/<job_id>/notify_parent_of_status_change')
def subjob_status_change(job_id):
    try:
        subjob_details = request.json
        logger.info(f"Subjob to job id: {job_id} is now {subjob_details['status']}")
        DistributedJobManager.handle_subjob_status_change(RenderQueue.job_with_id(job_id), subjob_data=subjob_details)
        return Response(status=200)
    except JobNotFoundError:
        return "Job not found", 404


@server.errorhandler(JobNotFoundError)
def handle_job_not_found(job_error):
    return f'Cannot find job with ID {job_error.job_id}', 400
@@ -187,10 +205,12 @@ def get_file_list(job_id):
def make_job_ready(job_id):
    try:
        found_job = RenderQueue.job_with_id(job_id)
        if found_job.status in [RenderStatus.NOT_READY, RenderStatus.NOT_STARTED]:
        if found_job.status in [RenderStatus.CONFIGURING, RenderStatus.NOT_STARTED]:
            if found_job.children:
                for hostname, child_id in found_job.children.items():
                    RenderServerProxy(hostname).request_data(f'/api/job/<child_id>/make_ready')
                for child_key in found_job.children.keys():
                    child_id = child_key.split('@')[0]
                    hostname = child_key.split('@')[-1]
                    RenderServerProxy(hostname).request_data(f'job/{child_id}/make_ready')
            found_job.status = RenderStatus.NOT_STARTED
            RenderQueue.save_state()
        return found_job.json(), 200
@@ -255,7 +275,7 @@ def snapshot():
@server.get('/api/_detected_clients')
def detected_clients():
    # todo: dev/debug only. Should not ship this - probably.
    return server.config['ZEROCONF_SERVER'].found_clients()
    return ZeroconfServer.found_clients()


@server.post('/api/add_job')
@@ -312,8 +332,9 @@ def add_job_handler():
        return "Cannot find any valid project paths", 400

    # prep local filepath
    job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '_'.join(
        [datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, os.path.splitext(referred_name)[0]]))
    cleaned_path_name = os.path.splitext(referred_name)[0].replace(' ', '_')
    job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '-'.join(
        [datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, cleaned_path_name]))
    os.makedirs(job_dir, exist_ok=True)
    upload_dir = os.path.join(job_dir, 'source')
    os.makedirs(upload_dir, exist_ok=True)
@@ -387,14 +408,15 @@ def add_job_handler():

            # determine if we can / should split the job
            if server.config.get('enable_split_jobs', False) and (worker.total_frames > 1) and not worker.parent:
                create_subjobs(worker, job_data, zip_path or loaded_project_local_path)
                DistributedJobManager.split_into_subjobs(worker, job_data, zip_path or loaded_project_local_path)

            RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
            make_job_ready(worker.id)
            if not worker.parent:
                make_job_ready(worker.id)
            results.append(worker.json())
        except Exception as e:
            err_msg = f"Error creating render job: {e}"
            logger.error(err_msg)
            err_msg = f"Exception creating render job: {e}"
            logger.exception(err_msg)
            results.append({'error': err_msg})

    # return any errors from results list
@@ -413,61 +435,6 @@ def add_job_handler():
    return 'unknown error', 500


def create_subjobs(worker, job_data, project_path):

    # Check availablity
    local_hostname = server.config['HOSTNAME']
    found_servers = [x for x in server.config['ZEROCONF_SERVER'].found_clients() if local_hostname not in x]

    subjob_servers = find_available_servers(found_servers, worker.renderer, worker.start_frame, worker.end_frame)

    # Prep and submit these sub-jobs
    logger.info(f"Job {worker.id} split plan: {subjob_servers}")
    submission_results = {}
    try:
        for server_data in subjob_servers:
            server_hostname = server_data['hostname']
            if server_hostname != local_hostname:
                subjob = job_data.copy()
                subjob['name'] = f"{worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
                subjob['parent'] = f"{worker.id}@{local_hostname}"
                subjob['start_frame'] = server_data['frame_range'][0]
                subjob['end_frame'] = server_data['frame_range'][-1]

                logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
                             f"{subjob['end_frame']} to {server_hostname}")
                post_results = RenderServerProxy(server_hostname).post_job_to_server(
                    file_path=project_path, job_list=[subjob])
                if post_results.ok:
                    server_data['submission_results'] = post_results.json()[0]
                else:
                    logger.error(f"Failed to create subjob on {server_hostname}")
                    break
            else:
                # truncate parent render_job
                worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
                worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
                logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
                server_data['submission_results'] = worker.json()

        # check that job posts were all successful.
        if not all(d.get('submission_results') is not None for d in subjob_servers):
            raise ValueError("Failed to create all subjobs")  # look into recalculating job numbers and use exising jobs

        # start subjobs
        logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
        for server_data in subjob_servers:
            if server_data['hostname'] != local_hostname:
                worker.children[server_data['hostname']] = server_data['submission_results']['id']
        worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"

    except Exception as e:
        # cancel all the subjobs
        logger.error(f"Failed to split job into subjobs: {e}")
        logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
        [RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in submission_results.items()]


@server.get('/api/job/<job_id>/cancel')
def cancel_job(job_id):
    if not request.args.get('confirm', False):
@@ -534,10 +501,11 @@ def status():

    renderer_data = {}
    for render_class in RenderWorkerFactory.supported_classes():
        renderer_data[render_class.engine.name()] = \
            {'version': render_class.engine.version(),
             'is_ready': RenderQueue.is_available_for_job(render_class.engine.name())
             }
        if render_class.engine.renderer_path():  # only return renderers installed on host
            renderer_data[render_class.engine.name()] = \
                {'version': render_class.engine.version(),
                 'is_available': RenderQueue.is_available_for_job(render_class.engine.name())
                 }

    return {"timestamp": datetime.now().isoformat(),
            "platform": platform.platform(),
@@ -559,12 +527,12 @@ def renderer_info():
    renderer_data = {}
    for r in RenderWorkerFactory.supported_renderers():
        engine = RenderWorkerFactory.class_for_name(r).engine
        engine_available = engine.renderer_path() is not None
        renderer_data[r] = {'available': engine_available,
                            'version': engine.version() if engine_available else None,
                            'supported_extensions': engine.supported_extensions,
                            'supported_export_formats': engine.get_output_formats() if engine_available else None,
                            'path': engine.renderer_path()}
        if engine.renderer_path():
            renderer_data[r] = {'is_available': RenderQueue.is_available_for_job(engine.name()),
                                'version': engine.version(),
                                'supported_extensions': engine.supported_extensions,
                                'supported_export_formats': engine.get_output_formats(),
                                'path': engine.renderer_path()}
    return renderer_data

@@ -602,15 +570,15 @@ def start_server(background_thread=False):
    flask_log.setLevel(config.get('flask_log_level', 'ERROR').upper())

    # Set up the RenderQueue object
    RenderQueue.load_state()
    RenderQueue.start_queue()
    DistributedJobManager.start()

    thread = threading.Thread(target=eval_loop, kwargs={'delay_sec': config.get('queue_eval_seconds', 1)}, daemon=True)
    thread.start()

    logging.info(f"Starting Zordon Render Server - Hostname: '{server.config['HOSTNAME']}:'")
    zeroconf_server = ZeroconfServer("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
    zeroconf_server.start()
    server.config['ZEROCONF_SERVER'] = zeroconf_server
    ZeroconfServer.configure("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
    ZeroconfServer.start()

    try:
        if background_thread:
@@ -623,4 +591,4 @@ def start_server(background_thread=False):
                       use_reloader=False, threaded=True)
    finally:
        RenderQueue.save_state()
        zeroconf_server.stop()
        ZeroconfServer.stop()
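The start_server() changes above drop the per-instance zeroconf_server object in favor of class-level calls (ZeroconfServer.configure, start, stop, plus found_clients used in detected_clients()). Below is a minimal sketch of that singleton-with-class-methods shape, under stated assumptions; it is not the repository's implementation, and every attribute and internal detail is invented for illustration.

```python
# Sketch of a "singleton via class methods" service wrapper, mirroring the
# ZeroconfServer call sites in this diff. Internals are assumptions only.
class ZeroconfServerSketch:
    _service_type = None
    _hostname = None
    _port = None
    _running = False
    _clients = []  # hostnames discovered on the local network

    @classmethod
    def configure(cls, service_type, hostname, port):
        # Record service details once; start() uses them to register/browse.
        cls._service_type, cls._hostname, cls._port = service_type, hostname, port

    @classmethod
    def start(cls):
        if not cls._running:
            # A real implementation would register the service and begin browsing here.
            cls._running = True

    @classmethod
    def found_clients(cls):
        return list(cls._clients)

    @classmethod
    def stop(cls):
        # A real implementation would unregister the service and stop browsing.
        cls._running = False


# Usage mirrors start_server() above:
# ZeroconfServerSketch.configure("_zordon._tcp.local.", hostname, port)
# ZeroconfServerSketch.start()
```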