Refactor: DistributedJobManager with pub/sub status change notifications (#25)

* Add pubsub to render_queue and base_worker

* Refactor: Convert ZeroconfServer to Singleton with Class Methods

* New API for subjob servers to notify parent job servers of status changes

* Refactor: Move all subjob related methods to distributed_job_manager.py

* Rewrite for wait_for_subjobs

* Fix: DistributedJobManager.find_available_servers() takes 1 positional argument but 3 were given

* DistributedJobManager should now notify / be notified about background job changes

* Fix the make_ready api. Change children keyname to be id@hostname so it can be unique

* Fixes

* Image sequence to movie needs to find the actual start frame

* Fix: subjob_status_change did not return a valid response

* Fix client renderer selection

* Small fix for subjob status checking

* Fix issue with divide_frames_equally

* Fix issue where downloads were not occurring

* Fix issue where old status was being reported

* Add docstrings and code cleanup
This commit is contained in:
2023-06-30 19:49:57 -05:00
committed by GitHub
parent 0b0b410e76
commit 34fbdaa4d9
12 changed files with 503 additions and 255 deletions

View File

@@ -1,9 +1,13 @@
#!/usr/bin/env python3
import json
import logging
import os
import pathlib
import platform
import shutil
import socket
import ssl
import threading
import time
import zipfile
from datetime import datetime
@@ -11,13 +15,16 @@ from urllib.request import urlretrieve
from zipfile import ZipFile
import json2html
import psutil
import yaml
from flask import Flask, request, render_template, send_file, after_this_request, Response, redirect, url_for, abort
from werkzeug.utils import secure_filename
from lib.distributed_job_manager import DistributedJobManager
from lib.render_queue import RenderQueue, JobNotFoundError
from lib.server.server_proxy import RenderServerProxy
from lib.server.zeroconf_server import ZeroconfServer
from lib.utilities.server_helper import *
from lib.utilities.server_helper import generate_thumbnail_for_job
from lib.workers.base_worker import string_to_status, RenderStatus
from lib.workers.worker_factory import RenderWorkerFactory
@@ -157,6 +164,17 @@ def filtered_jobs_json(status_val):
return f'Cannot find jobs with status {status_val}', 400
# Route: a child (subjob) server POSTs here to tell this (parent) server
# that one of the parent job's subjobs changed status.
@server.post('/api/job/<job_id>/notify_parent_of_status_change')
def subjob_status_change(job_id):
# job_id is the *parent* job's ID on this server; the JSON request body
# carries the subjob's details, including its new 'status'.
try:
subjob_details = request.json
logger.info(f"Subjob to job id: {job_id} is now {subjob_details['status']}")
# Delegate aggregation/bookkeeping of child statuses to the manager.
DistributedJobManager.handle_subjob_status_change(RenderQueue.job_with_id(job_id), subjob_data=subjob_details)
# Bare 200: the notifying child only needs an acknowledgement body-less reply.
return Response(status=200)
except JobNotFoundError:
# RenderQueue.job_with_id raised: no job with this ID on this server.
return "Job not found", 404
# Flask-wide error handler: any route that lets JobNotFoundError propagate
# gets converted into a 400 response naming the missing job ID.
@server.errorhandler(JobNotFoundError)
def handle_job_not_found(job_error):
# job_error.job_id is set by the raiser (see lib.render_queue.JobNotFoundError).
return f'Cannot find job with ID {job_error.job_id}', 400
@@ -187,10 +205,12 @@ def get_file_list(job_id):
def make_job_ready(job_id):
try:
found_job = RenderQueue.job_with_id(job_id)
if found_job.status in [RenderStatus.NOT_READY, RenderStatus.NOT_STARTED]:
if found_job.status in [RenderStatus.CONFIGURING, RenderStatus.NOT_STARTED]:
if found_job.children:
for hostname, child_id in found_job.children.items():
RenderServerProxy(hostname).request_data(f'/api/job/<child_id>/make_ready')
for child_key in found_job.children.keys():
child_id = child_key.split('@')[0]
hostname = child_key.split('@')[-1]
RenderServerProxy(hostname).request_data(f'job/{child_id}/make_ready')
found_job.status = RenderStatus.NOT_STARTED
RenderQueue.save_state()
return found_job.json(), 200
@@ -255,7 +275,7 @@ def snapshot():
# Debug route: list peer render servers discovered via Zeroconf.
@server.get('/api/_detected_clients')
def detected_clients():
# todo: dev/debug only. Should not ship this - probably.
# NOTE(review): the next two lines are the pre-/post-change pair from the
# rendered diff (per-instance lookup vs. singleton class method); only the
# ZeroconfServer.found_clients() form is live after this commit — confirm
# against the applied file.
return server.config['ZEROCONF_SERVER'].found_clients()
return ZeroconfServer.found_clients()
@server.post('/api/add_job')
@@ -312,8 +332,9 @@ def add_job_handler():
return "Cannot find any valid project paths", 400
# prep local filepath
job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '_'.join(
[datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, os.path.splitext(referred_name)[0]]))
cleaned_path_name = os.path.splitext(referred_name)[0].replace(' ', '_')
job_dir = os.path.join(server.config['UPLOAD_FOLDER'], '-'.join(
[datetime.now().strftime("%Y.%m.%d_%H.%M.%S"), renderer, cleaned_path_name]))
os.makedirs(job_dir, exist_ok=True)
upload_dir = os.path.join(job_dir, 'source')
os.makedirs(upload_dir, exist_ok=True)
@@ -387,14 +408,15 @@ def add_job_handler():
# determine if we can / should split the job
if server.config.get('enable_split_jobs', False) and (worker.total_frames > 1) and not worker.parent:
create_subjobs(worker, job_data, zip_path or loaded_project_local_path)
DistributedJobManager.split_into_subjobs(worker, job_data, zip_path or loaded_project_local_path)
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
make_job_ready(worker.id)
if not worker.parent:
make_job_ready(worker.id)
results.append(worker.json())
except Exception as e:
err_msg = f"Error creating render job: {e}"
logger.error(err_msg)
err_msg = f"Exception creating render job: {e}"
logger.exception(err_msg)
results.append({'error': err_msg})
# return any errors from results list
@@ -413,61 +435,6 @@ def add_job_handler():
return 'unknown error', 500
# Split one render job across the other servers found on the network.
# NOTE(review): this commit deletes this function in favour of
# DistributedJobManager.split_into_subjobs (see the add_job_handler hunk).
def create_subjobs(worker, job_data, project_path):
# worker: the local render worker whose frame range will be divided.
# job_data: the raw job dict the client submitted; copied per subjob.
# project_path: zip or project file to upload to each subjob server.
# Check availability
local_hostname = server.config['HOSTNAME']
found_servers = [x for x in server.config['ZEROCONF_SERVER'].found_clients() if local_hostname not in x]
# Each entry in subjob_servers is expected to carry 'hostname' and
# 'frame_range' — presumably produced by find_available_servers; verify.
subjob_servers = find_available_servers(found_servers, worker.renderer, worker.start_frame, worker.end_frame)
# Prep and submit these sub-jobs
logger.info(f"Job {worker.id} split plan: {subjob_servers}")
# NOTE(review): submission_results is initialized here but never written to;
# results are stored on each server_data dict instead. The cleanup in the
# except clause below therefore iterates an always-empty dict and cancels
# nothing — looks like a bug in the deleted code.
submission_results = {}
try:
for server_data in subjob_servers:
server_hostname = server_data['hostname']
if server_hostname != local_hostname:
# Remote slice: clone the job dict and narrow it to this
# server's frame range, tagging it with the parent's id@host.
subjob = job_data.copy()
subjob['name'] = f"{worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
subjob['parent'] = f"{worker.id}@{local_hostname}"
subjob['start_frame'] = server_data['frame_range'][0]
subjob['end_frame'] = server_data['frame_range'][-1]
logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
f"{subjob['end_frame']} to {server_hostname}")
post_results = RenderServerProxy(server_hostname).post_job_to_server(
file_path=project_path, job_list=[subjob])
if post_results.ok:
server_data['submission_results'] = post_results.json()[0]
else:
# One failed post aborts the whole split; the all(...) check
# below then raises and triggers cleanup.
logger.error(f"Failed to create subjob on {server_hostname}")
break
else:
# truncate parent render_job
worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
server_data['submission_results'] = worker.json()
# check that job posts were all successful.
if not all(d.get('submission_results') is not None for d in subjob_servers):
raise ValueError("Failed to create all subjobs") # look into recalculating job numbers and use existing jobs
# start subjobs
logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
for server_data in subjob_servers:
if server_data['hostname'] != local_hostname:
# Record the child under its hostname so the parent can poll it.
worker.children[server_data['hostname']] = server_data['submission_results']['id']
worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"
except Exception as e:
# cancel all the subjobs
logger.error(f"Failed to split job into subjobs: {e}")
logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
# NOTE(review): no-op in practice — submission_results is never populated
# above, so no remote subjob actually gets cancelled on failure.
[RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in submission_results.items()]
@server.get('/api/job/<job_id>/cancel')
def cancel_job(job_id):
if not request.args.get('confirm', False):
@@ -534,10 +501,11 @@ def status():
renderer_data = {}
for render_class in RenderWorkerFactory.supported_classes():
renderer_data[render_class.engine.name()] = \
{'version': render_class.engine.version(),
'is_ready': RenderQueue.is_available_for_job(render_class.engine.name())
}
if render_class.engine.renderer_path(): # only return renderers installed on host
renderer_data[render_class.engine.name()] = \
{'version': render_class.engine.version(),
'is_available': RenderQueue.is_available_for_job(render_class.engine.name())
}
return {"timestamp": datetime.now().isoformat(),
"platform": platform.platform(),
@@ -559,12 +527,12 @@ def renderer_info():
renderer_data = {}
for r in RenderWorkerFactory.supported_renderers():
engine = RenderWorkerFactory.class_for_name(r).engine
engine_available = engine.renderer_path() is not None
renderer_data[r] = {'available': engine_available,
'version': engine.version() if engine_available else None,
'supported_extensions': engine.supported_extensions,
'supported_export_formats': engine.get_output_formats() if engine_available else None,
'path': engine.renderer_path()}
if engine.renderer_path():
renderer_data[r] = {'is_available': RenderQueue.is_available_for_job(engine.name()),
'version': engine.version(),
'supported_extensions': engine.supported_extensions,
'supported_export_formats': engine.get_output_formats(),
'path': engine.renderer_path()}
return renderer_data
@@ -602,15 +570,15 @@ def start_server(background_thread=False):
flask_log.setLevel(config.get('flask_log_level', 'ERROR').upper())
# Set up the RenderQueue object
RenderQueue.load_state()
RenderQueue.start_queue()
DistributedJobManager.start()
thread = threading.Thread(target=eval_loop, kwargs={'delay_sec': config.get('queue_eval_seconds', 1)}, daemon=True)
thread.start()
logging.info(f"Starting Zordon Render Server - Hostname: '{server.config['HOSTNAME']}:'")
zeroconf_server = ZeroconfServer("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
zeroconf_server.start()
server.config['ZEROCONF_SERVER'] = zeroconf_server
ZeroconfServer.configure("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
ZeroconfServer.start()
try:
if background_thread:
@@ -623,4 +591,4 @@ def start_server(background_thread=False):
use_reloader=False, threaded=True)
finally:
RenderQueue.save_state()
zeroconf_server.stop()
ZeroconfServer.stop()