Multi client jobs (#15)

* Add API to expose whether the RenderQueue can take new jobs for a given renderer and priority (see the usage sketch below)

* Fix issue with calculating Blender percent complete when the frame range does not start at frame 1

* Rename owner / client properties to parent / children

* Add make_ready method to API

* Create and submit subjobs to other servers

* Update make_ready to update children jobs and some misc fixes

* Misc GUI cleanup
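
As a usage sketch, a peer server can probe the new availability endpoint before accepting or splitting a job. The endpoint, query parameters, and response keys below match this diff; the hostname, port, and renderer name are illustrative, and in this codebase the call is wrapped by RenderServerProxy.is_available_for_job():

import requests

# Ask a peer render server whether its queue can take a job for this renderer/priority.
# Hostname and port are placeholders; renderer must be one of RenderWorkerFactory.supported_renderers().
resp = requests.get("http://render-node-02:5000/api/is_available_for_job",
                    params={"renderer": "blender", "priority": 50})
resp.raise_for_status()
data = resp.json()  # e.g. {"is_available": true, "renderer": "blender", "priority": "50"}
if data["is_available"]:
    print("Peer can take the job")
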
2023-06-15 02:01:50 -05:00
parent 78a389080c
commit 69715e8afa
10 changed files with 215 additions and 71 deletions


@@ -20,10 +20,11 @@ from flask import Flask, request, render_template, send_file, after_this_request
from werkzeug.utils import secure_filename
from lib.render_queue import RenderQueue, JobNotFoundError
from lib.server.server_proxy import RenderServerProxy
from lib.server.zeroconf_server import ZeroconfServer
from lib.utilities.server_helper import generate_thumbnail_for_job
from lib.workers.base_worker import string_to_status, RenderStatus
from lib.workers.worker_factory import RenderWorkerFactory
logger = logging.getLogger()
server = Flask(__name__, template_folder='templates', static_folder='static')
@@ -186,6 +187,23 @@ def get_file_list(job_id):
return RenderQueue.job_with_id(job_id).file_list()
@server.get('/api/job/<job_id>/make_ready')
def make_job_ready(job_id):
try:
found_job = RenderQueue.job_with_id(job_id)
if found_job.status in [RenderStatus.NOT_READY, RenderStatus.NOT_STARTED]:
if found_job.children:
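# children is a comma-separated list of "<job_id>@<hostname>" entries, one per subjob
# (the format written by create_subjobs below); each child is made ready on its own server first.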
for child_name in found_job.children.split(','):
child_id, hostname = child_name.split('@')
RenderServerProxy(hostname).request_data(f'/api/job/{child_id}/make_ready')
found_job.status = RenderStatus.NOT_STARTED
RenderQueue.save_state()
return found_job.json(), 200
except Exception as e:
return "Error making job ready: {e}", 500
return "Not valid command", 405
@server.route('/api/job/<job_id>/download_all')
def download_all(job_id):
zip_filename = None
@@ -239,9 +257,31 @@ def snapshot():
return server_data
@server.get('/api/_detected_clients')
def detected_clients():
# todo: dev/debug only. Should not ship this - probably.
return server.config['ZEROCONF_SERVER'].found_clients()
@server.route('/api/is_available_for_job', methods=['POST', 'GET'])
def available_for_job():
"""
Check queue to see if it can take a job with a given renderer and priority
"""
renderer = request.args.get('renderer')
priority = request.args.get('priority')
if not renderer or not priority:
return {"error": "Both 'renderer' and 'priority' parameters are required"}, 400
elif renderer not in RenderWorkerFactory.supported_renderers():
return {"error": f"Unsupported renderer: {renderer}"}, 400
else:
return {"is_available": RenderQueue.is_available_for_job(renderer, priority),
'renderer': renderer, 'priority': priority}, 200
@server.post('/api/add_job')
def add_job_handler():
# initial handling of raw data
try:
if request.is_json:
@@ -253,11 +293,11 @@ def add_job_handler():
form_dict = {k: v for k, v in dict(request.form).items() if v}
args = {}
arg_keys = [k for k in form_dict.keys() if '-arg_' in k]
for server_hostname in arg_keys:
if form_dict['renderer'] in server_hostname or 'AnyRenderer' in server_hostname:
cleaned_key = server_hostname.split('-arg_')[-1]
args[cleaned_key] = form_dict[server_hostname]
form_dict.pop(server_hostname)
args['raw'] = form_dict.get('raw_args', None)
form_dict['args'] = args
jobs_list = [form_dict]
@@ -269,6 +309,7 @@ def add_job_handler():
# start handling project files
try:
# handle uploaded files
logger.debug(f"Incoming new job request: {jobs_list}")
uploaded_project = request.files.get('file', None)
project_url = jobs_list[0].get('url', None)
input_path = jobs_list[0].get('input_path', None)
@@ -342,31 +383,36 @@ def add_job_handler():
# create and add jobs to render queue
results = []
for job_data in jobs_list:
try:
# prepare output paths
output_dir = os.path.join(job_dir, job_data.get('name', None) or 'output')
os.makedirs(output_dir, exist_ok=True)
# get new output path in output_dir
job_data['output_path'] = os.path.join(output_dir, os.path.basename(
job_data.get('name', None) or job_data.get('output_path', None) or loaded_project_local_path
))
# create & configure jobs
worker = RenderWorkerFactory.create_worker(renderer=job_data['renderer'],
input_path=loaded_project_local_path,
output_path=job_data["output_path"],
args=job_data.get('args', {}))
worker.status = job_data.get("initial_status", worker.status)
worker.parent = job_data.get("parent", worker.parent)
worker.name = job_data.get("name", worker.name)
worker.priority = int(job_data.get('priority', worker.priority))
worker.start_frame = job_data.get("start_frame", worker.start_frame)
worker.end_frame = job_data.get("end_frame", worker.end_frame)
# determine if we can / should split the job
if server.config.get('enable_split_jobs', False) and (worker.total_frames > 1) and not worker.parent:
create_subjobs(worker, job_data, loaded_project_local_path)
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
make_job_ready(worker.id)
results.append(worker.json())
except Exception as e:
err_msg = f"Error creating render job: {e}"
logger.error(err_msg)
@@ -388,6 +434,89 @@ def add_job_handler():
return 'unknown error', 500
def create_subjobs(worker, job_data, project_path):
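"""Split worker's frame range across the available servers and submit the remote portions as subjobs."""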
# Check availability
local_hostname = server.config['HOSTNAME']
found_servers = [x for x in server.config['ZEROCONF_SERVER'].found_clients() if local_hostname not in x]
available_servers = [local_hostname] + [hostname for hostname in found_servers if
RenderServerProxy(hostname).is_available_for_job(renderer=worker.renderer,
priority=worker.priority)]
if len(available_servers) <= 1:
logger.debug("No available servers to split job with. Skipping subjob creation.")
return
logger.info(f"Found {len(available_servers) - 1} additional available servers | "
f"Breaking up job into {len(available_servers)} jobs")
logger.debug(f"Available servers: {available_servers}")
def divide_frames(start_frame, end_frame, num_servers):
frame_range = end_frame - start_frame + 1
frames_per_server = frame_range // num_servers
leftover_frames = frame_range % num_servers
ranges = []
current_start = start_frame
for i in range(num_servers):
current_end = current_start + frames_per_server - 1
if leftover_frames > 0:
current_end += 1
leftover_frames -= 1
if current_start <= current_end:
ranges.append((current_start, current_end))
current_start = current_end + 1
return ranges
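# Worked example: divide_frames(1, 100, 3) -> [(1, 34), (35, 67), (68, 100)];
# the 100 % 3 = 1 leftover frame is given to the first server.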
# Calculate respective frames for each server
server_frame_ranges = {}
for idx, frame_range in enumerate(divide_frames(worker.start_frame, worker.end_frame, len(available_servers))):
server_frame_ranges[available_servers[idx]] = frame_range
logger.info(f"Job {worker.id} split plan: {server_frame_ranges}")
# Prep and submit these sub-jobs
submission_results = {}
try:
for server_hostname, frame_range in server_frame_ranges.items():
if server_hostname != local_hostname:
subjob = job_data.copy()
subjob['name'] = f"{worker.name}[{frame_range[0]}-{frame_range[-1]}]"
subjob['parent'] = f"{worker.id}@{local_hostname}"
subjob['start_frame'] = frame_range[0]
subjob['end_frame'] = frame_range[-1]
logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
f"{subjob['end_frame']} to {server_hostname}")
post_results = RenderServerProxy(server_hostname).post_job_to_server(
input_path=project_path, job_list=[subjob])
if post_results.ok:
submission_results[server_hostname] = post_results.json()[0]
else:
logger.error(f"Failed to create subjob on {server_hostname}")
break
# check that job posts were all successful.
if len(submission_results) != (len(server_frame_ranges) - 1):
raise ValueError("Failed to create all subjobs")  # look into recalculating job numbers and using the existing jobs
# truncate parent render_job
worker.end_frame = min(server_frame_ranges[local_hostname][-1], worker.end_frame)
logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
# start subjobs
logger.debug(f"Starting {len(server_frame_ranges) - 1} attempted subjobs")
worker.children = ",".join([f"{results['id']}@{hostname}" for hostname, results in submission_results.items()])
worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"
except Exception as e:
# cancel all the subjobs
logger.error(f"Failed to split job into subjobs: {e}")
logger.debug(f"Cancelling {len(server_frame_ranges) - 1} attempted subjobs")
[RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in submission_results.items()]
@server.get('/api/job/<job_id>/cancel')
def cancel_job(job_id):
if not request.args.get('confirm', False):
@@ -506,6 +635,7 @@ def start_server(background_thread=False):
server.config['UPLOAD_FOLDER'] = os.path.expanduser(config['upload_folder'])
server.config['THUMBS_FOLDER'] = os.path.join(os.path.expanduser(config['upload_folder']), 'thumbs')
server.config['MAX_CONTENT_PATH'] = config['max_content_path']
server.config['enable_split_jobs'] = config.get('enable_split_jobs', False)
# disable most Flask logging
flask_log = logging.getLogger('werkzeug')
@@ -520,6 +650,7 @@ def start_server(background_thread=False):
logging.info(f"Starting Zordon Render Server - Hostname: '{server.config['HOSTNAME']}:'")
zeroconf_server = ZeroconfServer("_zordon._tcp.local.", server.config['HOSTNAME'], server.config['PORT'])
zeroconf_server.start()
server.config['ZEROCONF_SERVER'] = zeroconf_server
try:
if background_thread:
@@ -531,4 +662,5 @@ def start_server(background_thread=False):
server.run(host='0.0.0.0', port=server.config['PORT'], debug=config.get('flask_debug_enable', False),
use_reloader=False, threaded=True)
finally:
RenderQueue.save_state()
zeroconf_server.stop()