Zordon/src/distributed_job_manager.py
Brett e757506787 Parent creates local subjobs instead of truncating original (#95) · 2024-08-10

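"""
DistributedJobManager coordinates distributed rendering: it creates render jobs, splits them into
subjobs across render servers discovered via Zeroconf, tracks subjob status through pubsub messages
and server polling, and downloads finished frames back to the parent host.
"""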
import logging
import os
import socket
import threading
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor
import requests
from plyer import notification
from pubsub import pub
from src.api.preview_manager import PreviewManager
from src.api.server_proxy import RenderServerProxy
from src.engines.engine_manager import EngineManager
from src.render_queue import RenderQueue
from src.utilities.config import Config
from src.utilities.misc_helper import get_file_size_human
from src.utilities.status_utils import RenderStatus, string_to_status
from src.utilities.zeroconf_server import ZeroconfServer
logger = logging.getLogger()
class DistributedJobManager:
def __init__(self):
pass
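        # Note: instances hold no state; all functionality is exposed through the class and static
        # methods below.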
@classmethod
def subscribe_to_listener(cls):
"""
        Subscribes the private class methods '__local_job_status_changed' and '__local_job_frame_complete'
        to the 'status_change' and 'frame_complete' pubsub messages, respectively.
This should be called once, typically during the initialization phase.
"""
pub.subscribe(cls.__local_job_status_changed, 'status_change')
pub.subscribe(cls.__local_job_frame_complete, 'frame_complete')
@classmethod
def __local_job_frame_complete(cls, job_id, frame_number, update_interval=5):
"""
Responds to the 'frame_complete' pubsub message for local jobs.
Parameters:
            job_id (str): The ID of the job that completed a frame.
            frame_number (int): The number of the frame that was just completed.
            update_interval (int): Existing previews are replaced every 'update_interval' frames.
Note: Do not call directly. Instead, call via the 'frame_complete' pubsub message.
"""
render_job = RenderQueue.job_with_id(job_id, none_ok=True)
if not render_job: # ignore jobs not in the queue
return
logger.debug(f"Job {job_id} has completed frame #{frame_number}")
replace_existing_previews = (frame_number % update_interval) == 0
cls.__job_update_shared(render_job, replace_existing_previews)
@classmethod
def __job_update_shared(cls, render_job, replace_existing_previews=False):
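        """
        Shared update path for local jobs: refreshes preview images and, if this job is a child
        (render_job.parent is set), notifies the parent host so completed frames can be copied over
        as they finish rather than only at the end of the job.
        """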
# update previews
PreviewManager.update_previews_for_job(job=render_job, replace_existing=replace_existing_previews)
# notify parent to allow individual frames to be copied instead of waiting until the end
if render_job.parent:
parent_id, parent_hostname = render_job.parent.split('@')[0], render_job.parent.split('@')[-1]
try:
logger.debug(f'Job {render_job.id} updating parent {parent_id}@{parent_hostname}')
RenderServerProxy(parent_hostname).send_subjob_update_notification(parent_id, render_job)
except Exception as e:
logger.error(f"Error notifying parent {parent_hostname} about update in subjob {render_job.id}: {e}")
@classmethod
def __local_job_status_changed(cls, job_id, old_status, new_status):
"""
Responds to the 'status_change' pubsub message for local jobs.
If it's a child job, it notifies the parent job about the status change.
Parameters:
job_id (str): The ID of the job that has changed status.
old_status (str): The previous status of the job.
new_status (str): The new (current) status of the job.
Note: Do not call directly. Instead, call via the 'status_change' pubsub message.
"""
render_job = RenderQueue.job_with_id(job_id, none_ok=True)
if not render_job: # ignore jobs created but not yet added to queue
return
logger.debug(f"Job {job_id} status change: {old_status} -> {new_status}")
cls.__job_update_shared(render_job, replace_existing_previews=(render_job.status == RenderStatus.COMPLETED))
# Handle children
if render_job.children:
if new_status in [RenderStatus.CANCELLED, RenderStatus.ERROR]: # Cancel children if necessary
for child in render_job.children:
child_id, child_hostname = child.split('@')
RenderServerProxy(child_hostname).cancel_job(child_id, confirm=True)
# UI Notifications
try:
if new_status == RenderStatus.COMPLETED:
logger.debug("Show render complete notification")
notification.notify(
title='Render Job Complete',
                    message=f'{render_job.name} completed successfully',
timeout=10 # Display time in seconds
)
elif new_status == RenderStatus.ERROR:
logger.debug("Show render error notification")
notification.notify(
title='Render Job Failed',
message=f'{render_job.name} failed rendering',
timeout=10 # Display time in seconds
)
elif new_status == RenderStatus.RUNNING:
logger.debug("Show render started notification")
notification.notify(
title='Render Job Started',
message=f'{render_job.name} started rendering',
timeout=10 # Display time in seconds
)
except Exception as e:
logger.debug(f"Unable to show UI notification: {e}")
# --------------------------------------------
# Create Job
# --------------------------------------------
@classmethod
def create_render_job(cls, job_data, loaded_project_local_path):
"""
Creates render jobs.
        This method takes job data and a local path to a loaded project, then creates and returns a new render job.
Args:
job_data (dict): Job data.
loaded_project_local_path (str): The local path to the loaded project.
Returns:
worker: Created job worker
"""
# get new output path in output_dir
output_path = job_data.get('output_path')
if not output_path:
loaded_project_filename = os.path.basename(loaded_project_local_path)
output_filename = os.path.splitext(loaded_project_filename)[0]
else:
output_filename = os.path.basename(output_path)
# Prepare output path
output_dir = os.path.join(os.path.dirname(os.path.dirname(loaded_project_local_path)), 'output')
output_path = os.path.join(output_dir, output_filename)
os.makedirs(output_dir, exist_ok=True)
logger.debug(f"New job output path: {output_path}")
# create & configure jobs
worker = EngineManager.create_worker(renderer=job_data['renderer'],
input_path=loaded_project_local_path,
output_path=output_path,
engine_version=job_data.get('engine_version'),
args=job_data.get('args', {}),
parent=job_data.get('parent'),
name=job_data.get('name'))
worker.status = job_data.get("initial_status", worker.status) # todo: is this necessary?
worker.priority = int(job_data.get('priority', worker.priority))
worker.start_frame = int(job_data.get("start_frame", worker.start_frame))
worker.end_frame = int(job_data.get("end_frame", worker.end_frame))
worker.watchdog_timeout = Config.worker_process_timeout
worker.hostname = socket.gethostname()
# determine if we can / should split the job
if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path)
else:
worker.status = RenderStatus.NOT_STARTED
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
PreviewManager.update_previews_for_job(worker)
return worker
# --------------------------------------------
# Handling Subjobs
# --------------------------------------------
@classmethod
def handle_subjob_update_notification(cls, local_job, subjob_data):
"""
        Responds to an update notification from a remote subjob: caches the subjob data on the parent
        job and downloads any newly available frames from the subjob's host.
Args:
local_job (BaseRenderWorker): The local parent job worker.
subjob_data (dict): Subjob data sent from the remote server.
"""
subjob_status = string_to_status(subjob_data['status'])
subjob_id = subjob_data['id']
subjob_hostname = subjob_data['hostname']
subjob_key = f'{subjob_id}@{subjob_hostname}'
old_status = local_job.children.get(subjob_key, {}).get('status')
local_job.children[subjob_key] = subjob_data
logname = f"<Parent: {local_job.id} | Child: {subjob_key}>"
if old_status != subjob_status.value:
logger.debug(f"Subjob status changed: {logname} -> {subjob_status.value}")
download_success = cls.download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname)
if subjob_data['status'] == 'completed' and download_success:
local_job.children[subjob_key]['download_status'] = 'completed'
@staticmethod
def download_missing_frames_from_subjob(local_job, subjob_id, subjob_hostname):
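        """
        Compares the parent job's local file list against the subjob's files on the remote server and
        downloads any files not yet present locally.
        Returns:
            bool: True if every missing file downloaded successfully, False if any download failed.
        """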
success = True
try:
local_files = [os.path.basename(x) for x in local_job.file_list()]
subjob_proxy = RenderServerProxy(subjob_hostname)
subjob_files = subjob_proxy.get_job_files_list(job_id=subjob_id) or []
for subjob_filename in subjob_files:
if subjob_filename not in local_files:
try:
logger.debug(f"Downloading new file '{subjob_filename}' from {subjob_hostname}")
local_save_path = os.path.join(os.path.dirname(local_job.output_path), subjob_filename)
subjob_proxy.download_job_file(job_id=subjob_id, job_filename=subjob_filename,
save_path=local_save_path)
logger.debug(f'Downloaded successfully - {local_save_path}')
except Exception as e:
logger.error(f"Error downloading file '{subjob_filename}' from {subjob_hostname}: {e}")
success = False
except Exception as e:
logger.exception(f'Uncaught exception while trying to download from subjob: {e}')
success = False
return success
@staticmethod
def download_all_from_subjob(local_job, subjob_id, subjob_hostname):
"""
Downloads and extracts files from a completed subjob on a remote server.
Parameters:
local_job (BaseRenderWorker): The local parent job worker.
subjob_id (str or int): The ID of the subjob.
subjob_hostname (str): The hostname of the remote server where the subjob is located.
Returns:
bool: True if the files have been downloaded and extracted successfully, False otherwise.
"""
child_key = f'{subjob_id}@{subjob_hostname}'
logname = f"{local_job.id}:{child_key}"
zip_file_path = local_job.output_path + f'_{subjob_hostname}_{subjob_id}.zip'
# download zip file from server
try:
local_job.children[child_key]['download_status'] = 'working'
logger.info(f"Downloading completed subjob files from {subjob_hostname} to localhost")
RenderServerProxy(subjob_hostname).download_all_job_files(subjob_id, zip_file_path)
logger.info(f"File transfer complete for {logname} - Transferred {get_file_size_human(zip_file_path)}")
except Exception as e:
logger.error(f"Error downloading files from remote server: {e}")
local_job.children[child_key]['download_status'] = 'failed'
return False
# extract zip
try:
logger.debug(f"Extracting zip file: {zip_file_path}")
extract_path = os.path.dirname(zip_file_path)
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
logger.info(f"Successfully extracted zip to: {extract_path}")
os.remove(zip_file_path)
local_job.children[child_key]['download_status'] = 'complete'
except Exception as e:
logger.exception(f"Exception extracting zip file: {e}")
local_job.children[child_key]['download_status'] = 'failed'
return local_job.children[child_key].get('download_status', None) == 'complete'
@classmethod
def wait_for_subjobs(cls, parent_job):
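        """
        Marks the parent job as WAITING_FOR_SUBJOBS and polls each child's host every 'server_delay'
        seconds, downloading finished frames as they become available. Children that finish but cannot
        be downloaded are marked as skipped. Once no children remain to download, the parent job is
        set back to RUNNING and the method returns.
        """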
logger.debug(f"Waiting for subjobs for job {parent_job}")
parent_job.status = RenderStatus.WAITING_FOR_SUBJOBS
statuses_to_download = [RenderStatus.CANCELLED, RenderStatus.ERROR, RenderStatus.COMPLETED]
def subjobs_not_downloaded():
return {k: v for k, v in parent_job.children.items() if 'download_status' not in v or
v['download_status'] == 'working' or v['download_status'] is None}
logger.info(f'Waiting on {len(subjobs_not_downloaded())} subjobs for {parent_job.id}')
server_delay = 10
sleep_counter = 0
while parent_job.status == RenderStatus.WAITING_FOR_SUBJOBS:
if sleep_counter % server_delay == 0: # only ping servers every x seconds
for child_key, subjob_cached_data in subjobs_not_downloaded().items():
subjob_id = child_key.split('@')[0]
subjob_hostname = child_key.split('@')[-1]
# Fetch info from server and handle failing case
subjob_data = RenderServerProxy(subjob_hostname).get_job_info(subjob_id)
if not subjob_data:
logger.warning(f"No response from {subjob_hostname}")
# timeout / missing server situations
parent_job.children[child_key]['download_status'] = f'error: No response from {subjob_hostname}'
continue
# Update parent job cache but keep the download status
download_status = parent_job.children[child_key].get('download_status', None)
parent_job.children[child_key] = subjob_data
parent_job.children[child_key]['download_status'] = download_status
status = string_to_status(subjob_data.get('status', ''))
status_msg = f"Subjob {child_key} | {status} | " \
f"{float(subjob_data.get('percent_complete')) * 100.0}%"
logger.debug(status_msg)
                    # Check if job is finished but has not had its files copied over yet
if download_status is None and subjob_data['file_count'] and status in statuses_to_download:
try:
cls.download_missing_frames_from_subjob(parent_job, subjob_id, subjob_hostname)
parent_job.children[child_key]['download_status'] = 'complete'
except Exception as e:
logger.error(f"Error downloading missing frames from subjob: {e}")
                            parent_job.children[child_key]['download_status'] = f'error: {e}'
# Any finished jobs not successfully downloaded at this point are skipped
if parent_job.children[child_key].get('download_status', None) is None and \
status in statuses_to_download:
logger.warning(f"Skipping waiting on downloading from subjob: {child_key}")
parent_job.children[child_key]['download_status'] = 'skipped'
if subjobs_not_downloaded():
logger.debug(f"Waiting on {len(subjobs_not_downloaded())} subjobs on "
f"{', '.join(list(subjobs_not_downloaded().keys()))}")
time.sleep(1)
sleep_counter += 1
else: # exit the loop
parent_job.status = RenderStatus.RUNNING
# --------------------------------------------
# Creating Subjobs
# --------------------------------------------
@classmethod
def split_into_subjobs_async(cls, parent_worker, job_data, project_path, system_os=None):
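        """
        Marks the parent worker as CONFIGURING and runs split_into_subjobs on a background thread so
        job submission is not blocked while servers are queried and subjobs are created.
        """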
# todo: I don't love this
parent_worker.status = RenderStatus.CONFIGURING
cls.background_worker = threading.Thread(target=cls.split_into_subjobs, args=(parent_worker, job_data,
project_path, system_os))
cls.background_worker.start()
@classmethod
def split_into_subjobs(cls, parent_worker, job_data, project_path, system_os=None, specific_servers=None):
"""
Splits a job into subjobs and distributes them among available servers.
This method checks the availability of servers, distributes the work among them, and creates subjobs on each
server. If a server is the local host, it adjusts the frame range of the parent job instead of creating a
subjob.
Args:
parent_worker (Worker): The worker that is handling the job.
job_data (dict): The data for the job to be split.
project_path (str): The path to the project associated with the job.
system_os (str, optional): The operating system of the servers. Default is any OS.
specific_servers (list, optional): List of specific servers to split work between. Defaults to all found.
"""
# Check availability
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer,
system_os)
        # skip if there are no external servers found
external_servers = [x for x in available_servers if x['hostname'] != parent_worker.hostname]
if not external_servers:
parent_worker.status = RenderStatus.NOT_STARTED
return
logger.debug(f"Splitting into subjobs - Available servers: {[x['hostname'] for x in available_servers]}")
all_subjob_server_data = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)
# Prep and submit these sub-jobs
logger.info(f"Job {parent_worker.id} split plan: {all_subjob_server_data}")
try:
for subjob_data in all_subjob_server_data:
subjob_hostname = subjob_data['hostname']
post_results = cls.__create_subjob(job_data, project_path, subjob_data, subjob_hostname,
parent_worker)
if not post_results.ok:
                    raise ValueError(f"Failed to create subjob on {subjob_hostname}")
# save child info
submission_results = post_results.json()[0]
child_key = f"{submission_results['id']}@{subjob_hostname}"
parent_worker.children[child_key] = submission_results
# start subjobs
logger.debug(f"Created {len(all_subjob_server_data)} subjobs successfully")
parent_worker.name = f"{parent_worker.name} (Parent)"
parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts
except Exception as e:
# cancel all the subjobs
logger.error(f"Failed to split job into subjobs: {e}")
logger.debug(f"Cancelling {len(all_subjob_server_data) - 1} attempted subjobs")
RenderServerProxy(parent_worker.hostname).cancel_job(parent_worker.id, confirm=True)
@staticmethod
def __create_subjob(job_data, project_path, server_data, server_hostname, parent_worker):
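        """
        Builds a subjob payload from the parent's job data and the server's assigned frame range, then
        posts it (along with the project file) to the remote server. Returns the HTTP response from
        post_job_to_server.
        """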
subjob = job_data.copy()
subjob['name'] = f"{parent_worker.name}[{server_data['frame_range'][0]}-{server_data['frame_range'][-1]}]"
subjob['parent'] = f"{parent_worker.id}@{parent_worker.hostname}"
subjob['start_frame'] = server_data['frame_range'][0]
subjob['end_frame'] = server_data['frame_range'][-1]
subjob['engine_version'] = parent_worker.renderer_version
logger.debug(f"Posting subjob with frames {subjob['start_frame']}-"
f"{subjob['end_frame']} to {server_hostname}")
post_results = RenderServerProxy(server_hostname).post_job_to_server(
file_path=project_path, job_list=[subjob])
return post_results
# --------------------------------------------
# Server Handling
# --------------------------------------------
@staticmethod
def distribute_server_work(start_frame, end_frame, available_servers, method='cpu_benchmark'):
"""
        Splits the frame range among available servers proportionally based on their performance
        (CPU benchmark score or CPU count, depending on 'method').
Args:
start_frame (int): The start frame number of the animation to be rendered.
end_frame (int): The end frame number of the animation to be rendered.
available_servers (list): A list of available server dictionaries. Each server dictionary should include
'hostname' and 'cpu_count' keys (see find_available_servers).
method (str, optional): Specifies the distribution method. Possible values are 'cpu_benchmark', 'cpu_count'
and 'evenly'.
Defaults to 'cpu_benchmark'.
Returns:
list: A list of server dictionaries where each dictionary includes the frame range and total number of
frames to be rendered by the server.
"""
# Calculate respective frames for each server
def divide_frames_by_cpu_count(frame_start, frame_end, servers):
total_frames = frame_end - frame_start + 1
total_cpus = sum(server['cpu_count'] for server in servers)
frame_ranges = {}
current_frame = frame_start
allocated_frames = 0
for i, server in enumerate(servers):
if i == len(servers) - 1: # if it's the last server
# Give all remaining frames to the last server
num_frames = total_frames - allocated_frames
else:
num_frames = round((server['cpu_count'] / total_cpus) * total_frames)
allocated_frames += num_frames
frame_end_for_server = current_frame + num_frames - 1
if current_frame <= frame_end_for_server:
frame_ranges[server['hostname']] = (current_frame, frame_end_for_server)
current_frame = frame_end_for_server + 1
return frame_ranges
def divide_frames_by_benchmark(frame_start, frame_end, servers):
def fetch_benchmark(server):
try:
benchmark = requests.get(f'http://{server["hostname"]}:{ZeroconfServer.server_port}'
f'/api/cpu_benchmark').text
server['cpu_benchmark'] = benchmark
logger.debug(f'Benchmark for {server["hostname"]}: {benchmark}')
except requests.exceptions.RequestException as e:
logger.error(f'Error fetching benchmark for {server["hostname"]}: {e}')
# Number of threads to use (can adjust based on your needs or number of servers)
threads = len(servers)
with ThreadPoolExecutor(max_workers=threads) as executor:
executor.map(fetch_benchmark, servers)
total_frames = frame_end - frame_start + 1
total_performance = sum(int(server['cpu_benchmark']) for server in servers)
frame_ranges = {}
current_frame = frame_start
allocated_frames = 0
for i, server in enumerate(servers):
if i == len(servers) - 1: # if it's the last server
# Give all remaining frames to the last server
num_frames = total_frames - allocated_frames
else:
num_frames = round((int(server['cpu_benchmark']) / total_performance) * total_frames)
allocated_frames += num_frames
frame_end_for_server = current_frame + num_frames - 1
if current_frame <= frame_end_for_server:
frame_ranges[server['hostname']] = (current_frame, frame_end_for_server)
current_frame = frame_end_for_server + 1
return frame_ranges
def divide_frames_equally(frame_start, frame_end, servers):
frame_range = frame_end - frame_start + 1
frames_per_server = frame_range // len(servers)
leftover_frames = frame_range % len(servers)
frame_ranges = {}
current_start = frame_start
for i, server in enumerate(servers):
current_end = current_start + frames_per_server - 1
if leftover_frames > 0:
current_end += 1
leftover_frames -= 1
if current_start <= current_end:
frame_ranges[server['hostname']] = (current_start, current_end)
current_start = current_end + 1
return frame_ranges
if len(available_servers) == 1:
breakdown = {available_servers[0]['hostname']: (start_frame, end_frame)}
else:
logger.debug(f'Splitting between {len(available_servers)} servers by {method} method')
if method == 'evenly':
breakdown = divide_frames_equally(start_frame, end_frame, available_servers)
elif method == 'cpu_benchmark':
breakdown = divide_frames_by_benchmark(start_frame, end_frame, available_servers)
elif method == 'cpu_count':
breakdown = divide_frames_by_cpu_count(start_frame, end_frame, available_servers)
else:
raise ValueError(f"Invalid distribution method: {method}")
server_breakdown = [server for server in available_servers if breakdown.get(server['hostname']) is not None]
for server in server_breakdown:
server['frame_range'] = breakdown[server['hostname']]
server['total_frames'] = breakdown[server['hostname']][-1] - breakdown[server['hostname']][0] + 1
return server_breakdown
@staticmethod
def find_available_servers(engine_name, system_os=None):
"""
Scan the Zeroconf network for currently available render servers supporting a specific engine.
:param engine_name: str, The engine type to search for
:param system_os: str, Restrict results to servers running a specific OS
:return: A list of dictionaries with each dict containing hostname and cpu_count of available servers
"""
available_servers = []
for hostname in ZeroconfServer.found_hostnames():
host_properties = ZeroconfServer.get_hostname_properties(hostname)
            if not system_os or system_os == host_properties.get('system_os'):
response = RenderServerProxy(hostname).is_engine_available(engine_name)
if response and response.get('available', False):
available_servers.append(response)
return available_servers
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
ZeroconfServer.configure("_zordon._tcp.local.", 'testing', 8080)
ZeroconfServer.start(listen_only=True)
print("Starting Zeroconf...")
time.sleep(2)
available_servers = DistributedJobManager.find_available_servers('blender')
print(f"AVAILABLE SERVERS ({len(available_servers)}): {available_servers}")
# results = DistributedJobManager.distribute_server_work(1, 100, available_servers)
# print(f"RESULTS: {results}")
ZeroconfServer.stop()