mirror of
https://github.com/blw1138/Zordon.git
synced 2025-12-17 16:58:12 +00:00
* Force start in render queue only starts NOT_STARTED and SCHEDULED jobs
* Refactor adding jobs / subjobs
* Remove dead code
* Fixed issue with bulk job submission
* Cancel job now cancels all subjobs
* Misc fixes
* JSON now returns job hostname
* Add hostname as optional column in DB
* Misc fixes
* Error handling for removing zip file after download
* Clean up imports
* Fixed issue where worker child information would not be saved
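The first bullet describes the new force-start rule: a force start should only touch jobs that have not run yet. A minimal sketch of that guard, assuming the queue exposes each worker's RenderStatus and that a SCHEDULED member exists alongside NOT_STARTED (the function name and start hook are illustrative, not code from this commit):

def force_start_pending(queued_workers):
    # Only never-started or merely scheduled jobs are eligible for a force start.
    startable = (RenderStatus.NOT_STARTED, RenderStatus.SCHEDULED)
    for worker in queued_workers:
        if worker.status in startable:
            worker.start()  # hypothetical start hook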
@@ -1,15 +1,17 @@
import logging
import os
import socket
import threading
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor

import requests
from plyer import notification
from pubsub import pub
from concurrent.futures import ThreadPoolExecutor

from src.api.server_proxy import RenderServerProxy
from src.engines.engine_manager import EngineManager
from src.render_queue import RenderQueue
from src.utilities.misc_helper import get_file_size_human
from src.utilities.status_utils import RenderStatus, string_to_status
@@ -86,6 +88,68 @@ class DistributedJobManager:
        except Exception as e:
            logger.debug(f"Unable to show UI notification: {e}")

    # --------------------------------------------
    # Create Job
    # --------------------------------------------

    @classmethod
    def create_render_job(cls, job_data, loaded_project_local_path):
        """
        Creates a render job.

        This method takes a dict of job data and the local path to the loaded project. It creates and
        configures a render worker, optionally splits it into subjobs, and adds it to the render queue.

        Args:
            job_data (dict): Job data.
            loaded_project_local_path (str): The local path to the loaded project.

        Returns:
            worker: The created job worker.
        """
        # get new output path in output_dir
        output_path = job_data.get('output_path')
        if not output_path:
            loaded_project_filename = os.path.basename(loaded_project_local_path)
            output_filename = os.path.splitext(loaded_project_filename)[0]
        else:
            output_filename = os.path.basename(output_path)

        # Prepare output path
        output_dir = os.path.join(os.path.dirname(os.path.dirname(loaded_project_local_path)), 'output')
        output_path = os.path.join(output_dir, output_filename)
        os.makedirs(output_dir, exist_ok=True)
        logger.debug(f"New job output path: {output_path}")

        # create & configure jobs
        worker = EngineManager.create_worker(renderer=job_data['renderer'],
                                             input_path=loaded_project_local_path,
                                             output_path=output_path,
                                             engine_version=job_data.get('engine_version'),
                                             args=job_data.get('args', {}))
        worker.status = job_data.get("initial_status", worker.status)  # todo: is this necessary?
        worker.parent = job_data.get("parent", worker.parent)
        worker.name = job_data.get("name", worker.name)
        worker.priority = int(job_data.get('priority', worker.priority))
        worker.start_frame = int(job_data.get("start_frame", worker.start_frame))
        worker.end_frame = int(job_data.get("end_frame", worker.end_frame))
        worker.hostname = socket.gethostname()

        # determine if we can / should split the job
        if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
            cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path)
        else:
            logger.debug("Not splitting into subjobs")

        RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))

        return worker
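    # A minimal sketch of the job_data dict this method consumes. The keys mirror the
    # lookups above; every value below is illustrative, not taken from the repository:
    #
    #   job_data = {
    #       'renderer': 'blender',
    #       'engine_version': '4.1',             # optional
    #       'start_frame': 1, 'end_frame': 100,
    #       'priority': 50,
    #       'enable_split_jobs': True,           # split across servers when possible
    #       'force_start': False,
    #   }
    #   worker = DistributedJobManager.create_render_job(job_data, '/projects/demo/demo.blend')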
    # --------------------------------------------
    # Handling Subjobs
    # --------------------------------------------

    @classmethod
    def handle_subjob_status_change(cls, local_job, subjob_data):
        """
@@ -142,7 +206,7 @@ class DistributedJobManager:
            RenderServerProxy(subjob_hostname).get_job_files(subjob_id, zip_file_path)
            logger.info(f"File transfer complete for {logname} - Transferred {get_file_size_human(zip_file_path)}")
        except Exception as e:
            logger.exception(f"Exception downloading files from remote server: {e}")
            logger.error(f"Error downloading files from remote server: {e}")
            local_job.children[child_key]['download_status'] = 'failed'
            return False
@@ -218,8 +282,20 @@ class DistributedJobManager:
                        f"{', '.join(list(subjobs_not_downloaded().keys()))}")
            time.sleep(5)

    # --------------------------------------------
    # Creating Subjobs
    # --------------------------------------------

    @classmethod
    def split_into_subjobs(cls, worker, job_data, project_path, system_os=None):
    def split_into_subjobs_async(cls, parent_worker, job_data, project_path, system_os=None):
        # todo: I don't love this
        parent_worker.status = RenderStatus.CONFIGURING
        cls.background_worker = threading.Thread(target=cls.split_into_subjobs, args=(parent_worker, job_data,
                                                                                      project_path, system_os))
        cls.background_worker.start()

    @classmethod
    def split_into_subjobs(cls, parent_worker, job_data, project_path, system_os=None, specific_servers=None):
        """
"""
|
||||
Splits a job into subjobs and distributes them among available servers.
|
||||
|
||||
@@ -228,56 +304,50 @@ class DistributedJobManager:
        subjob.

        Args:
            worker (Worker): The worker that is handling the job.
            parent_worker (Worker): The worker that is handling the job.
            job_data (dict): The data for the job to be split.
            project_path (str): The path to the project associated with the job.
            system_os (str, optional): The operating system of the servers. Defaults to None.
            system_os (str, optional): The operating system of the servers. Default is any OS.
            specific_servers (list, optional): List of specific servers to split work between. Defaults to all found.
        """

        # Check availability
        available_servers = cls.find_available_servers(worker.renderer, system_os)
        parent_worker.status = RenderStatus.CONFIGURING
        available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer, system_os)
        logger.debug(f"Splitting into subjobs - Available servers: {available_servers}")
        subjob_servers = cls.distribute_server_work(worker.start_frame, worker.end_frame, available_servers)
        local_hostname = socket.gethostname()
        subjob_servers = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)

        # Prep and submit these sub-jobs
        logger.info(f"Job {worker.id} split plan: {subjob_servers}")
        logger.info(f"Job {parent_worker.id} split plan: {subjob_servers}")
        try:
            for server_data in subjob_servers:
                server_hostname = server_data['hostname']
                if server_hostname != local_hostname:
                    post_results = cls.__create_subjob(job_data, local_hostname, project_path, server_data,
                                                       server_hostname, worker)
                    if post_results.ok:
                        server_data['submission_results'] = post_results.json()[0]
                    else:
                        logger.error(f"Failed to create subjob on {server_hostname}")
                        break
            for subjob_data in subjob_servers:
                subjob_hostname = subjob_data['hostname']
                if subjob_hostname != parent_worker.hostname:
                    post_results = cls.__create_subjob(job_data, parent_worker.hostname, project_path, subjob_data,
                                                       subjob_hostname, parent_worker)
                    if not post_results.ok:
ValueError(f"Failed to create subjob on {subjob_hostname}")
|
||||
                    # save child info
                    submission_results = post_results.json()[0]
                    child_key = f"{submission_results['id']}@{subjob_hostname}"
                    parent_worker.children[child_key] = submission_results
                else:
                    # truncate parent render_job
                    worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
                    worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
                    logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
                    server_data['submission_results'] = worker.json()
            # check that job posts were all successful.
            if not all(d.get('submission_results') is not None for d in subjob_servers):
                raise ValueError("Failed to create all subjobs")  # look into recalculating job #s and use existing jobs
                    parent_worker.start_frame = max(subjob_data['frame_range'][0], parent_worker.start_frame)
                    parent_worker.end_frame = min(subjob_data['frame_range'][-1], parent_worker.end_frame)
                    logger.info(f"Local job now rendering from {parent_worker.start_frame} to {parent_worker.end_frame}")

            # start subjobs
            logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
            for server_data in subjob_servers:
                if server_data['hostname'] != local_hostname:
                    child_key = f"{server_data['submission_results']['id']}@{server_data['hostname']}"
                    worker.children[child_key] = server_data['submission_results']
            worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"

            logger.debug(f"Created {len(subjob_servers) - 1} subjobs successfully")
            parent_worker.name = f"{parent_worker.name}[{parent_worker.start_frame}-{parent_worker.end_frame}]"
            parent_worker.status = RenderStatus.NOT_STARTED  # todo: this won't work with scheduled starts
        except Exception as e:
            # cancel all the subjobs
            logger.error(f"Failed to split job into subjobs: {e}")
            logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
            # [RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in
            #  submission_results.items()]  # todo: fix this
            RenderServerProxy(parent_worker.hostname).cancel_job(parent_worker.id, confirm=True)
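    # Illustrative note (values are made up, not from the repository): after a successful
    # split of frames 1-100 across one remote server plus the local machine, the parent
    # worker would end up with something like
    #     parent_worker.children == {"17@render-01": {...remote submission JSON...}}
    #     parent_worker.start_frame, parent_worker.end_frame == 61, 100
    # i.e. each child is keyed "<id>@<hostname>" and the parent keeps only its local slice.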
    @staticmethod
    def __create_subjob(job_data, local_hostname, project_path, server_data, server_hostname, worker):
@@ -292,6 +362,10 @@ class DistributedJobManager:
                                                       file_path=project_path, job_list=[subjob])
        return post_results

    # --------------------------------------------
    # Server Handling
    # --------------------------------------------

    @staticmethod
    def distribute_server_work(start_frame, end_frame, available_servers, method='cpu_benchmark'):
        """
@@ -440,7 +514,7 @@ if __name__ == '__main__':
    print("Starting Zeroconf...")
    time.sleep(2)
    available_servers = DistributedJobManager.find_available_servers('blender')
    print(f"AVAILABLE SERVERS: {available_servers}")
    print(f"AVAILABLE SERVERS ({len(available_servers)}): {available_servers}")
    results = DistributedJobManager.distribute_server_work(1, 100, available_servers)
    print(f"RESULTS: {results}")
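    # The split plan printed above is a list of per-server dicts; each entry carries at
    # least the 'hostname' and 'frame_range' keys that split_into_subjobs reads. Hostnames
    # and ranges below are illustrative only:
    #     [{'hostname': 'render-01', 'frame_range': [1, 60]},
    #      {'hostname': 'render-02', 'frame_range': [61, 100]}]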
    ZeroconfServer.stop()