Create subjobs after submission - #54 (#79)

* Force start in render queue only starts NOT_STARTED and SCHEDULED jobs

* Refactor adding jobs / subjobs

* Remove dead code

* Fixed issue with bulk job submission

* Cancel job now cancels all subjobs

* Misc fixes

* JSON now returns job hostname

* Add hostname as optional column in DB

* Misc fixes

* Error handling for removing zip file after download

* Clean up imports

* Fixed issue where worker child information would not be saved
This commit is contained in:
2024-07-30 19:22:38 -05:00
committed by GitHub
parent 6d33f262b3
commit 8a3e74660c
8 changed files with 138 additions and 142 deletions

View File

@@ -1,15 +1,17 @@
import logging
import os
import socket
import threading
import time
import zipfile
from concurrent.futures import ThreadPoolExecutor
import requests
from plyer import notification
from pubsub import pub
from concurrent.futures import ThreadPoolExecutor
from src.api.server_proxy import RenderServerProxy
from src.engines.engine_manager import EngineManager
from src.render_queue import RenderQueue
from src.utilities.misc_helper import get_file_size_human
from src.utilities.status_utils import RenderStatus, string_to_status
@@ -86,6 +88,68 @@ class DistributedJobManager:
except Exception as e:
logger.debug(f"Unable to show UI notification: {e}")
# --------------------------------------------
# Create Job
# --------------------------------------------
@classmethod
def create_render_job(cls, job_data, loaded_project_local_path):
"""
Creates render jobs.
This method takes a list of job data, a local path to a loaded project, and a job directory. It creates a render
job for each job data in the list and appends the result to a list. The list of results is then returned.
Args:
job_data (dict): Job data.
loaded_project_local_path (str): The local path to the loaded project.
Returns:
worker: Created job worker
"""
# get new output path in output_dir
output_path = job_data.get('output_path')
if not output_path:
loaded_project_filename = os.path.basename(loaded_project_local_path)
output_filename = os.path.splitext(loaded_project_filename)[0]
else:
output_filename = os.path.basename(output_path)
# Prepare output path
output_dir = os.path.join(os.path.dirname(os.path.dirname(loaded_project_local_path)), 'output')
output_path = os.path.join(output_dir, output_filename)
os.makedirs(output_dir, exist_ok=True)
logger.debug(f"New job output path: {output_path}")
# create & configure jobs
worker = EngineManager.create_worker(renderer=job_data['renderer'],
input_path=loaded_project_local_path,
output_path=output_path,
engine_version=job_data.get('engine_version'),
args=job_data.get('args', {}))
worker.status = job_data.get("initial_status", worker.status) # todo: is this necessary?
worker.parent = job_data.get("parent", worker.parent)
worker.name = job_data.get("name", worker.name)
worker.priority = int(job_data.get('priority', worker.priority))
worker.start_frame = int(job_data.get("start_frame", worker.start_frame))
worker.end_frame = int(job_data.get("end_frame", worker.end_frame))
worker.hostname = socket.gethostname()
# determine if we can / should split the job
if job_data.get("enable_split_jobs", False) and (worker.total_frames > 1) and not worker.parent:
cls.split_into_subjobs_async(worker, job_data, loaded_project_local_path)
else:
logger.debug("Not splitting into subjobs")
RenderQueue.add_to_render_queue(worker, force_start=job_data.get('force_start', False))
return worker
# --------------------------------------------
# Handling Subjobs
# --------------------------------------------
@classmethod
def handle_subjob_status_change(cls, local_job, subjob_data):
"""
@@ -142,7 +206,7 @@ class DistributedJobManager:
RenderServerProxy(subjob_hostname).get_job_files(subjob_id, zip_file_path)
logger.info(f"File transfer complete for {logname} - Transferred {get_file_size_human(zip_file_path)}")
except Exception as e:
logger.exception(f"Exception downloading files from remote server: {e}")
logger.error(f"Error downloading files from remote server: {e}")
local_job.children[child_key]['download_status'] = 'failed'
return False
@@ -218,8 +282,20 @@ class DistributedJobManager:
f"{', '.join(list(subjobs_not_downloaded().keys()))}")
time.sleep(5)
# --------------------------------------------
# Creating Subjobs
# --------------------------------------------
@classmethod
def split_into_subjobs(cls, worker, job_data, project_path, system_os=None):
def split_into_subjobs_async(cls, parent_worker, job_data, project_path, system_os=None):
# todo: I don't love this
parent_worker.status = RenderStatus.CONFIGURING
cls.background_worker = threading.Thread(target=cls.split_into_subjobs, args=(parent_worker, job_data,
project_path, system_os))
cls.background_worker.start()
@classmethod
def split_into_subjobs(cls, parent_worker, job_data, project_path, system_os=None, specific_servers=None):
"""
Splits a job into subjobs and distributes them among available servers.
@@ -228,56 +304,50 @@ class DistributedJobManager:
subjob.
Args:
worker (Worker): The worker that is handling the job.
parent_worker (Worker): The worker that is handling the job.
job_data (dict): The data for the job to be split.
project_path (str): The path to the project associated with the job.
system_os (str, optional): The operating system of the servers. Defaults to None.
system_os (str, optional): The operating system of the servers. Default is any OS.
specific_servers (list, optional): List of specific servers to split work between. Defaults to all found.
"""
# Check availability
available_servers = cls.find_available_servers(worker.renderer, system_os)
parent_worker.status = RenderStatus.CONFIGURING
available_servers = specific_servers if specific_servers else cls.find_available_servers(parent_worker.renderer, system_os)
logger.debug(f"Splitting into subjobs - Available servers: {available_servers}")
subjob_servers = cls.distribute_server_work(worker.start_frame, worker.end_frame, available_servers)
local_hostname = socket.gethostname()
subjob_servers = cls.distribute_server_work(parent_worker.start_frame, parent_worker.end_frame, available_servers)
# Prep and submit these sub-jobs
logger.info(f"Job {worker.id} split plan: {subjob_servers}")
logger.info(f"Job {parent_worker.id} split plan: {subjob_servers}")
try:
for server_data in subjob_servers:
server_hostname = server_data['hostname']
if server_hostname != local_hostname:
post_results = cls.__create_subjob(job_data, local_hostname, project_path, server_data,
server_hostname, worker)
if post_results.ok:
server_data['submission_results'] = post_results.json()[0]
else:
logger.error(f"Failed to create subjob on {server_hostname}")
break
for subjob_data in subjob_servers:
subjob_hostname = subjob_data['hostname']
if subjob_hostname != parent_worker.hostname:
post_results = cls.__create_subjob(job_data, parent_worker.hostname, project_path, subjob_data,
subjob_hostname, parent_worker)
if not post_results.ok:
ValueError(f"Failed to create subjob on {subjob_hostname}")
# save child info
submission_results = post_results.json()[0]
child_key = f"{submission_results['id']}@{subjob_hostname}"
parent_worker.children[child_key] = submission_results
else:
# truncate parent render_job
worker.start_frame = max(server_data['frame_range'][0], worker.start_frame)
worker.end_frame = min(server_data['frame_range'][-1], worker.end_frame)
logger.info(f"Local job now rendering from {worker.start_frame} to {worker.end_frame}")
server_data['submission_results'] = worker.json()
# check that job posts were all successful.
if not all(d.get('submission_results') is not None for d in subjob_servers):
raise ValueError("Failed to create all subjobs") # look into recalculating job #s and use exising jobs
parent_worker.start_frame = max(subjob_data['frame_range'][0], parent_worker.start_frame)
parent_worker.end_frame = min(subjob_data['frame_range'][-1], parent_worker.end_frame)
logger.info(f"Local job now rendering from {parent_worker.start_frame} to {parent_worker.end_frame}")
# start subjobs
logger.debug(f"Starting {len(subjob_servers) - 1} attempted subjobs")
for server_data in subjob_servers:
if server_data['hostname'] != local_hostname:
child_key = f"{server_data['submission_results']['id']}@{server_data['hostname']}"
worker.children[child_key] = server_data['submission_results']
worker.name = f"{worker.name}[{worker.start_frame}-{worker.end_frame}]"
logger.debug(f"Created {len(subjob_servers) - 1} subjobs successfully")
parent_worker.name = f"{parent_worker.name}[{parent_worker.start_frame}-{parent_worker.end_frame}]"
parent_worker.status = RenderStatus.NOT_STARTED # todo: this won't work with scheduled starts
except Exception as e:
# cancel all the subjobs
logger.error(f"Failed to split job into subjobs: {e}")
logger.debug(f"Cancelling {len(subjob_servers) - 1} attempted subjobs")
# [RenderServerProxy(hostname).cancel_job(results['id'], confirm=True) for hostname, results in
# submission_results.items()] # todo: fix this
RenderServerProxy(parent_worker.hostname).cancel_job(parent_worker.id, confirm=True)
@staticmethod
def __create_subjob(job_data, local_hostname, project_path, server_data, server_hostname, worker):
@@ -292,6 +362,10 @@ class DistributedJobManager:
file_path=project_path, job_list=[subjob])
return post_results
# --------------------------------------------
# Server Handling
# --------------------------------------------
@staticmethod
def distribute_server_work(start_frame, end_frame, available_servers, method='cpu_benchmark'):
"""
@@ -440,7 +514,7 @@ if __name__ == '__main__':
print("Starting Zeroconf...")
time.sleep(2)
available_servers = DistributedJobManager.find_available_servers('blender')
print(f"AVAILABLE SERVERS: {available_servers}")
print(f"AVAILABLE SERVERS ({len(available_servers)}): {available_servers}")
results = DistributedJobManager.distribute_server_work(1, 100, available_servers)
print(f"RESULTS: {results}")
ZeroconfServer.stop()