Managing Jobs and Frames
This tutorial covers advanced job and frame management techniques in OpenCue, including priority management, dependency handling, resource optimization, and troubleshooting strategies for production environments.
What You’ll Learn
- Advanced job priority and resource management
- Frame dependency strategies
- Troubleshooting failed frames
- Job optimization techniques
- Batch operations and automation
- Production workflow best practices
Prerequisites
- Completed previous tutorials (Getting Started, Job Submission, CueGUI)
- Understanding of OpenCue job structure
- Access to an OpenCue environment with multiple jobs
Job Priority Management
Understanding Priority Systems
OpenCue uses numerical priorities where higher numbers get precedence:
Priority Ranges:
├── 0-49 : Low priority (background jobs)
├── 50-99 : Normal priority (default)
├── 100-149 : High priority (urgent work)
└── 150+ : Critical priority (emergency fixes)
Setting Job Priorities
During Job Submission
# PyOutline script with priority
import outline
# Describe the job: its name plus the show, shot and user it is filed under.
job = outline.Outline(
name="urgent-render-v001",
shot="shot010",
show="demo-project",
user="artist01"
)
# Set high priority
# NOTE(review): confirm that your PyOutline version exposes Outline.set_priority().
job.set_priority(125)
After Job Submission
# Using CueAdmin
# NOTE(review): verify the exact flag spelling with `cueadmin -h` for your release.
cueadmin -setpriority job-name 150
# Using CueGUI
# Right-click job → Properties → Set Priority
Dynamic Priority Management
Automatic Priority Adjustment
# Script to adjust priorities based on deadlines
import opencue
import time
def manage_priorities(max_wait_seconds=3600, priority_step=10, priority_cap=200):
    """Raise the priority of jobs that have been pending for too long.

    Args:
        max_wait_seconds: A pending job older than this many seconds gets a
            priority bump (default: one hour).
        priority_step: Amount added to the job's current priority.
        priority_cap: Upper bound for a bumped priority. Jobs already at or
            above the cap are left untouched, so a job is never lowered.
    """
    now = time.time()
    for job in opencue.api.getJobs():
        if job.state() != opencue.compiled_proto.job_pb2.PENDING:
            continue
        # NOTE(review): startTime() is assumed to return the epoch second at
        # which the job was launched - confirm against your pycue version.
        wait_time = now - job.startTime()
        if wait_time <= max_wait_seconds:
            continue
        current_priority = job.priority()
        new_priority = min(current_priority + priority_step, priority_cap)
        if new_priority <= current_priority:
            # Already at or above the cap; bumping would lower the priority.
            continue
        job.setPriority(new_priority)
        print(f"Increased priority for {job.name()} to {new_priority}")
# Run periodically
# A single call performs one pass over the jobs; schedule this script
# (for example from cron) to get periodic adjustment.
manage_priorities()
Frame Dependency Management
Layer Dependencies
Basic Dependencies
import outline
import outline.modules.shell
# The job that will hold the three layers defined below.
job = outline.Outline("dependency-demo", shot="test", show="demo", user="student")
# Create layers
# All three layers cover frames 1-10.
# NOTE(review): #IFRAME# is presumably replaced by the frame number at run time - confirm.
preprocess = outline.modules.shell.Shell("preprocess", command=["python", "prep.py"], range="1-10")
render = outline.modules.shell.Shell("render", command=["blender", "-f", "#IFRAME#"], range="1-10")
composite = outline.modules.shell.Shell("composite", command=["nuke", "-f", "#IFRAME#"], range="1-10")
# Set up dependencies
# The resulting chain is preprocess -> render -> composite.
render.depend_on(preprocess) # Render waits for preprocess
composite.depend_on(render) # Composite waits for render
# Register the layers with the job.
job.add_layer(preprocess)
job.add_layer(render)
job.add_layer(composite)
Frame-by-Frame Dependencies
# The two calls below are alternatives; use the one that matches your pipeline.
# PyOutline dependency types live in outline.depend.DependType.
# NOTE(review): confirm the member names against your PyOutline version.
# Each composite frame waits for corresponding render frame
composite.depend_on(render, outline.depend.DependType.FrameByFrame)
# Composite waits for ALL render frames to complete
composite.depend_on(render, outline.depend.DependType.LayerOnLayer)
External Job Dependencies
# Current job waits for another job to complete
# NOTE(review): confirm depend_on_job() is available on both the job and
# layer objects in your PyOutline version.
job.depend_on_job("previous-job-name")
# Layer waits for layer in another job
# `current_layer` is not defined in this snippet; it stands for a layer
# object belonging to the job being built.
current_layer.depend_on_job("other-job-name", "other-layer-name")
Managing Dependencies in Production
Dependency Monitoring
# Check dependency status
import opencue
def check_dependencies(job_name):
    """Print, for each layer of a job, the job/layer pairs that wait on it.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
    """
    job = opencue.api.findJob(job_name)
    for layer in job.getLayers():
        deps = layer.getWhatDependsOnThis()
        if not deps:
            continue
        print(f"Layer {layer.name()} is blocking:")
        for dep in deps:
            # NOTE(review): confirm the accessor names on the depend wrapper
            # (dependentJob / dependentLayer) against your pycue version.
            print(f" - {dep.dependentJob().name()}/{dep.dependentLayer().name()}")
Breaking Dependencies
# Remove dependency via CueAdmin
# NOTE(review): verify both flags below with `cueadmin -h` for your release.
cueadmin -satisfy-dependency job-name layer-name
# In emergency situations
# Dependent work may start before its inputs exist, so use with care.
cueadmin -kill-dependency job-name layer-name
Frame Troubleshooting Strategies
Identifying Problem Patterns
Frame Failure Analysis
# Analyze frame failure patterns
import opencue
def analyze_failures(job_name):
    """Report dead frames of a job, grouped by the resource that last ran them.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().

    Returns:
        A list with one dict per dead frame, holding the keys 'frame',
        'layer', 'host' and 'exit_code'.
    """
    job = opencue.api.findJob(job_name)
    dead_state = opencue.compiled_proto.job_pb2.DEAD
    failed_frames = []
    for layer in job.getLayers():
        for frame in layer.getFrames():
            if frame.state() != dead_state:
                continue
            failed_frames.append({
                'frame': frame.number(),
                'layer': layer.name(),
                # NOTE(review): lastResource() may include more than the bare
                # host name (e.g. a core-count suffix) - confirm before
                # relying on it as a pure host key.
                'host': frame.lastResource(),
                'exit_code': frame.exitStatus(),
            })

    # Group by host to identify problem machines
    by_host = {}
    for failure in failed_frames:
        by_host.setdefault(failure['host'], []).append(failure)

    print("Failures by host:")
    for host, frames in by_host.items():
        print(f" {host}: {len(frames)} failures")
    return failed_frames
Common Failure Scenarios
Memory Issues
# Detect memory-related failures
def check_memory_issues(job_name):
    """Print dead frames whose exit status suggests an out-of-memory kill.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
    """
    job = opencue.api.findJob(job_name)
    dead_state = opencue.compiled_proto.job_pb2.DEAD
    for layer in job.getLayers():
        for frame in layer.getFrames():
            if frame.state() != dead_state:
                continue
            if frame.exitStatus() != 9:  # SIGKILL often means OOM
                continue
            print(f"Possible memory issue: Frame {frame.number()}")
            # NOTE(review): the "MB" labels assume these accessors report
            # megabytes - confirm the unit for your pycue version.
            print(f" Memory used: {frame.usedMemory()} MB")
            print(f" Memory reserved: {frame.reservedMemory()} MB")
File System Issues
# Check for common file system problems
# A full disk is usually reported as "No space left on device" (ENOSPC),
# so match that message as well as the literal "disk full".
grep -i "permission denied\|no such file\|disk full\|no space left on device" /path/to/frame/logs/*
Network Problems
# Identify network-related failures
def check_network_issues(job_name):
    """Placeholder for network-failure diagnostics; currently does nothing.

    Args:
        job_name: Name of the job to inspect (unused until implemented).
    """
    # Ideas for an implementation:
    # Look for frames that fail on specific hosts
    # Check for timeout errors in logs
    # Monitor for connection refused errors
    pass
Frame Recovery Strategies
Automatic Retry Logic
# Custom retry strategy
import opencue
import time
def smart_retry(job_name, max_retries=3):
    """Retry dead frames of a job, raising layer memory after likely OOM kills.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
        max_retries: Frames whose retry count has reached this value are
            left alone.
    """
    job = opencue.api.findJob(job_name)
    dead_state = opencue.compiled_proto.job_pb2.DEAD
    for layer in job.getLayers():
        retryable = [f for f in layer.getFrames()
                     if f.state() == dead_state and f.retryCount() < max_retries]
        if not retryable:
            continue

        # Retry with different resource requirements.
        # Raise the layer's memory once per pass. Doing it once per dead
        # frame would multiply the requirement by 1.5 for every such frame.
        if any(f.exitStatus() == 9 for f in retryable):  # Memory issue
            # int(): the scaled value is a float, the setting is a whole number.
            layer.setMinMemory(int(layer.minimumMemory() * 1.5))

        for frame in retryable:
            frame.retry()
            print(f"Retrying frame {frame.number()}")
            time.sleep(1)  # Rate limiting
Selective Frame Management
# NOTE(review): verify these flag names with `cueadmin -h` for your release.
# Retry specific frame ranges
cueadmin -retry-frames job-name layer-name 100-120
# Skip problematic frames
cueadmin -eat-frames job-name layer-name 115,118,119
# Kill and retry with different settings
cueadmin -kill-frames job-name layer-name 100-200
# Modify job resources then retry
# (the retry itself is not shown; re-issue the -retry-frames command above)
Resource Optimization
Dynamic Resource Management
CPU Allocation Strategies
# Adjust core allocation based on job type
import outline
def optimize_cores(layer, job_type):
    """Apply minimum and maximum core counts to a layer based on its job type.

    Args:
        layer: Object exposing set_min_cores() and set_max_cores().
        job_type: One of "render", "simulation" or "composite". Any other
            value leaves the layer untouched.
    """
    # job type -> (min cores, max cores)
    core_limits = {
        "render": (4, 16),      # Good performance without monopolizing hosts
        "simulation": (8, 32),  # CPU intensive, can use more cores effectively
        "composite": (1, 4),    # Usually single-threaded, limited benefit
    }
    limits = core_limits.get(job_type)
    if limits is None:
        return
    min_cores, max_cores = limits
    layer.set_min_cores(min_cores)
    layer.set_max_cores(max_cores)
Memory Management
# Progressive memory allocation
def set_memory_requirements(layer, frame_complexity):
    """Set a layer's minimum memory as a multiple of a fixed base amount.

    Args:
        layer: Object exposing set_min_memory() and set_max_cores().
        frame_complexity: "simple", "complex" or "heavy". Any other value
            leaves the layer untouched.
    """
    base_memory = 2048  # 2GB base
    multipliers = {"simple": 1, "complex": 2, "heavy": 4}
    multiplier = multipliers.get(frame_complexity)
    if multiplier is None:
        return
    layer.set_min_memory(base_memory * multiplier)
    if frame_complexity == "heavy":
        layer.set_max_cores(8)  # Limit cores to save memory for other jobs
Service Tag Management
Service-Based Allocation
# Target specific software versions
# The four calls below are alternatives shown side by side.
# NOTE(review): presumably each call replaces the layer's previous service,
# so only the last one would take effect if all were run - confirm.
layer.set_service("maya2024") # Specific Maya version
layer.set_service("gpu") # GPU-enabled hosts
layer.set_service("highcpu") # High CPU count hosts
layer.set_service("highmem") # High memory hosts
Custom Service Tags
# Create custom host allocations
# NOTE(review): the positional arguments are presumably
# <facility> <allocation-name> <tag> - confirm with `cueadmin -h`.
cueadmin -create-alloc facility workstation-pool workstation
# Attach tags to the allocation created above.
cueadmin -tag-alloc workstation-pool "maya2024,highcpu"
Batch Operations and Automation
Bulk Job Management
Mass Job Operations
# Kill all jobs from a specific user
import opencue
def kill_user_jobs(username):
    """Kill every pending or running job that belongs to one user.

    Args:
        username: User whose jobs are fetched with opencue.api.getJobs().
    """
    # NOTE(review): confirm that RUNNING is a job state in job_pb2 (it may
    # only be defined for frames); the list is kept exactly as written.
    active_states = [opencue.compiled_proto.job_pb2.PENDING,
                     opencue.compiled_proto.job_pb2.RUNNING]
    for job in opencue.api.getJobs(user=[username]):
        if job.state() not in active_states:
            continue
        job.kill()
        print(f"Killed job: {job.name()}")
Batch Frame Operations
# Retry all failed frames across multiple jobs
def retry_all_failures(show_name):
    """Retry every dead frame in every job of a show.

    Args:
        show_name: Show whose jobs are fetched with opencue.api.getJobs().
    """
    dead_state = opencue.compiled_proto.job_pb2.DEAD
    for job in opencue.api.getJobs(show=[show_name]):
        for layer in job.getLayers():
            for frame in layer.getFrames():
                if frame.state() != dead_state:
                    continue
                frame.retry()
                print(f"Retrying {job.name()}/{layer.name()}/frame{frame.number()}")
Automated Monitoring Scripts
Job Health Monitor
#!/usr/bin/env python3
# Production monitoring script
import opencue
import time
import smtplib
from email.mime.text import MIMEText
def monitor_job_health(pending_limit_seconds=1800, failure_ratio=0.5):
    """Monitor for stuck or problematic jobs and alert when any are found.

    Args:
        pending_limit_seconds: A pending job older than this many seconds is
            reported as stuck (default: 30 minutes).
        failure_ratio: A job is reported when more than this fraction of its
            frames are dead.

    Returns:
        The list of issue strings that was passed to send_alert(); empty if
        nothing was found.
    """
    pending_state = opencue.compiled_proto.job_pb2.PENDING
    dead_state = opencue.compiled_proto.job_pb2.DEAD
    now = time.time()
    issues = []
    for job in opencue.api.getJobs():
        # Check for jobs stuck in pending too long
        if job.state() == pending_state:
            # NOTE(review): startTime() is assumed to return the epoch second
            # at which the job was launched - confirm for your pycue version.
            wait_time = now - job.startTime()
            if wait_time > pending_limit_seconds:
                issues.append(f"Job {job.name()} stuck pending for {wait_time/60:.1f} minutes")

        # Check for high failure rate
        # Fetch the frames once so both counts come from the same snapshot.
        frames = job.getFrames()
        total_frames = len(frames)
        failed_frames = sum(1 for f in frames if f.state() == dead_state)
        if total_frames > 0 and failed_frames / total_frames > failure_ratio:
            issues.append(f"Job {job.name()} has {failed_frames}/{total_frames} failed frames")

    if issues:
        send_alert("\n".join(issues))
    return issues
def send_alert(message):
    """Send email alert for issues (placeholder; currently does nothing).

    Args:
        message: Text of the alert.
    """
    # Implementation depends on your email setup
    pass
# Run one monitoring pass when executed as a script.
if __name__ == "__main__":
    monitor_job_health()
Production Workflow Best Practices
Job Lifecycle Management
Pre-Production Planning
# Job template system
class JobTemplate:
    """Default submission settings for a known job type.

    After construction the instance carries the attributes priority,
    min_cores, max_cores, min_memory and service for its job type.
    """

    # Per-job-type defaults, copied onto the instance by configure_defaults().
    _DEFAULTS = {
        "animation_render": {
            "priority": 100,
            "min_cores": 4,
            "max_cores": 16,
            "min_memory": 4096,
            "service": "maya2024",
        },
        "fx_simulation": {
            "priority": 120,  # Higher priority
            "min_cores": 16,
            "max_cores": 32,
            "min_memory": 16384,
            "service": "houdini",
        },
    }

    def __init__(self, job_type):
        """Store the job type and load its defaults.

        Raises:
            ValueError: If job_type is not a known template.
        """
        self.job_type = job_type
        self.configure_defaults()

    def configure_defaults(self):
        """Copy the defaults for self.job_type onto this instance.

        Raises:
            ValueError: If self.job_type has no defaults. Failing here is
                clearer than an AttributeError later in create_job().
        """
        try:
            defaults = self._DEFAULTS[self.job_type]
        except KeyError:
            known = ", ".join(sorted(self._DEFAULTS))
            raise ValueError(
                f"Unknown job type {self.job_type!r}; expected one of: {known}"
            ) from None
        for attribute, value in defaults.items():
            setattr(self, attribute, value)

    def create_job(self, name, shot, show, user):
        """Create an outline job and apply this template's priority.

        Only the priority is applied here; the core, memory and service
        settings are stored on the template but not used by this method.
        """
        import outline  # local import so the snippet works on its own

        job = outline.Outline(name, shot=shot, show=show, user=user)
        job.set_priority(self.priority)
        return job
Production Phases
# Different priorities for different production phases
# Job priority to apply for each production phase.
PHASE_PRIORITIES = {
    "previs": 50,
    "animation": 75,
    "lighting": 100,
    "fx": 125,
    "final_comp": 150,
}


def set_phase_priority(job_name, phase):
    """Set a job's priority from its production phase.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
        phase: Key into PHASE_PRIORITIES. An unknown phase falls back to 50.
    """
    job = opencue.api.findJob(job_name)
    job.setPriority(PHASE_PRIORITIES.get(phase, 50))
Team Collaboration
Job Ownership and Handoffs
# Transfer job ownership
def transfer_job(job_name, new_owner):
    """Hand a job over to another user and leave a tracking comment on it.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
        new_owner: User name of the new owner.
    """
    job = opencue.api.findJob(job_name)
    # NOTE(review): confirm that Job.setOwner() exists in your pycue version.
    job.setOwner(new_owner)
    # Add comment for tracking. pycue's Job.addComment takes a subject and a
    # message, so a one-argument call would fail.
    job.addComment("Ownership transfer", f"Job transferred to {new_owner}")
Status Communication
# Automated status updates
def update_job_status(job_name, status_message):
    """Record a status message as a job comment and notify the team.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().
        status_message: Free-form status text.
    """
    job = opencue.api.findJob(job_name)
    # pycue's Job.addComment takes a subject and a message.
    job.addComment("Status update", f"Status: {status_message}")
    # Could integrate with Slack, email, or other systems
    # NOTE(review): notify_team() is not defined in this snippet; provide
    # your own implementation before running it.
    notify_team(job.name(), status_message)
Performance Monitoring
Resource Utilization Tracking
# Track resource efficiency
def analyze_resource_usage(job_name):
    """Print and return core-hours and memory-hours used by succeeded frames.

    Args:
        job_name: Name of the job to look up with opencue.api.findJob().

    Returns:
        A (total_core_hours, total_memory_hours) tuple.
    """
    job = opencue.api.findJob(job_name)
    succeeded_state = opencue.compiled_proto.job_pb2.SUCCEEDED
    total_core_hours = 0
    total_memory_hours = 0
    for layer in job.getLayers():
        for frame in layer.getFrames():
            if frame.state() != succeeded_state:
                continue
            # Dividing by 3600 treats runTime() as seconds.
            runtime_hours = frame.runTime() / 3600.0
            total_core_hours += frame.usedCores() * runtime_hours
            # NOTE(review): the "MB·h" label below assumes usedMemory()
            # reports megabytes - confirm the unit.
            total_memory_hours += frame.usedMemory() * runtime_hours

    print(f"Job {job.name()} used:")
    print(f" Core hours: {total_core_hours:.2f}")
    print(f" Memory hours: {total_memory_hours:.2f} MB·h")
    return total_core_hours, total_memory_hours
Next Steps
You’ve mastered advanced job and frame management:
- Priority management and resource optimization
- Dependency handling and troubleshooting
- Frame failure analysis and recovery
- Batch operations and automation
- Production workflow best practices
Continue your OpenCue journey:
- Creating Multi-Layer Jobs - Complex pipeline workflows
- DCC Integration Tutorial - Maya, Blender, Nuke integration
- Check out the Reference documentation for detailed API information
Troubleshooting Reference
Quick Diagnostic Commands
# Job status overview
cueadmin -lj | head -20
# Find stuck jobs
# Matches any listing line containing PENDING, including job names.
cueadmin -lj | grep PENDING
# Host resource check
# Shows every line that does not contain "Up" (header lines included).
cueadmin -lh | grep -v Up
# Recent failures
cueadmin -ll | grep ERROR | tail -10
# Resource usage
# NOTE(review): assumes column 8 of the `-lp` output is the core count - confirm.
cueadmin -lp | awk '{sum+=$8} END {print "Total cores in use:", sum}'
Emergency Procedures
# Kill all jobs for maintenance
# WARNING: destructive. Every first-column value printed by `cueadmin -lj`
# is passed to `-kill`, with no filtering and no confirmation.
cueadmin -lj | awk '{print $1}' | xargs -I {} cueadmin -kill {}
# Clear all pending jobs
# WARNING: destructive. Kills (does not pause) every job whose listing line
# contains PENDING.
cueadmin -lj | grep PENDING | awk '{print $1}' | xargs -I {} cueadmin -kill {}
# Restart stuck host
cueadmin -safe-reboot hostname