diff --git a/.gitignore b/.gitignore index 15f041bbf..918aacc3a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,5 @@ docs/bin/ sandbox/kafka-data sandbox/zookeeper-data sandbox/zookeeper-logs +sandbox/rqd/shots/ docs/_data/version.yml diff --git a/VERSION.in b/VERSION.in index 63738cc28..d40acaaea 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -1.14 +1.15 diff --git a/cuebot/build.gradle b/cuebot/build.gradle index f3ff7d921..e944402a7 100644 --- a/cuebot/build.gradle +++ b/cuebot/build.gradle @@ -26,6 +26,8 @@ repositories { def grpcVersion = '1.47.0' def protobufVersion = '3.21.2' def activemqVersion = '5.12.0' +def kafkaVersion = '3.4.0' +def elasticsearchVersion = '8.8.0' // Spring dependency versions are managed by the io.spring.dependency-management plugin. // Appropriate versions will be pulled based on the spring boot version specified in the @@ -52,6 +54,15 @@ dependencies { implementation group: 'io.prometheus', name: 'simpleclient', version: '0.16.0' implementation group: 'io.prometheus', name: 'simpleclient_servlet', version: '0.16.0' + // Kafka for event publishing + implementation group: 'org.apache.kafka', name: 'kafka-clients', version: "${kafkaVersion}" + implementation group: 'org.springframework.kafka', name: 'spring-kafka', version: '2.9.0' + + // Elasticsearch for historical data storage + implementation group: 'co.elastic.clients', name: 'elasticsearch-java', version: "${elasticsearchVersion}" + implementation group: 'org.elasticsearch.client', name: 'elasticsearch-rest-client', version: "${elasticsearchVersion}" + implementation group: 'jakarta.json', name: 'jakarta.json-api', version: '2.1.1' + protobuf files("../proto/src/") testImplementation group: 'junit', name: 'junit', version: '4.12' @@ -67,12 +78,14 @@ dependencies { compileJava { dependsOn generateProto - options.compilerArgs << "-Xlint:all" << "-Werror" + // Exclude serial warning due to protobuf-generated code warnings + options.compilerArgs << "-Xlint:all,-serial" << 
"-Werror" } compileTestJava { dependsOn generateProto - options.compilerArgs << "-Xlint:all" << "-Werror" + // Exclude serial warning due to protobuf-generated code warnings + options.compilerArgs << "-Xlint:all,-serial" << "-Werror" } protobuf { diff --git a/cuebot/src/main/java/com/imageworks/spcue/CuebotApplication.java b/cuebot/src/main/java/com/imageworks/spcue/CuebotApplication.java index f20dcdfb5..7f006b992 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/CuebotApplication.java +++ b/cuebot/src/main/java/com/imageworks/spcue/CuebotApplication.java @@ -24,8 +24,9 @@ import org.apache.logging.log4j.LogManager; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.autoconfigure.kafka.KafkaAutoConfiguration; -@SpringBootApplication +@SpringBootApplication(exclude = {KafkaAutoConfiguration.class}) public class CuebotApplication extends SpringApplication { private static String[] checkArgs(String[] args) { Optional deprecatedFlag = Arrays.stream(args) diff --git a/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java b/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java index 1d39e394a..2730f0c9f 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java +++ b/cuebot/src/main/java/com/imageworks/spcue/ExecutionSummary.java @@ -28,6 +28,15 @@ public class ExecutionSummary { public long gpuTimeSuccess; public long gpuTimeFail; public long highMemoryKb; + public int highFrameSec; + + public int getHighFrameSec() { + return highFrameSec; + } + + public void setHighFrameSec(int highFrameSec) { + this.highFrameSec = highFrameSec; + } public long getHighMemoryKb() { return highMemoryKb; diff --git a/cuebot/src/main/java/com/imageworks/spcue/PrometheusMetricsCollector.java b/cuebot/src/main/java/com/imageworks/spcue/PrometheusMetricsCollector.java index 3f12c1352..4c988487d 100644 --- 
a/cuebot/src/main/java/com/imageworks/spcue/PrometheusMetricsCollector.java +++ b/cuebot/src/main/java/com/imageworks/spcue/PrometheusMetricsCollector.java @@ -119,6 +119,38 @@ public class PrometheusMetricsCollector { .labelNames("env", "cuebot_host", "render_node", "job_name", "frame_name", "frame_id") .register(); + private static final Counter frameCompletedCounter = Counter.build() + .name("cue_frames_completed_total").help("Total number of frames completed") + .labelNames("env", "cuebot_host", "state", "show", "shot").register(); + + private static final Counter jobCompletedCounter = + Counter.build().name("cue_jobs_completed_total").help("Total number of jobs completed") + .labelNames("env", "cuebot_host", "state", "show", "shot").register(); + + private static final Histogram jobCoreSecondsHistogram = Histogram.build() + .name("cue_job_core_seconds").help("Histogram of total core seconds per job") + .labelNames("env", "cuebot_host", "show", "shot") + .buckets(3600, 36000, 360000, 3600000, 36000000).register(); + + private static final Histogram layerMaxRuntimeHistogram = + Histogram.build().name("cue_layer_max_runtime_seconds") + .help("Histogram of max frame runtime per layer in seconds") + .labelNames("env", "cuebot_host", "show", "shot", "layer_type") + .buckets(60, 300, 600, 1800, 3600, 7200, 14400, 28800, 86400).register(); + + private static final Histogram layerMaxMemoryHistogram = + Histogram.build().name("cue_layer_max_memory_bytes") + .help("Histogram of max frame memory usage per layer in bytes") + .labelNames("env", "cuebot_host", "show", "shot", "layer_type") + .buckets(256L * 1024 * 1024, 512L * 1024 * 1024, 1024L * 1024 * 1024, + 2048L * 1024 * 1024, 4096L * 1024 * 1024, 8192L * 1024 * 1024, + 16384L * 1024 * 1024, 32768L * 1024 * 1024) + .register(); + + private static final Counter hostReportsReceivedCounter = Counter.build() + .name("cue_host_reports_received_total").help("Total number of host reports received") + .labelNames("env", 
"cuebot_host", "facility").register(); + private String deployment_environment; private String cuebot_host; @@ -269,6 +301,82 @@ public void incrementFrameKillFailureCounter(String hostname, String jobName, St jobName, frameName, frameId).inc(); } + /** + * Record a frame completion + * + * @param state final state of the frame + * @param show show name + * @param shot shot name + */ + public void recordFrameCompleted(String state, String show, String shot) { + frameCompletedCounter + .labels(this.deployment_environment, this.cuebot_host, state, show, shot).inc(); + } + + /** + * Record a job completion + * + * @param state final state of the job + * @param show show name + * @param shot shot name + */ + public void recordJobCompleted(String state, String show, String shot) { + jobCompletedCounter.labels(this.deployment_environment, this.cuebot_host, state, show, shot) + .inc(); + } + + /** + * Record job total core seconds for histogramming + * + * @param coreSeconds total core seconds consumed by the job + * @param show show name + * @param shot shot name + */ + public void recordJobCoreSeconds(double coreSeconds, String show, String shot) { + jobCoreSecondsHistogram.labels(this.deployment_environment, this.cuebot_host, show, shot) + .observe(coreSeconds); + } + + /** + * Record layer max runtime for histogramming + * + * @param runtimeSeconds max runtime in seconds for the layer + * @param show show name + * @param shot shot name + * @param layerType layer type + */ + public void recordLayerMaxRuntime(double runtimeSeconds, String show, String shot, + String layerType) { + layerMaxRuntimeHistogram + .labels(this.deployment_environment, this.cuebot_host, show, shot, layerType) + .observe(runtimeSeconds); + } + + /** + * Record layer max memory usage for histogramming + * + * @param memoryBytes max memory in bytes for the layer + * @param show show name + * @param shot shot name + * @param layerType layer type + */ + public void recordLayerMaxMemory(double 
memoryBytes, String show, String shot, + String layerType) { + layerMaxMemoryHistogram + .labels(this.deployment_environment, this.cuebot_host, show, shot, layerType) + .observe(memoryBytes); + } + + /** + * Record a host report received + * + * @param facility facility name + */ + public void recordHostReport(String facility) { + hostReportsReceivedCounter.labels(this.deployment_environment, this.cuebot_host, facility) + .inc(); + } + // Setters used for dependency injection public void setBookingQueue(BookingQueue bookingQueue) { this.bookingQueue = bookingQueue; diff --git a/cuebot/src/main/java/com/imageworks/spcue/config/AppConfig.java b/cuebot/src/main/java/com/imageworks/spcue/config/AppConfig.java index 7ab5cda91..63a59c0a8 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/config/AppConfig.java +++ b/cuebot/src/main/java/com/imageworks/spcue/config/AppConfig.java @@ -38,7 +38,8 @@ "classpath:conf/spring/applicationContext-grpcServer.xml", "classpath:conf/spring/applicationContext-service.xml", "classpath:conf/spring/applicationContext-jms.xml", - "classpath:conf/spring/applicationContext-criteria.xml"}) + "classpath:conf/spring/applicationContext-criteria.xml", + "classpath:conf/spring/applicationContext-monitoring.xml"}) @EnableConfigurationProperties @PropertySource({"classpath:opencue.properties"}) public class AppConfig { diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/DependDao.java b/cuebot/src/main/java/com/imageworks/spcue/dao/DependDao.java index 2c408479e..00c42f3bc 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/DependDao.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/DependDao.java @@ -194,6 +194,14 @@ public interface DependDao { */ boolean decrementDependCount(FrameInterface f); + /** + * Check if a frame is dispatchable (has depend_count = 0). 
+ * + * @param f the frame to check + * @return true if the frame's depend_count is 0 + */ + boolean isFrameDispatchable(FrameInterface f); + /** * Returns true if this is the thread that set the depend to inactive. * diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DependDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DependDaoJdbc.java index b93d8962a..37f49fc99 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DependDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/DependDaoJdbc.java @@ -215,6 +215,15 @@ public boolean decrementDependCount(FrameInterface f) { return getJdbcTemplate().update(DECREMENT_DEPEND_COUNT, f.getFrameId()) == 1; } + private static final String IS_FRAME_DISPATCHABLE = + "SELECT int_depend_count = 0 FROM frame WHERE pk_frame = ?"; + + @Override + public boolean isFrameDispatchable(FrameInterface f) { + return Boolean.TRUE.equals(getJdbcTemplate().queryForObject(IS_FRAME_DISPATCHABLE, + Boolean.class, f.getFrameId())); + } + private static final String[] DELETE_DEPEND = {"DELETE FROM depend WHERE pk_parent=?", "DELETE FROM depend WHERE pk_depend=?"}; diff --git a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java index d994d9050..d9ef93e2b 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dao/postgres/LayerDaoJdbc.java @@ -409,12 +409,13 @@ public FrameStateTotals mapRow(ResultSet rs, int rowNum) throws SQLException { }, layer.getLayerId()); } - private static final String GET_EXECUTION_SUMMARY = "SELECT " - + "layer_usage.int_core_time_success," + "layer_usage.int_core_time_fail," - + "layer_usage.int_gpu_time_success," + "layer_usage.int_gpu_time_fail," - + "layer_usage.int_clock_time_success," + "layer_mem.int_max_rss " + "FROM " + "layer," - + "layer_usage, " + "layer_mem " + 
"WHERE " + "layer.pk_layer = layer_usage.pk_layer " - + "AND " + "layer.pk_layer = layer_mem.pk_layer " + "AND " + "layer.pk_layer = ?"; + private static final String GET_EXECUTION_SUMMARY = + "SELECT " + "layer_usage.int_core_time_success," + "layer_usage.int_core_time_fail," + + "layer_usage.int_gpu_time_success," + "layer_usage.int_gpu_time_fail," + + "layer_usage.int_clock_time_success," + "layer_usage.int_clock_time_high," + + "layer_mem.int_max_rss " + "FROM " + "layer," + "layer_usage, " + "layer_mem " + + "WHERE " + "layer.pk_layer = layer_usage.pk_layer " + "AND " + + "layer.pk_layer = layer_mem.pk_layer " + "AND " + "layer.pk_layer = ?"; @Override public ExecutionSummary getExecutionSummary(LayerInterface layer) { @@ -429,6 +430,7 @@ public ExecutionSummary mapRow(ResultSet rs, int rowNum) throws SQLException { e.gpuTimeFail = rs.getLong("int_gpu_time_fail"); e.gpuTime = e.gpuTimeSuccess + e.gpuTimeFail; e.highMemoryKb = rs.getLong("int_max_rss"); + e.highFrameSec = rs.getInt("int_clock_time_high"); return e; } }, layer.getLayerId()); diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index af6c54b99..3eb1ad3f2 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -54,7 +54,12 @@ import com.imageworks.spcue.grpc.host.ThreadMode; import com.imageworks.spcue.grpc.job.CheckpointState; import com.imageworks.spcue.grpc.job.FrameState; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.grpc.monitoring.ProcEvent; import com.imageworks.spcue.grpc.rqd.RunFrame; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; import 
com.imageworks.spcue.rqd.RqdClient; import com.imageworks.spcue.service.BookingManager; import com.imageworks.spcue.service.DependManager; @@ -77,6 +82,8 @@ public class DispatchSupportService implements DispatchSupport { private RedirectManager redirectManager; private BookingManager bookingManager; private BookingDao bookingDao; + private KafkaEventPublisher kafkaEventPublisher; + private MonitoringEventBuilder monitoringEventBuilder; private ConcurrentHashMap strandedCores = new ConcurrentHashMap(); @@ -216,9 +223,15 @@ public void runFrame(VirtualProc proc, DispatchFrame frame) { public void startFrameAndProc(VirtualProc proc, DispatchFrame frame) { logger.trace("starting frame: " + frame); + // Capture previous state before update for event publishing + FrameState previousState = frame.state; + frameDao.updateFrameStarted(proc, frame); reserveProc(proc, frame); + + // Publish FRAME_STARTED event (WAITING -> RUNNING transition) + publishFrameStartedEvent(frame, proc, previousState); } @Transactional(propagation = Propagation.REQUIRED, readOnly = true) @@ -460,6 +473,7 @@ private void reserveProc(VirtualProc proc, DispatchFrame frame) { if (proc.isNew()) { logger.info("creating proc " + proc.getName() + " for " + frame.getName()); procDao.insertVirtualProc(proc); + publishProcEvent(EventType.PROC_BOOKED, proc); } else { logger.info("updated proc " + proc.getName() + " for " + frame.getName()); procDao.updateVirtualProcAssignment(proc); @@ -481,6 +495,7 @@ public void unbookProc(VirtualProc proc, String reason) { } proc.unbooked = true; procDao.deleteVirtualProc(proc); + publishProcEvent(EventType.PROC_UNBOOKED, proc); DispatchSupport.unbookedProcs.getAndIncrement(); logger.info(proc + " " + reason); @@ -680,4 +695,42 @@ public void setBookingDao(BookingDao bookingDao) { public void clearCache() { dispatcherDao.clearCache(); } + + public KafkaEventPublisher getKafkaEventPublisher() { + return kafkaEventPublisher; + } + + public void 
setKafkaEventPublisher(KafkaEventPublisher kafkaEventPublisher) { + this.kafkaEventPublisher = kafkaEventPublisher; + } + + public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuilder) { + this.monitoringEventBuilder = monitoringEventBuilder; + } + + /** + * Publishes a proc event to Kafka for monitoring purposes. + */ + private void publishProcEvent(EventType eventType, VirtualProc proc) { + if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) { + return; + } + + ProcEvent event = monitoringEventBuilder.buildProcEvent(eventType, proc); + kafkaEventPublisher.publishProcEvent(event); + } + + /** + * Publishes a frame started event to Kafka for monitoring purposes. This captures the WAITING + * -> RUNNING transition for pickup time analysis. + */ + private void publishFrameStartedEvent(DispatchFrame frame, VirtualProc proc, + FrameState previousState) { + if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) { + return; + } + + FrameEvent event = monitoringEventBuilder.buildFrameStartedEvent(frame, proc); + kafkaEventPublisher.publishFrameEvent(event); + } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java index 59abeb0e3..2b329d934 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java @@ -29,6 +29,7 @@ import com.imageworks.spcue.DispatchFrame; import com.imageworks.spcue.DispatchHost; import com.imageworks.spcue.DispatchJob; +import com.imageworks.spcue.ExecutionSummary; import com.imageworks.spcue.FrameDetail; import com.imageworks.spcue.JobDetail; import com.imageworks.spcue.LayerDetail; @@ -56,6 +57,13 @@ import com.imageworks.spcue.dao.ServiceDao; import com.imageworks.spcue.grpc.service.Service; import com.imageworks.spcue.grpc.service.ServiceOverride; +import 
com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.grpc.monitoring.JobEvent; +import com.imageworks.spcue.grpc.monitoring.LayerEvent; +import com.imageworks.spcue.PrometheusMetricsCollector; /** * The FrameCompleteHandler encapsulates all logic necessary for processing FrameComplete reports @@ -83,6 +91,9 @@ public class FrameCompleteHandler { private ServiceDao serviceDao; private ShowDao showDao; private Environment env; + private KafkaEventPublisher kafkaEventPublisher; + private MonitoringEventBuilder monitoringEventBuilder; + private PrometheusMetricsCollector prometheusMetrics; /* * The last time a proc was unbooked for subscription or job balancing. Since there are so many @@ -255,6 +266,11 @@ public void handlePostFrameCompleteOperations(VirtualProc proc, FrameCompleteRep FrameDetail frameDetail) { try { + /* + * Publish frame complete event to Kafka for monitoring + */ + publishFrameCompleteEvent(report, frame, frameDetail, newFrameState, proc); + /* * The default behavior is to keep the proc on the same job. 
*/ @@ -270,6 +286,29 @@ public void handlePostFrameCompleteOperations(VirtualProc proc, FrameCompleteRep isLayerComplete = jobManager.isLayerComplete(frame); if (isLayerComplete) { jobManagerSupport.satisfyWhatDependsOn((LayerInterface) frame); + + // Record layer max runtime and memory metrics + if (prometheusMetrics != null) { + ExecutionSummary layerSummary = + jobManager.getExecutionSummary((LayerInterface) frame); + LayerDetail layerDetail = jobManager.getLayerDetail(frame.getLayerId()); + prometheusMetrics.recordLayerMaxRuntime(layerSummary.highFrameSec, + frame.show, frame.shot, layerDetail.type.toString()); + if (layerSummary.highMemoryKb > 0) { + prometheusMetrics.recordLayerMaxMemory( + layerSummary.highMemoryKb * 1024L, frame.show, frame.shot, + layerDetail.type.toString()); + } + } + + // Publish layer completed event to Kafka + if (kafkaEventPublisher != null && kafkaEventPublisher.isEnabled()) { + LayerDetail layerDetail = jobManager.getLayerDetail(frame.getLayerId()); + LayerEvent layerEvent = + monitoringEventBuilder.buildLayerEvent(EventType.LAYER_COMPLETED, + layerDetail, frame.getName(), frame.show); + kafkaEventPublisher.publishLayerEvent(layerEvent); + } } } @@ -721,4 +760,45 @@ public void setShowDao(ShowDao showDao) { this.showDao = showDao; } + public KafkaEventPublisher getKafkaEventPublisher() { + return kafkaEventPublisher; + } + + public void setKafkaEventPublisher(KafkaEventPublisher kafkaEventPublisher) { + this.kafkaEventPublisher = kafkaEventPublisher; + } + + public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuilder) { + this.monitoringEventBuilder = monitoringEventBuilder; + } + + public PrometheusMetricsCollector getPrometheusMetrics() { + return prometheusMetrics; + } + + public void setPrometheusMetrics(PrometheusMetricsCollector prometheusMetrics) { + this.prometheusMetrics = prometheusMetrics; + } + + /** + * Publishes a frame complete event to Kafka for monitoring purposes. 
This method is called + * asynchronously to avoid blocking the dispatch thread. + */ + private void publishFrameCompleteEvent(FrameCompleteReport report, DispatchFrame frame, + FrameDetail frameDetail, FrameState newFrameState, VirtualProc proc) { + // Record Prometheus metrics for frame completion + if (prometheusMetrics != null) { + prometheusMetrics.recordFrameCompleted(newFrameState.name(), frame.show, frame.shot); + } + + // Publish to Kafka if enabled + if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) { + return; + } + + FrameEvent event = monitoringEventBuilder.buildFrameCompleteEvent(report, newFrameState, + frameDetail.state, frame, proc); + kafkaEventPublisher.publishFrameEvent(event); + } + } diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java index 86524fdf3..9adb4ed62 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/HostReportHandler.java @@ -68,6 +68,11 @@ import org.springframework.dao.DataAccessException; import org.springframework.dao.EmptyResultDataAccessException; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.HostEvent; + public class HostReportHandler { private static final Logger logger = LogManager.getLogger(HostReportHandler.class); @@ -84,6 +89,8 @@ public class HostReportHandler { private JobManager jobManager; private JobDao jobDao; private LayerDao layerDao; + private KafkaEventPublisher kafkaEventPublisher; + private MonitoringEventBuilder monitoringEventBuilder; @Autowired private Environment env; @@ -162,6 +169,12 @@ public void queueHostReport(HostReport report) { public void handleHostReport(HostReport report, boolean isBoot) { 
long startTime = System.currentTimeMillis(); try { + // Record Prometheus metric for host report + if (prometheusMetrics != null) { + String facility = report.getHost().getFacility(); + prometheusMetrics.recordHostReport(facility != null ? facility : "unknown"); + } + long swapOut = 0; if (report.getHost().getAttributesMap().containsKey("swapout")) { swapOut = Integer.parseInt(report.getHost().getAttributesMap().get("swapout")); @@ -361,10 +374,14 @@ private void changeHardwareState(DispatchHost host, HardwareState reportState, b return; } + HardwareState previousState = host.hardwareState; + boolean stateChanged = false; + switch (host.hardwareState) { case DOWN: hostManager.setHostState(host, HardwareState.UP); host.hardwareState = HardwareState.UP; + stateChanged = true; break; case REBOOTING: case REBOOT_WHEN_IDLE: @@ -372,6 +389,7 @@ private void changeHardwareState(DispatchHost host, HardwareState reportState, b if (isBoot) { hostManager.setHostState(host, HardwareState.UP); host.hardwareState = HardwareState.UP; + stateChanged = true; } break; case REPAIR: @@ -380,8 +398,14 @@ private void changeHardwareState(DispatchHost host, HardwareState reportState, b default: hostManager.setHostState(host, reportState); host.hardwareState = reportState; + stateChanged = true; break; } + + // Publish host state change event + if (stateChanged) { + publishHostEvent(host, previousState, null); + } } /** @@ -1084,4 +1108,33 @@ public ThreadPoolExecutor getKillQueue() { public void setKillQueue(ThreadPoolExecutor killQueue) { this.killQueue = killQueue; } + + public KafkaEventPublisher getKafkaEventPublisher() { + return kafkaEventPublisher; + } + + public void setKafkaEventPublisher(KafkaEventPublisher kafkaEventPublisher) { + this.kafkaEventPublisher = kafkaEventPublisher; + } + + public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuilder) { + this.monitoringEventBuilder = monitoringEventBuilder; + } + + /** + * Publishes a host state change event 
to Kafka for monitoring purposes. + */ + private void publishHostEvent(DispatchHost host, HardwareState previousState, String reason) { + if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) { + return; + } + + try { + HostEvent event = monitoringEventBuilder.buildHostEvent(EventType.HOST_STATE_CHANGED, + host, previousState, host.lockState, reason); + kafkaEventPublisher.publishHostEvent(event); + } catch (Exception e) { + logger.trace("Failed to publish host event: {}", e.getMessage()); + } + } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/monitoring/KafkaEventPublisher.java b/cuebot/src/main/java/com/imageworks/spcue/monitoring/KafkaEventPublisher.java new file mode 100644 index 000000000..16583b9a7 --- /dev/null +++ b/cuebot/src/main/java/com/imageworks/spcue/monitoring/KafkaEventPublisher.java @@ -0,0 +1,361 @@ +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.imageworks.spcue.monitoring; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.UUID; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +import javax.annotation.PostConstruct; +import javax.annotation.PreDestroy; + +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.AdminClientConfig; +import org.apache.kafka.clients.admin.CreateTopicsResult; +import org.apache.kafka.clients.admin.NewTopic; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.errors.TopicExistsException; +import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.env.Environment; + +import com.google.protobuf.Message; +import com.google.protobuf.util.JsonFormat; +import com.imageworks.spcue.grpc.monitoring.EventHeader; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.grpc.monitoring.HostEvent; +import com.imageworks.spcue.grpc.monitoring.JobEvent; +import com.imageworks.spcue.grpc.monitoring.LayerEvent; +import com.imageworks.spcue.grpc.monitoring.ProcEvent; +import com.imageworks.spcue.util.CueExceptionUtil; + +/** + * KafkaEventPublisher publishes monitoring events to Kafka topics for downstream processing. 
Events + * are serialized as JSON for compatibility with Elasticsearch and other consumers. + * + * This service is the central point for all monitoring event publishing in Cuebot. Events are + * queued and published asynchronously to avoid blocking the main dispatch threads. + */ +public class KafkaEventPublisher extends ThreadPoolExecutor { + private static final Logger logger = LogManager.getLogger(KafkaEventPublisher.class); + + // Thread pool configuration + private static final int THREAD_POOL_SIZE_INITIAL = 2; + private static final int THREAD_POOL_SIZE_MAX = 4; + private static final int QUEUE_SIZE = 5000; + + // Kafka topic names + private static final String TOPIC_JOB_EVENTS = "opencue.job.events"; + private static final String TOPIC_LAYER_EVENTS = "opencue.layer.events"; + private static final String TOPIC_FRAME_EVENTS = "opencue.frame.events"; + private static final String TOPIC_HOST_EVENTS = "opencue.host.events"; + private static final String TOPIC_PROC_EVENTS = "opencue.proc.events"; + + // All topics managed by this publisher + private static final List<String> ALL_TOPICS = Arrays.asList(TOPIC_JOB_EVENTS, + TOPIC_LAYER_EVENTS, TOPIC_FRAME_EVENTS, TOPIC_HOST_EVENTS, TOPIC_PROC_EVENTS); + + // Default topic configuration + private static final int DEFAULT_NUM_PARTITIONS = 3; + private static final short DEFAULT_REPLICATION_FACTOR = 1; + private static final String DEFAULT_RETENTION_MS = "604800000"; // 7 days + private static final String DEFAULT_CLEANUP_POLICY = "delete"; + private static final String DEFAULT_SEGMENT_MS = "86400000"; // 1 day + private static final String DEFAULT_SEGMENT_BYTES = "1073741824"; // 1GB + + @Autowired + private Environment env; + + private KafkaProducer<String, String> producer; + private AdminClient adminClient; + private JsonFormat.Printer jsonPrinter; + private String sourceCuebot; + private String bootstrapServers; + private boolean enabled = false; + + public KafkaEventPublisher() { + super(THREAD_POOL_SIZE_INITIAL, THREAD_POOL_SIZE_MAX, 
10, TimeUnit.SECONDS, + new LinkedBlockingQueue<Runnable>(QUEUE_SIZE)); + } + + @PostConstruct + public void initialize() { + enabled = env.getProperty("monitoring.kafka.enabled", Boolean.class, false); + + if (!enabled) { + logger.info("Kafka event publishing is disabled"); + return; + } + + try { + sourceCuebot = InetAddress.getLocalHost().getHostName(); + } catch (UnknownHostException e) { + sourceCuebot = "unknown"; + } + + bootstrapServers = env.getProperty("monitoring.kafka.bootstrap.servers", "localhost:9092"); + + jsonPrinter = + JsonFormat.printer().includingDefaultValueFields().preservingProtoFieldNames(); + + // Initialize admin client and create topics before starting the producer + initializeAdminClient(); + createTopics(); + initializeKafkaProducer(); + + logger.info("Kafka event publishing initialized, source cuebot: {}", sourceCuebot); + } + + private void initializeAdminClient() { + Properties props = new Properties(); + props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + props.put(AdminClientConfig.REQUEST_TIMEOUT_MS_CONFIG, + env.getProperty("monitoring.kafka.admin.timeout.ms", Integer.class, 30000)); + adminClient = AdminClient.create(props); + logger.info("Kafka AdminClient initialized"); + } + + /** + * Creates all monitoring topics with proper configuration. Topics that already exist are + * skipped. 
+ */ + private void createTopics() { + int numPartitions = env.getProperty("monitoring.kafka.topic.partitions", Integer.class, + DEFAULT_NUM_PARTITIONS); + short replicationFactor = env.getProperty("monitoring.kafka.topic.replication.factor", + Short.class, DEFAULT_REPLICATION_FACTOR); + String retentionMs = + env.getProperty("monitoring.kafka.topic.retention.ms", DEFAULT_RETENTION_MS); + String cleanupPolicy = + env.getProperty("monitoring.kafka.topic.cleanup.policy", DEFAULT_CLEANUP_POLICY); + String segmentMs = env.getProperty("monitoring.kafka.topic.segment.ms", DEFAULT_SEGMENT_MS); + String segmentBytes = + env.getProperty("monitoring.kafka.topic.segment.bytes", DEFAULT_SEGMENT_BYTES); + + // Topic configuration + Map<String, String> topicConfig = new HashMap<>(); + topicConfig.put("retention.ms", retentionMs); + topicConfig.put("cleanup.policy", cleanupPolicy); + topicConfig.put("segment.ms", segmentMs); + topicConfig.put("segment.bytes", segmentBytes); + + for (String topicName : ALL_TOPICS) { + createTopic(topicName, numPartitions, replicationFactor, topicConfig); + } + } + + /** + * Creates a single topic with the specified configuration. 
+ */ + private void createTopic(String topicName, int numPartitions, short replicationFactor, + Map config) { + NewTopic newTopic = new NewTopic(topicName, numPartitions, replicationFactor); + newTopic.configs(config); + + CreateTopicsResult result = adminClient.createTopics(Collections.singletonList(newTopic)); + + try { + result.values().get(topicName).get(); + logger.info("Topic '{}' created successfully with {} partitions, replication={}", + topicName, numPartitions, replicationFactor); + } catch (ExecutionException e) { + if (e.getCause() instanceof TopicExistsException) { + logger.info("Topic '{}' already exists", topicName); + } else { + logger.error("Failed to create topic '{}': {}", topicName, e.getMessage()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + logger.error("Interrupted while creating topic '{}': {}", topicName, e.getMessage()); + } + } + + private void initializeKafkaProducer() { + Properties props = new Properties(); + + // Kafka broker configuration + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + + // Serialization + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); + + // Producer configuration for reliability + props.put(ProducerConfig.ACKS_CONFIG, env.getProperty("monitoring.kafka.acks", "1")); + props.put(ProducerConfig.RETRIES_CONFIG, + env.getProperty("monitoring.kafka.retries", Integer.class, 3)); + props.put(ProducerConfig.BATCH_SIZE_CONFIG, + env.getProperty("monitoring.kafka.batch.size", Integer.class, 16384)); + props.put(ProducerConfig.LINGER_MS_CONFIG, + env.getProperty("monitoring.kafka.linger.ms", Integer.class, 10)); + props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, + env.getProperty("monitoring.kafka.buffer.memory", Long.class, 33554432L)); + + // Compression + props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, + 
env.getProperty("monitoring.kafka.compression.type", "lz4")); + + // Client ID + props.put(ProducerConfig.CLIENT_ID_CONFIG, "cuebot-" + sourceCuebot); + + producer = new KafkaProducer<>(props); + } + + @PreDestroy + public void shutdown() { + if (producer != null) { + producer.flush(); + producer.close(); + } + if (adminClient != null) { + adminClient.close(); + } + shutdownNow(); + logger.info("Kafka event publisher shut down"); + } + + /** + * Creates a standard event header with common fields populated. + */ + public EventHeader.Builder createEventHeader(EventType eventType) { + return EventHeader.newBuilder().setEventId(UUID.randomUUID().toString()) + .setEventType(eventType).setTimestamp(System.currentTimeMillis()) + .setSourceCuebot(sourceCuebot); + } + + /** + * Creates an event header with a correlation ID for tracing related events. + */ + public EventHeader.Builder createEventHeader(EventType eventType, String correlationId) { + return createEventHeader(eventType).setCorrelationId(correlationId); + } + + /** + * Publishes a job event to Kafka. + */ + public void publishJobEvent(JobEvent event) { + if (!enabled) + return; + publishEvent(TOPIC_JOB_EVENTS, event.getJob().getId(), event, + event.getHeader().getEventType().name()); + } + + /** + * Publishes a layer event to Kafka. + */ + public void publishLayerEvent(LayerEvent event) { + if (!enabled) + return; + publishEvent(TOPIC_LAYER_EVENTS, event.getLayer().getId(), event, + event.getHeader().getEventType().name()); + } + + /** + * Publishes a frame event to Kafka. + */ + public void publishFrameEvent(FrameEvent event) { + if (!enabled) + return; + publishEvent(TOPIC_FRAME_EVENTS, event.getFrame().getId(), event, + event.getHeader().getEventType().name()); + } + + /** + * Publishes a host event to Kafka. 
+ */ + public void publishHostEvent(HostEvent event) { + if (!enabled) + return; + publishEvent(TOPIC_HOST_EVENTS, event.getHost().getName(), event, + event.getHeader().getEventType().name()); + } + + /** + * Publishes a proc event to Kafka. + */ + public void publishProcEvent(ProcEvent event) { + if (!enabled) + return; + publishEvent(TOPIC_PROC_EVENTS, event.getProcId(), event, + event.getHeader().getEventType().name()); + } + + /** + * Internal method to publish any protobuf message to a Kafka topic. + */ + private void publishEvent(String topic, String key, Message event, String eventType) { + try { + execute(() -> { + try { + String jsonValue = jsonPrinter.print(event); + ProducerRecord record = + new ProducerRecord<>(topic, key, jsonValue); + + producer.send(record, (metadata, exception) -> { + if (exception != null) { + logger.warn("Failed to publish event to topic {}: {}", topic, + exception.getMessage()); + } else { + logger.trace("Published event to {}, partition={}, offset={}", topic, + metadata.partition(), metadata.offset()); + } + }); + } catch (Exception e) { + logger.warn("Error serializing event for topic {}: {}", topic, e.getMessage()); + CueExceptionUtil.logStackTrace("KafkaEventPublisher error", e); + } + }); + } catch (RejectedExecutionException e) { + logger.warn("Event queue is full, dropping event for topic {}", topic); + } + } + + /** + * Returns true if Kafka event publishing is enabled. + */ + public boolean isEnabled() { + return enabled; + } + + /** + * Returns the name of this cuebot instance for event attribution. + */ + public String getSourceCuebot() { + return sourceCuebot; + } + + /** + * Returns the number of pending events in the queue. 
+ */ + public int getPendingEventCount() { + return getQueue().size(); + } +} diff --git a/cuebot/src/main/java/com/imageworks/spcue/monitoring/MonitoringEventBuilder.java b/cuebot/src/main/java/com/imageworks/spcue/monitoring/MonitoringEventBuilder.java new file mode 100644 index 000000000..b039beab7 --- /dev/null +++ b/cuebot/src/main/java/com/imageworks/spcue/monitoring/MonitoringEventBuilder.java @@ -0,0 +1,334 @@ +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.imageworks.spcue.monitoring; + +import com.imageworks.spcue.DispatchFrame; +import com.imageworks.spcue.DispatchHost; +import com.imageworks.spcue.DispatchJob; +import com.imageworks.spcue.FrameDetail; +import com.imageworks.spcue.JobDetail; +import com.imageworks.spcue.LayerDetail; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.grpc.host.HardwareState; +import com.imageworks.spcue.grpc.host.Host; +import com.imageworks.spcue.grpc.host.LockState; +import com.imageworks.spcue.grpc.host.ThreadMode; +import com.imageworks.spcue.grpc.job.CheckpointState; +import com.imageworks.spcue.grpc.job.Frame; +import com.imageworks.spcue.grpc.job.FrameState; +import com.imageworks.spcue.grpc.job.Job; +import com.imageworks.spcue.grpc.job.JobState; +import com.imageworks.spcue.grpc.job.Layer; +import com.imageworks.spcue.grpc.job.LayerType; +import com.imageworks.spcue.grpc.monitoring.EventHeader; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.grpc.monitoring.HostEvent; +import com.imageworks.spcue.grpc.monitoring.JobEvent; +import com.imageworks.spcue.grpc.monitoring.LayerEvent; +import com.imageworks.spcue.grpc.monitoring.ProcEvent; +import com.imageworks.spcue.grpc.report.FrameCompleteReport; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Helper class for building monitoring events from OpenCue domain objects. Provides factory methods + * to create properly populated event messages. + */ +public class MonitoringEventBuilder { + + private final KafkaEventPublisher publisher; + + public MonitoringEventBuilder(KafkaEventPublisher publisher) { + this.publisher = publisher; + } + + /** + * Builds a JobEvent for a job state change. 
+ */ + public JobEvent buildJobEvent(EventType eventType, JobDetail job, JobState previousState, + String reason, String killedBy) { + EventHeader header = publisher.createEventHeader(eventType, job.getJobId()).build(); + + // Build the embedded Job message + Job.Builder jobBuilder = Job.newBuilder().setId(job.getJobId()).setName(job.getName()) + .setShow(job.showName != null ? job.showName : "").setShot(job.shot) + .setUser(job.user).setFacility(job.facilityName) + .setGroup(job.groupId != null ? job.groupId : "").setState(job.state) + .setPriority(job.priority).setMinCores(job.minCoreUnits / 100.0f) + .setMaxCores(job.maxCoreUnits / 100.0f).setMinGpus(job.minGpuUnits) + .setMaxGpus(job.maxGpuUnits).setIsPaused(job.isPaused).setAutoEat(job.isAutoEat) + .setLogDir(job.logDir != null ? job.logDir : "") + .setOs(job.os != null ? job.os : ""); + + if (job.startTime > 0) { + jobBuilder.setStartTime(job.startTime); + } + if (job.stopTime > 0) { + jobBuilder.setStopTime(job.stopTime); + } + + JobEvent.Builder builder = JobEvent.newBuilder().setHeader(header) + .setJob(jobBuilder.build()).setPreviousState(previousState); + + if (reason != null) { + builder.setReason(reason); + } + if (killedBy != null) { + builder.setKilledBy(killedBy); + } + + return builder.build(); + } + + /** + * Builds a JobEvent for a dispatch job (lighter weight). + */ + public JobEvent buildJobEvent(EventType eventType, DispatchJob job, JobState previousState) { + EventHeader header = publisher.createEventHeader(eventType, job.getJobId()).build(); + + // Build the embedded Job message (minimal fields) + Job jobProto = Job.newBuilder().setId(job.getJobId()).setName(job.getName()) + .setState(job.state).setIsPaused(job.paused).setAutoEat(job.autoEat).build(); + + return JobEvent.newBuilder().setHeader(header).setJob(jobProto) + .setPreviousState(previousState).build(); + } + + /** + * Builds a LayerEvent for a layer. 
+ */ + public LayerEvent buildLayerEvent(EventType eventType, LayerDetail layer, String jobName, + String show) { + EventHeader header = publisher.createEventHeader(eventType, layer.getJobId()).build(); + + // Build the embedded Layer message + Layer.Builder layerBuilder = + Layer.newBuilder().setId(layer.getLayerId()).setName(layer.getName()) + .setType(layer.type).setMinCores(layer.minimumCores / 100.0f) + .setMaxCores(layer.maximumCores / 100.0f).setMinGpus(layer.minimumGpus) + .setMaxGpus(layer.maximumGpus).setMinMemory(layer.minimumMemory) + .setMinGpuMemory(layer.minimumGpuMemory).setIsThreadable(layer.isThreadable) + .setChunkSize(layer.chunkSize).setTimeout(layer.timeout) + .setTimeoutLlu(layer.timeout_llu).setParentId(layer.getJobId()); + + if (layer.tags != null && !layer.tags.isEmpty()) { + layerBuilder.addAllTags(layer.tags); + } + if (layer.services != null && !layer.services.isEmpty()) { + layerBuilder.addAllServices(layer.services); + } + if (layer.command != null) { + layerBuilder.setCommand(layer.command); + } + + return LayerEvent.newBuilder().setHeader(header).setLayer(layerBuilder.build()) + .setJobId(layer.getJobId()).setJobName(jobName).setShow(show).build(); + } + + /** + * Builds a FrameEvent for a frame completion. 
+ */ + public FrameEvent buildFrameCompleteEvent(FrameCompleteReport report, FrameState newState, + FrameState previousState, DispatchFrame frame, VirtualProc proc) { + EventType eventType = determineFrameEventType(newState); + EventHeader header = publisher.createEventHeader(eventType, frame.getJobId()).build(); + + // Build the embedded Frame message + Frame frameProto = Frame.newBuilder().setId(frame.getFrameId()).setName(frame.getName()) + .setLayerName(frame.layerName).setState(newState).setRetryCount(frame.retries) + .setExitStatus(report.getExitStatus()) + .setStartTime((int) report.getFrame().getStartTime()) + .setStopTime((int) (System.currentTimeMillis() / 1000)) + .setMaxRss(report.getFrame().getMaxRss()).setUsedMemory(report.getFrame().getRss()) + .setReservedMemory(proc.memoryReserved).setReservedGpuMemory(proc.gpuMemoryReserved) + .setLluTime((int) report.getFrame().getLluTime()) + .setMaxGpuMemory(report.getFrame().getMaxUsedGpuMemory()) + .setUsedGpuMemory(report.getFrame().getUsedGpuMemory()) + .setLastResource(report.getFrame().getResourceId()).build(); + + return FrameEvent.newBuilder().setHeader(header).setFrame(frameProto) + .setLayerId(frame.getLayerId()).setJobId(frame.getJobId()) + .setJobName(report.getFrame().getJobName()).setShow(frame.show) + .setPreviousState(previousState).setExitSignal(report.getExitSignal()) + .setRunTime(report.getRunTime()).setNumCores(report.getFrame().getNumCores()) + .setNumGpus(report.getFrame().getNumGpus()).setHostName(report.getHost().getName()) + .setResourceId(report.getFrame().getResourceId()).build(); + } + + /** + * Builds a FrameEvent for a frame becoming dispatchable (DEPEND -> WAITING transition). 
+ */ + public FrameEvent buildFrameDispatchableEvent(FrameDetail frame) { + EventHeader header = + publisher.createEventHeader(EventType.FRAME_DISPATCHED, frame.getJobId()).build(); + + // Build the embedded Frame message + Frame frameProto = Frame.newBuilder().setId(frame.getFrameId()).setName(frame.getName()) + .setNumber(frame.number).setState(FrameState.WAITING) + .setRetryCount(frame.retryCount).setDispatchOrder(frame.dispatchOrder).build(); + + return FrameEvent.newBuilder().setHeader(header).setFrame(frameProto) + .setLayerId(frame.getLayerId()).setJobId(frame.getJobId()) + .setPreviousState(FrameState.DEPEND).build(); + } + + /** + * Builds a FrameEvent for a frame being started (WAITING -> RUNNING transition). + */ + public FrameEvent buildFrameStartedEvent(DispatchFrame frame, VirtualProc proc) { + EventHeader header = + publisher.createEventHeader(EventType.FRAME_STARTED, frame.getJobId()).build(); + + // Build the embedded Frame message + Frame frameProto = Frame.newBuilder().setId(frame.getFrameId()).setName(frame.getName()) + .setLayerName(frame.layerName).setState(FrameState.RUNNING) + .setRetryCount(frame.retries) + .setStartTime((int) (System.currentTimeMillis() / 1000)) + .setReservedMemory(proc.memoryReserved).setReservedGpuMemory(proc.gpuMemoryReserved) + .build(); + + return FrameEvent.newBuilder().setHeader(header).setFrame(frameProto) + .setLayerId(frame.getLayerId()).setJobId(frame.getJobId()).setJobName(frame.jobName) + .setShow(frame.show).setPreviousState(frame.state) + .setNumCores((int) (proc.coresReserved / 100.0f)).setNumGpus(proc.gpusReserved) + .setHostName(proc.hostName).build(); + } + + /** + * Builds a FrameEvent for a frame state change (not completion). 
+ */ + public FrameEvent buildFrameEvent(EventType eventType, FrameDetail frame, String jobName, + String layerName, String show, FrameState previousState, String reason, + String killedBy) { + EventHeader header = publisher.createEventHeader(eventType, frame.getJobId()).build(); + + // Build the embedded Frame message + Frame.Builder frameBuilder = Frame.newBuilder().setId(frame.getFrameId()) + .setName(frame.getName()).setLayerName(layerName).setNumber(frame.number) + .setState(frame.state).setRetryCount(frame.retryCount) + .setExitStatus(frame.exitStatus).setDispatchOrder(frame.dispatchOrder); + + if (frame.dateStarted != null) { + frameBuilder.setStartTime((int) (frame.dateStarted.getTime() / 1000)); + } + if (frame.dateStopped != null) { + frameBuilder.setStopTime((int) (frame.dateStopped.getTime() / 1000)); + } + if (frame.maxRss > 0) { + frameBuilder.setMaxRss(frame.maxRss); + } + if (frame.lastResource != null) { + frameBuilder.setLastResource(frame.lastResource); + } + + FrameEvent.Builder builder = + FrameEvent.newBuilder().setHeader(header).setFrame(frameBuilder.build()) + .setLayerId(frame.getLayerId()).setJobId(frame.getJobId()) + .setJobName(jobName).setShow(show).setPreviousState(previousState); + + if (frame.lastResource != null) { + builder.setHostName(frame.lastResource); + } + if (reason != null) { + builder.setReason(reason); + } + if (killedBy != null) { + builder.setKilledBy(killedBy); + } + + return builder.build(); + } + + /** + * Builds a HostEvent for a host state change. + */ + public HostEvent buildHostEvent(EventType eventType, DispatchHost host, + HardwareState previousState, LockState previousLockState, String reason) { + EventHeader header = publisher.createEventHeader(eventType, host.getHostId()).build(); + + // Convert int threadMode to ThreadMode enum + ThreadMode threadMode = host.threadMode == 0 ? 
ThreadMode.AUTO : ThreadMode.ALL; + + // Build the embedded Host message + Host.Builder hostBuilder = Host.newBuilder().setId(host.getHostId()).setName(host.getName()) + .setAllocName(host.getAllocationId() != null ? host.getAllocationId() : "") + .setNimbyEnabled(host.isNimby).setCores(host.cores / 100.0f) + .setIdleCores(host.idleCores / 100.0f).setMemory(host.memory) + .setIdleMemory(host.idleMemory).setTotalMemory(host.memory).setGpus(host.gpus) + .setIdleGpus(host.idleGpus).setGpuMemory(host.gpuMemory) + .setIdleGpuMemory(host.idleGpuMemory).setTotalGpuMemory(host.gpuMemory) + .setState(host.hardwareState).setLockState(host.lockState) + .setThreadMode(threadMode); + + String[] osArray = host.getOs(); + if (osArray != null && osArray.length > 0) { + hostBuilder.setOs(String.join(",", osArray)); + } + if (host.tags != null) { + hostBuilder.addAllTags(Arrays.asList(host.tags.split("\\|"))); + } + + HostEvent.Builder builder = + HostEvent.newBuilder().setHeader(header).setHost(hostBuilder.build()) + .setFacility(host.getFacilityId() != null ? host.getFacilityId() : "") + .setPreviousState(previousState).setPreviousLockState(previousLockState); + + if (reason != null) { + builder.setReason(reason); + } + + return builder.build(); + } + + /** + * Builds a ProcEvent for a proc booking/unbooking. + */ + public ProcEvent buildProcEvent(EventType eventType, VirtualProc proc) { + EventHeader header = publisher.createEventHeader(eventType, proc.getJobId()).build(); + + ProcEvent.Builder builder = ProcEvent.newBuilder().setHeader(header) + .setProcId(proc.getProcId()).setProcName(proc.getName()).setHostId(proc.getHostId()) + .setHostName(proc.hostName).setJobId(proc.getJobId()) + .setFrameId(proc.frameId != null ? 
proc.frameId : "") + .setReservedCores(proc.coresReserved / 100.0f).setReservedGpus(proc.gpusReserved) + .setReservedMemory(proc.memoryReserved).setReservedGpuMemory(proc.gpuMemoryReserved) + .setIsLocalDispatch(proc.isLocalDispatch).setIsUnbooked(proc.unbooked); + + return builder.build(); + } + + /** + * Determines the appropriate event type based on the frame's new state. + */ + private EventType determineFrameEventType(FrameState state) { + switch (state) { + case SUCCEEDED: + return EventType.FRAME_COMPLETED; + case DEAD: + return EventType.FRAME_FAILED; + case EATEN: + return EventType.FRAME_EATEN; + case WAITING: + return EventType.FRAME_RETRIED; + case CHECKPOINT: + return EventType.FRAME_CHECKPOINT; + default: + return EventType.FRAME_COMPLETED; + } + } +} diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageMonitoring.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageMonitoring.java new file mode 100644 index 000000000..30865132d --- /dev/null +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageMonitoring.java @@ -0,0 +1,99 @@ +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.imageworks.spcue.servant; + +import com.imageworks.spcue.grpc.monitoring.GetFarmStatisticsRequest; +import com.imageworks.spcue.grpc.monitoring.GetFarmStatisticsResponse; +import com.imageworks.spcue.grpc.monitoring.GetFrameHistoryRequest; +import com.imageworks.spcue.grpc.monitoring.GetFrameHistoryResponse; +import com.imageworks.spcue.grpc.monitoring.GetHostHistoryRequest; +import com.imageworks.spcue.grpc.monitoring.GetHostHistoryResponse; +import com.imageworks.spcue.grpc.monitoring.GetJobHistoryRequest; +import com.imageworks.spcue.grpc.monitoring.GetJobHistoryResponse; +import com.imageworks.spcue.grpc.monitoring.GetLayerHistoryRequest; +import com.imageworks.spcue.grpc.monitoring.GetLayerHistoryResponse; +import com.imageworks.spcue.grpc.monitoring.GetLayerMemoryHistoryRequest; +import com.imageworks.spcue.grpc.monitoring.GetLayerMemoryHistoryResponse; +import com.imageworks.spcue.grpc.monitoring.MonitoringInterfaceGrpc; + +import io.grpc.stub.StreamObserver; + +/** + * gRPC servant for the MonitoringInterface service. + * + * Historical data queries are not implemented here - historical event data is indexed to + * Elasticsearch by the external kafka-es-indexer service and should be queried directly via the + * Elasticsearch HTTP API or Kibana. + */ +public class ManageMonitoring extends MonitoringInterfaceGrpc.MonitoringInterfaceImplBase { + + @Override + public void getJobHistory(GetJobHistoryRequest request, + StreamObserver responseObserver) { + responseObserver + .onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Historical data is indexed to Elasticsearch. " + + "Query via Elasticsearch HTTP API or Kibana.") + .asRuntimeException()); + } + + @Override + public void getFrameHistory(GetFrameHistoryRequest request, + StreamObserver responseObserver) { + responseObserver + .onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Historical data is indexed to Elasticsearch. 
" + + "Query via Elasticsearch HTTP API or Kibana.") + .asRuntimeException()); + } + + @Override + public void getLayerHistory(GetLayerHistoryRequest request, + StreamObserver responseObserver) { + responseObserver + .onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Historical data is indexed to Elasticsearch. " + + "Query via Elasticsearch HTTP API or Kibana.") + .asRuntimeException()); + } + + @Override + public void getHostHistory(GetHostHistoryRequest request, + StreamObserver responseObserver) { + responseObserver + .onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Historical data is indexed to Elasticsearch. " + + "Query via Elasticsearch HTTP API or Kibana.") + .asRuntimeException()); + } + + @Override + public void getFarmStatistics(GetFarmStatisticsRequest request, + StreamObserver responseObserver) { + responseObserver.onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Farm statistics should be queried from Prometheus/Elasticsearch.") + .asRuntimeException()); + } + + @Override + public void getLayerMemoryHistory(GetLayerMemoryHistoryRequest request, + StreamObserver responseObserver) { + responseObserver + .onError(io.grpc.Status.UNIMPLEMENTED + .withDescription("Historical data is indexed to Elasticsearch. 
" + + "Query via Elasticsearch HTTP API or Kibana.") + .asRuntimeException()); + } +} diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/DependManagerService.java b/cuebot/src/main/java/com/imageworks/spcue/service/DependManagerService.java index 1b62e54c0..9c0687549 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/DependManagerService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/DependManagerService.java @@ -28,6 +28,7 @@ import com.imageworks.spcue.BuildableDependency; import com.imageworks.spcue.DependencyManagerException; +import com.imageworks.spcue.FrameDetail; import com.imageworks.spcue.FrameInterface; import com.imageworks.spcue.JobInterface; import com.imageworks.spcue.LayerDetail; @@ -54,6 +55,11 @@ import com.imageworks.spcue.depend.PreviousFrame; import com.imageworks.spcue.grpc.depend.DependTarget; import com.imageworks.spcue.grpc.depend.DependType; +import com.imageworks.spcue.grpc.job.FrameState; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; import com.imageworks.spcue.util.CueUtil; import com.imageworks.spcue.util.FrameSet; @@ -67,6 +73,8 @@ public class DependManagerService implements DependManager { private LayerDao layerDao; private FrameDao frameDao; private FrameSearchFactory frameSearchFactory; + private KafkaEventPublisher kafkaEventPublisher; + private MonitoringEventBuilder monitoringEventBuilder; /** Job Depends **/ @Override @@ -510,6 +518,12 @@ public void satisfyDepend(LightweightDependency depend) { logger.warn( "warning, depend count for " + depend.getId() + "was not decremented " + "for frame " + f + "because the count is " + "already 0."); + } else { + // Check if frame just became dispatchable (depend_count = 0) + // and publish FRAME_DISPATCHED event (DEPEND -> WAITING transition) + if 
(dependDao.isFrameDispatchable(f)) { + publishFrameDispatchableEvent(f); + } } } } @@ -613,4 +627,26 @@ public FrameSearchFactory getFrameSearchFactory() { public void setFrameSearchFactory(FrameSearchFactory frameSearchFactory) { this.frameSearchFactory = frameSearchFactory; } + + public void setKafkaEventPublisher(KafkaEventPublisher kafkaEventPublisher) { + this.kafkaEventPublisher = kafkaEventPublisher; + } + + public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuilder) { + this.monitoringEventBuilder = monitoringEventBuilder; + } + + /** + * Publishes a frame dispatchable event to Kafka for monitoring purposes. This captures the + * DEPEND -> WAITING transition for pickup time analysis. + */ + private void publishFrameDispatchableEvent(FrameInterface frame) { + if (kafkaEventPublisher == null || !kafkaEventPublisher.isEnabled()) { + return; + } + + FrameDetail frameDetail = frameDao.getFrameDetail(frame); + FrameEvent event = monitoringEventBuilder.buildFrameDispatchableEvent(frameDetail); + kafkaEventPublisher.publishFrameEvent(event); + } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java index bb91ad1d8..4c2985da6 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java @@ -27,6 +27,7 @@ import com.imageworks.spcue.FrameInterface; import com.imageworks.spcue.HostInterface; +import com.imageworks.spcue.JobDetail; import com.imageworks.spcue.JobInterface; import com.imageworks.spcue.LayerInterface; import com.imageworks.spcue.LightweightDependency; @@ -49,6 +50,13 @@ import com.imageworks.spcue.rqd.RqdClient; import com.imageworks.spcue.util.CueExceptionUtil; import com.imageworks.spcue.util.FrameSet; +import com.imageworks.spcue.PrometheusMetricsCollector; +import com.imageworks.spcue.ExecutionSummary; +import 
com.imageworks.spcue.dao.ShowDao; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.JobEvent; /** * A non-transaction support class for managing jobs. @@ -66,6 +74,10 @@ public class JobManagerSupport { private RedirectManager redirectManager; private EmailSupport emailSupport; private FrameSearchFactory frameSearchFactory; + private PrometheusMetricsCollector prometheusMetrics; + private ShowDao showDao; + private KafkaEventPublisher kafkaEventPublisher; + private MonitoringEventBuilder monitoringEventBuilder; public void queueShutdownJob(JobInterface job, Source source, boolean isManualKill) { manageQueue.execute(new DispatchJobComplete(job, source, isManualKill, this)); @@ -150,6 +162,34 @@ public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill */ emailSupport.sendShutdownEmail(job); + // Record job completion metric + if (prometheusMetrics != null && showDao != null) { + JobDetail jobDetail = jobManager.getJobDetail(job.getId()); + String showName = showDao.getShowDetail(job.getShowId()).getName(); + String state = isManualKill ? "KILLED" : "FINISHED"; + prometheusMetrics.recordJobCompleted(state, showName, jobDetail.shot); + + // Record job core seconds histogram + ExecutionSummary execSummary = jobManager.getExecutionSummary(job); + prometheusMetrics.recordJobCoreSeconds(execSummary.coreTime, showName, + jobDetail.shot); + } + + // Publish job completed/killed event to Kafka + if (kafkaEventPublisher != null && kafkaEventPublisher.isEnabled()) { + try { + JobDetail jobDetail = jobManager.getJobDetail(job.getId()); + EventType eventType = + isManualKill ? 
EventType.JOB_KILLED : EventType.JOB_FINISHED; + JobState previousState = JobState.PENDING; + JobEvent jobEvent = monitoringEventBuilder.buildJobEvent(eventType, + jobDetail, previousState, null, null); + kafkaEventPublisher.publishJobEvent(jobEvent); + } catch (Exception e) { + logger.warn("Failed to publish job event", e); + } + } + return true; } } @@ -593,4 +633,32 @@ public FrameSearchFactory getFrameSearchFactory() { public void setFrameSearchFactory(FrameSearchFactory frameSearchFactory) { this.frameSearchFactory = frameSearchFactory; } + + public PrometheusMetricsCollector getPrometheusMetrics() { + return prometheusMetrics; + } + + public void setPrometheusMetrics(PrometheusMetricsCollector prometheusMetrics) { + this.prometheusMetrics = prometheusMetrics; + } + + public ShowDao getShowDao() { + return showDao; + } + + public void setShowDao(ShowDao showDao) { + this.showDao = showDao; + } + + public KafkaEventPublisher getKafkaEventPublisher() { + return kafkaEventPublisher; + } + + public void setKafkaEventPublisher(KafkaEventPublisher kafkaEventPublisher) { + this.kafkaEventPublisher = kafkaEventPublisher; + } + + public void setMonitoringEventBuilder(MonitoringEventBuilder monitoringEventBuilder) { + this.monitoringEventBuilder = monitoringEventBuilder; + } } diff --git a/cuebot/src/main/resources/conf/spring/applicationContext-grpc.xml b/cuebot/src/main/resources/conf/spring/applicationContext-grpc.xml index 3dbc429b0..612aeaff5 100644 --- a/cuebot/src/main/resources/conf/spring/applicationContext-grpc.xml +++ b/cuebot/src/main/resources/conf/spring/applicationContext-grpc.xml @@ -198,4 +198,7 @@ + + + diff --git a/cuebot/src/main/resources/conf/spring/applicationContext-monitoring.xml b/cuebot/src/main/resources/conf/spring/applicationContext-monitoring.xml new file mode 100644 index 000000000..cae80bac0 --- /dev/null +++ b/cuebot/src/main/resources/conf/spring/applicationContext-monitoring.xml @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + 
diff --git a/cuebot/src/main/resources/conf/spring/applicationContext-service.xml b/cuebot/src/main/resources/conf/spring/applicationContext-service.xml index 0d69e2aab..65e65d315 100644 --- a/cuebot/src/main/resources/conf/spring/applicationContext-service.xml +++ b/cuebot/src/main/resources/conf/spring/applicationContext-service.xml @@ -179,6 +179,8 @@ + + @@ -195,6 +197,8 @@ + + @@ -248,6 +252,10 @@ + + + + @@ -379,6 +387,9 @@ + + + @@ -394,6 +405,8 @@ + + diff --git a/cuebot/src/main/resources/opencue.properties b/cuebot/src/main/resources/opencue.properties index 233d00516..897a693a1 100644 --- a/cuebot/src/main/resources/opencue.properties +++ b/cuebot/src/main/resources/opencue.properties @@ -234,3 +234,38 @@ frame.finished_jobs_readonly=false metrics.prometheus.collector=false # What environment variable to use to acquire the deployment environment id (et. dev, prod, staging) metrics.prometheus.environment_id.environment_variable=DEPLOYMENT_ENVIRONMENT + +# ================================================================================== +# Render Farm Monitoring Configuration +# ================================================================================== + +# Kafka Event Publishing +# Enable/disable Kafka event publishing for render farm statistics +monitoring.kafka.enabled=${MONITORING_KAFKA_ENABLED:false} + +# Kafka broker connection string (comma-separated list of host:port) +monitoring.kafka.bootstrap.servers=${MONITORING_KAFKA_SERVERS:localhost:9092} + +# Kafka producer settings +monitoring.kafka.acks=1 +monitoring.kafka.retries=3 +monitoring.kafka.batch.size=16384 +monitoring.kafka.linger.ms=10 +monitoring.kafka.buffer.memory=33554432 +monitoring.kafka.compression.type=lz4 + +# Kafka consumer settings (for Elasticsearch indexing) +monitoring.kafka.consumer.group.id=opencue-elasticsearch-indexer +monitoring.kafka.consumer.auto.offset.reset=earliest +monitoring.kafka.consumer.enable.auto.commit=true 
+monitoring.kafka.consumer.auto.commit.interval.ms=5000 +monitoring.kafka.consumer.max.poll.records=500 + +# Elasticsearch Historical Storage +# Enable/disable Elasticsearch storage for historical data +monitoring.elasticsearch.enabled=${MONITORING_ELASTICSEARCH_ENABLED:false} + +# Elasticsearch connection settings +monitoring.elasticsearch.host=${MONITORING_ELASTICSEARCH_HOST:localhost} +monitoring.elasticsearch.port=${MONITORING_ELASTICSEARCH_PORT:9200} +monitoring.elasticsearch.scheme=${MONITORING_ELASTICSEARCH_SCHEME:http} diff --git a/cuebot/src/test/java/com/imageworks/spcue/config/TestAppConfig.java b/cuebot/src/test/java/com/imageworks/spcue/config/TestAppConfig.java index c68b525c6..4648088fd 100644 --- a/cuebot/src/test/java/com/imageworks/spcue/config/TestAppConfig.java +++ b/cuebot/src/test/java/com/imageworks/spcue/config/TestAppConfig.java @@ -33,7 +33,8 @@ "classpath:conf/spring/applicationContext-grpcServer.xml", "classpath:conf/spring/applicationContext-service.xml", "classpath:conf/spring/applicationContext-jms.xml", - "classpath:conf/spring/applicationContext-criteria.xml"}) + "classpath:conf/spring/applicationContext-criteria.xml", + "classpath:conf/spring/applicationContext-monitoring.xml"}) @EnableConfigurationProperties @PropertySource({"classpath:opencue.properties"}) public class TestAppConfig { diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/MonitoringEventBuilderTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/MonitoringEventBuilderTests.java new file mode 100644 index 000000000..f96d27040 --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/MonitoringEventBuilderTests.java @@ -0,0 +1,247 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.imageworks.spcue.test.monitoring; + +import org.junit.Before; +import org.junit.Test; + +import com.imageworks.spcue.DispatchFrame; +import com.imageworks.spcue.FrameDetail; +import com.imageworks.spcue.VirtualProc; +import com.imageworks.spcue.grpc.job.FrameState; +import com.imageworks.spcue.grpc.monitoring.EventHeader; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +/** + * Unit tests for MonitoringEventBuilder, specifically testing the pickup time tracking events: - + * FRAME_STARTED (WAITING -> RUNNING transition) - FRAME_DISPATCHED (DEPEND -> WAITING transition) + */ +public class MonitoringEventBuilderTests { + + private MonitoringEventBuilder eventBuilder; + private TestKafkaEventPublisher testPublisher; + + /** + * A test implementation of KafkaEventPublisher that provides event headers without requiring + * actual Kafka connectivity. 
+ */ + private static class TestKafkaEventPublisher extends KafkaEventPublisher { + private final String sourceCuebot = "test-cuebot"; + + @Override + public EventHeader.Builder createEventHeader(EventType eventType) { + return EventHeader.newBuilder().setEventId("test-event-id").setEventType(eventType) + .setTimestamp(System.currentTimeMillis()).setSourceCuebot(sourceCuebot); + } + + @Override + public EventHeader.Builder createEventHeader(EventType eventType, String correlationId) { + return createEventHeader(eventType).setCorrelationId(correlationId); + } + + @Override + public boolean isEnabled() { + return false; // Disabled for testing - we don't want to publish to real Kafka + } + } + + @Before + public void setUp() { + testPublisher = new TestKafkaEventPublisher(); + eventBuilder = new MonitoringEventBuilder(testPublisher); + } + + /** + * Test buildFrameStartedEvent for WAITING -> RUNNING transition. This event is used to + * calculate pickup time (time from ready to dispatch). 
+ */ + @Test + public void testBuildFrameStartedEvent() { + // Setup test data + DispatchFrame frame = createTestDispatchFrame(); + VirtualProc proc = createTestVirtualProc(); + + // Build the event + FrameEvent event = eventBuilder.buildFrameStartedEvent(frame, proc); + + // Verify the event + assertNotNull("Event should not be null", event); + assertNotNull("Event header should not be null", event.getHeader()); + assertEquals("Event type should be FRAME_STARTED", EventType.FRAME_STARTED, + event.getHeader().getEventType()); + + // Verify embedded frame fields (now using composition) + assertNotNull("Embedded frame should not be null", event.getFrame()); + assertEquals("test-frame-id", event.getFrame().getId()); + assertEquals("0001-test_layer", event.getFrame().getName()); + assertEquals("test_layer", event.getFrame().getLayerName()); + + // Verify context fields (still on event level) + assertEquals("test-layer-id", event.getLayerId()); + assertEquals("test-job-id", event.getJobId()); + assertEquals("test-job-name", event.getJobName()); + assertEquals("testing", event.getShow()); + + // Verify state transition + assertEquals("State should be RUNNING", FrameState.RUNNING, event.getFrame().getState()); + assertEquals("Previous state should be WAITING", FrameState.WAITING, + event.getPreviousState()); + + // Verify proc fields + assertEquals("test-host", event.getHostName()); + assertEquals(1, event.getNumCores()); + assertEquals(0, event.getNumGpus()); + assertTrue("Start time should be set", event.getFrame().getStartTime() > 0); + } + + /** + * Test buildFrameDispatchableEvent for DEPEND -> WAITING transition. This event marks when a + * frame becomes ready for dispatch after dependencies are satisfied. 
+ */ + @Test + public void testBuildFrameDispatchableEvent() { + // Setup test data + FrameDetail frame = createTestFrameDetail(); + + // Build the event + FrameEvent event = eventBuilder.buildFrameDispatchableEvent(frame); + + // Verify the event + assertNotNull("Event should not be null", event); + assertNotNull("Event header should not be null", event.getHeader()); + assertEquals("Event type should be FRAME_DISPATCHED", EventType.FRAME_DISPATCHED, + event.getHeader().getEventType()); + + // Verify embedded frame fields (now using composition) + assertNotNull("Embedded frame should not be null", event.getFrame()); + assertEquals("test-frame-id", event.getFrame().getId()); + assertEquals("0001-test_layer", event.getFrame().getName()); + assertEquals(1, event.getFrame().getNumber()); + + // Verify context fields (still on event level) + assertEquals("test-layer-id", event.getLayerId()); + assertEquals("test-job-id", event.getJobId()); + + // Verify state transition + assertEquals("State should be WAITING", FrameState.WAITING, event.getFrame().getState()); + assertEquals("Previous state should be DEPEND", FrameState.DEPEND, + event.getPreviousState()); + + // Verify other fields (now in embedded frame) + assertEquals(0, event.getFrame().getRetryCount()); + assertEquals(1, event.getFrame().getDispatchOrder()); + } + + /** + * Test that FRAME_STARTED event includes reserved resources from the proc. 
+ */ + @Test + public void testBuildFrameStartedEventIncludesResources() { + DispatchFrame frame = createTestDispatchFrame(); + VirtualProc proc = createTestVirtualProc(); + proc.memoryReserved = 4194304; // 4GB + proc.gpuMemoryReserved = 2097152; // 2GB + proc.coresReserved = 200; // 2 cores + proc.gpusReserved = 1; + + FrameEvent event = eventBuilder.buildFrameStartedEvent(frame, proc); + + // Reserved memory is now in the embedded Frame + assertEquals("Reserved memory should match", 4194304, event.getFrame().getReservedMemory()); + assertEquals("Reserved GPU memory should match", 2097152, + event.getFrame().getReservedGpuMemory()); + // Num cores and GPUs are still on event level (resource allocation info) + assertEquals("Num cores should be calculated from coresReserved/100", 2, + event.getNumCores()); + assertEquals("Num GPUs should match", 1, event.getNumGpus()); + } + + /** + * Test that events maintain correct correlation ID for tracing. + */ + @Test + public void testEventCorrelationId() { + DispatchFrame frame = createTestDispatchFrame(); + VirtualProc proc = createTestVirtualProc(); + + FrameEvent event = eventBuilder.buildFrameStartedEvent(frame, proc); + + assertEquals("Correlation ID should be the job ID", "test-job-id", + event.getHeader().getCorrelationId()); + } + + /** + * Test building event with retry count. 
+ */ + @Test + public void testBuildFrameStartedEventWithRetries() { + DispatchFrame frame = createTestDispatchFrame(); + frame.retries = 3; + VirtualProc proc = createTestVirtualProc(); + + FrameEvent event = eventBuilder.buildFrameStartedEvent(frame, proc); + + // Retry count is now in the embedded Frame + assertEquals("Retry count should be included", 3, event.getFrame().getRetryCount()); + } + + // Helper methods to create test objects + + private DispatchFrame createTestDispatchFrame() { + DispatchFrame frame = new DispatchFrame(); + frame.id = "test-frame-id"; + frame.name = "0001-test_layer"; + frame.layerId = "test-layer-id"; + frame.layerName = "test_layer"; + frame.jobId = "test-job-id"; + frame.jobName = "test-job-name"; + frame.show = "testing"; + frame.state = FrameState.WAITING; + frame.retries = 0; + return frame; + } + + private FrameDetail createTestFrameDetail() { + FrameDetail frame = new FrameDetail(); + frame.id = "test-frame-id"; + frame.name = "0001-test_layer"; + frame.number = 1; + frame.layerId = "test-layer-id"; + frame.jobId = "test-job-id"; + frame.state = FrameState.WAITING; + frame.retryCount = 0; + frame.dispatchOrder = 1; + return frame; + } + + private VirtualProc createTestVirtualProc() { + VirtualProc proc = new VirtualProc(); + proc.id = "test-proc-id"; + proc.hostName = "test-host"; + proc.coresReserved = 100; // 1 core + proc.gpusReserved = 0; + proc.memoryReserved = 3355443; + proc.gpuMemoryReserved = 0; + return proc; + } +} diff --git a/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/PickupTimeTrackingTests.java b/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/PickupTimeTrackingTests.java new file mode 100644 index 000000000..ee8908fae --- /dev/null +++ b/cuebot/src/test/java/com/imageworks/spcue/test/monitoring/PickupTimeTrackingTests.java @@ -0,0 +1,212 @@ + +/* + * Copyright Contributors to the OpenCue Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use 
this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +package com.imageworks.spcue.test.monitoring; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +import javax.annotation.Resource; + +import org.junit.Before; +import org.junit.Test; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.transaction.annotation.Transactional; + +import com.imageworks.spcue.FrameInterface; +import com.imageworks.spcue.JobDetail; +import com.imageworks.spcue.LayerInterface; +import com.imageworks.spcue.LightweightDependency; +import com.imageworks.spcue.dao.DependDao; +import com.imageworks.spcue.dao.FrameDao; +import com.imageworks.spcue.dao.LayerDao; +import com.imageworks.spcue.depend.FrameOnFrame; +import com.imageworks.spcue.depend.LayerOnLayer; +import com.imageworks.spcue.grpc.monitoring.EventType; +import com.imageworks.spcue.grpc.monitoring.FrameEvent; +import com.imageworks.spcue.monitoring.KafkaEventPublisher; +import com.imageworks.spcue.monitoring.MonitoringEventBuilder; +import com.imageworks.spcue.service.DependManager; +import com.imageworks.spcue.service.JobLauncher; +import com.imageworks.spcue.service.JobManager; +import com.imageworks.spcue.test.TransactionalTest; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; 
+import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** + * Integration tests for pickup time tracking events. + * + * These tests verify that: 1. FRAME_DISPATCHED events are published when frames transition from + * DEPEND to WAITING (when dependencies are satisfied) 2. The events contain correct state + * transition information for pickup time calculation + * + * Pickup time = FRAME_STARTED.timestamp - FRAME_DISPATCHED.timestamp This measures how long a frame + * waits in the queue after becoming dispatchable. + */ +@ContextConfiguration +public class PickupTimeTrackingTests extends TransactionalTest { + + @Resource + DependDao dependDao; + + @Resource + DependManager dependManager; + + @Resource + FrameDao frameDao; + + @Resource + LayerDao layerDao; + + @Resource + JobManager jobManager; + + @Resource + JobLauncher jobLauncher; + + @Before + public void launchTestJobs() { + jobLauncher.testMode = true; + jobLauncher.launch(new File("src/test/resources/conf/jobspec/jobspec_depend_test.xml")); + } + + public JobDetail getJobA() { + return jobManager.findJobDetail("pipe-dev.cue-testuser_depend_test_a"); + } + + public JobDetail getJobB() { + return jobManager.findJobDetail("pipe-dev.cue-testuser_depend_test_b"); + } + + /** + * Test that isFrameDispatchable returns true when depend_count is 0. 
+ */ + @Test + @Transactional + @Rollback(true) + public void testIsFrameDispatchable() { + JobDetail job_a = getJobA(); + JobDetail job_b = getJobB(); + LayerInterface layer_a = layerDao.findLayer(job_a, "pass_1"); + LayerInterface layer_b = layerDao.findLayer(job_b, "pass_1"); + FrameInterface frame_a = frameDao.findFrame(layer_a, 1); + FrameInterface frame_b = frameDao.findFrame(layer_b, 1); + + // Initially, frame_a should be dispatchable (no dependencies) + assertTrue("Frame with no dependencies should be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + + // Create a dependency: frame_a depends on frame_b + FrameOnFrame depend = new FrameOnFrame(frame_a, frame_b); + dependManager.createDepend(depend); + + // Now frame_a should NOT be dispatchable (has a dependency) + assertFalse("Frame with active dependency should not be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + + // Satisfy the dependency + LightweightDependency lwd = dependManager.getDepend(depend.getId()); + dependManager.satisfyDepend(lwd); + + // Now frame_a should be dispatchable again + assertTrue("Frame with satisfied dependency should be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + } + + /** + * Test that depend_count correctly tracks multiple dependencies. 
+ */ + @Test + @Transactional + @Rollback(true) + public void testMultipleDependencies() { + JobDetail job_a = getJobA(); + JobDetail job_b = getJobB(); + LayerInterface layer_a = layerDao.findLayer(job_a, "pass_1"); + LayerInterface layer_b = layerDao.findLayer(job_b, "pass_1"); + FrameInterface frame_a = frameDao.findFrame(layer_a, 1); + FrameInterface frame_b1 = frameDao.findFrame(layer_b, 1); + FrameInterface frame_b2 = frameDao.findFrame(layer_b, 2); + + // Create two dependencies for frame_a + FrameOnFrame depend1 = new FrameOnFrame(frame_a, frame_b1); + FrameOnFrame depend2 = new FrameOnFrame(frame_a, frame_b2); + dependManager.createDepend(depend1); + dependManager.createDepend(depend2); + + // Frame should not be dispatchable + assertFalse("Frame with two dependencies should not be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + + // Satisfy first dependency + dependManager.satisfyDepend(dependManager.getDepend(depend1.getId())); + + // Still not dispatchable (one dependency remaining) + assertFalse("Frame with one remaining dependency should not be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + + // Satisfy second dependency + dependManager.satisfyDepend(dependManager.getDepend(depend2.getId())); + + // Now should be dispatchable + assertTrue("Frame with all dependencies satisfied should be dispatchable", + dependDao.isFrameDispatchable(frame_a)); + } + + /** + * Test that LayerOnLayer dependency satisfaction makes all frames dispatchable. 
+ */ + @Test + @Transactional + @Rollback(true) + public void testLayerOnLayerMakesFramesDispatchable() { + JobDetail job_a = getJobA(); + JobDetail job_b = getJobB(); + LayerInterface layer_a = layerDao.findLayer(job_a, "pass_1"); + LayerInterface layer_b = layerDao.findLayer(job_b, "pass_1"); + + // Create layer-on-layer dependency + LayerOnLayer depend = new LayerOnLayer(layer_a, layer_b); + dependManager.createDepend(depend); + + // All frames in layer_a should not be dispatchable + FrameInterface frame_a1 = frameDao.findFrame(layer_a, 1); + FrameInterface frame_a5 = frameDao.findFrame(layer_a, 5); + assertFalse("Frame in dependent layer should not be dispatchable", + dependDao.isFrameDispatchable(frame_a1)); + assertFalse("Frame in dependent layer should not be dispatchable", + dependDao.isFrameDispatchable(frame_a5)); + + // Satisfy the layer dependency + for (LightweightDependency lwd : dependDao.getWhatDependsOn(layer_b)) { + dependManager.satisfyDepend(lwd); + } + + // All frames should now be dispatchable + assertTrue("Frame should be dispatchable after layer dependency satisfied", + dependDao.isFrameDispatchable(frame_a1)); + assertTrue("Frame should be dispatchable after layer dependency satisfied", + dependDao.isFrameDispatchable(frame_a5)); + } +} diff --git a/docs/_docs/concepts/command-execution.md b/docs/_docs/concepts/command-execution.md index 76be62d7e..3e0b03e21 100644 --- a/docs/_docs/concepts/command-execution.md +++ b/docs/_docs/concepts/command-execution.md @@ -1,6 +1,6 @@ --- title: "Command Execution on the Render Farm" -nav_order: 15 +nav_order: 16 parent: "Concepts" layout: default date: 2025-10-02 diff --git a/docs/_docs/concepts/cueweb-rest-gateway.md b/docs/_docs/concepts/cueweb-rest-gateway.md index 483c2a1dc..89fa1af3a 100644 --- a/docs/_docs/concepts/cueweb-rest-gateway.md +++ b/docs/_docs/concepts/cueweb-rest-gateway.md @@ -1,6 +1,6 @@ --- title: "CueWeb and REST Gateway" -nav_order: 16 +nav_order: 17 parent: Concepts layout: 
default linkTitle: "CueWeb and REST Gateway" diff --git a/docs/_docs/concepts/filters-and-actions.md b/docs/_docs/concepts/filters-and-actions.md index e4a0aa497..809424327 100644 --- a/docs/_docs/concepts/filters-and-actions.md +++ b/docs/_docs/concepts/filters-and-actions.md @@ -1,6 +1,6 @@ --- title: "Filters and Actions" -nav_order: 14 +nav_order: 15 parent: Concepts layout: default date: 2025-10-15 diff --git a/docs/_docs/concepts/glossary.md b/docs/_docs/concepts/glossary.md index af474b885..549611d9f 100644 --- a/docs/_docs/concepts/glossary.md +++ b/docs/_docs/concepts/glossary.md @@ -1,6 +1,6 @@ --- title: "Glossary" -nav_order: 11 +nav_order: 12 parent: Concepts layout: default linkTitle: "Glossary" diff --git a/docs/_docs/concepts/index.md b/docs/_docs/concepts/index.md index 2f98d6936..d535b6b56 100644 --- a/docs/_docs/concepts/index.md +++ b/docs/_docs/concepts/index.md @@ -1,7 +1,7 @@ --- layout: default title: Concepts -nav_order: 9 +nav_order: 10 has_children: true permalink: /docs/concepts --- diff --git a/docs/_docs/concepts/nimby.md b/docs/_docs/concepts/nimby.md index 5e9ede996..536c3ac02 100644 --- a/docs/_docs/concepts/nimby.md +++ b/docs/_docs/concepts/nimby.md @@ -1,6 +1,6 @@ --- title: "NIMBY" -nav_order: 13 +nav_order: 14 parent: Concepts layout: default linkTitle: "NIMBY" diff --git a/docs/_docs/concepts/opencue-overview.md b/docs/_docs/concepts/opencue-overview.md index d89f810be..f141f8a88 100644 --- a/docs/_docs/concepts/opencue-overview.md +++ b/docs/_docs/concepts/opencue-overview.md @@ -1,6 +1,6 @@ --- title: "OpenCue overview" -nav_order: 10 +nav_order: 11 parent: Concepts layout: default linkTitle: "OpenCue overview" diff --git a/docs/_docs/concepts/render-farm-monitoring.md b/docs/_docs/concepts/render-farm-monitoring.md new file mode 100644 index 000000000..a69076108 --- /dev/null +++ b/docs/_docs/concepts/render-farm-monitoring.md @@ -0,0 +1,181 @@ +--- +title: "Render farm monitoring" +nav_order: 18 +parent: Concepts +layout: 
default +linkTitle: "Render farm monitoring" +date: 2024-11-24 +description: > + Understanding the OpenCue render farm monitoring system +--- + +# Render farm monitoring + +### Understanding the OpenCue render farm monitoring system + +--- + +OpenCue provides a comprehensive monitoring system for tracking render farm operations, collecting metrics, and analyzing historical data. This system enables real-time visibility into job execution, resource utilization, and system health. + +## Overview + +The monitoring system is built on an event-driven architecture that captures lifecycle events from jobs, layers, frames, hosts, and processes. These events can be: + +![OpenCue Monitoring Grafana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png) + +- **Published to Kafka** for real-time streaming and integration with external systems +- **Stored in Elasticsearch** for historical analysis and querying +- **Exposed as Prometheus metrics** for real-time dashboards and alerting + +## Architecture + +The monitoring system uses a decoupled architecture: + +``` +┌────────────────────────────────────────────────────────────────────────────┐ +│ Cuebot │ +│ │ +│ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Service │────>│ KafkaEventPublisher │──────────> Kafka │ +│ │ Layer │ └─────────────────────┘ │ │ +│ └─────────────┘ │ │ │ +│ │ v │ │ +│ └─────────────>┌──────────────┐ │ │ +│ │ Prometheus │ │ │ +│ │ Metrics │ │ │ +│ └──────────────┘ │ │ +└────────────────────────────────────────────────────────────│───────────────┘ + │ + v +┌────────────────────────────────────────────────────────────────────────────┐ +│ kafka-es-indexer (Rust) │ +│ │ +│ ┌───────────────────┐ ┌─────────────────────────┐ │ +│ │ Kafka Consumer │────────>│ Elasticsearch Client │ │ +│ │ (rdkafka) │ │ (bulk indexing) │ │ +│ └───────────────────┘ └─────────────────────────┘ │ +│ │ │ +└────────────────────────────────────────────│───────────────────────────────┘ + v + Elasticsearch 
+``` + +### Event publishing (Kafka) + +Cuebot publishes events to Apache Kafka topics when significant state changes occur: + +| Topic | Description | +|-------|-------------| +| `opencue.job.events` | Job lifecycle events (created, started, finished, killed) | +| `opencue.layer.events` | Layer state changes | +| `opencue.frame.events` | Frame execution events (started, completed, failed, retried) | +| `opencue.host.events` | Host state changes (up, down, locked, nimby) | +| `opencue.proc.events` | Process allocation and deallocation events | + +Events are published asynchronously to avoid impacting render farm performance. A bounded queue ensures the system remains responsive even under high load. + +![UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +### Historical storage (Elasticsearch) + +A standalone Rust-based service (`kafka-es-indexer`) consumes events from Kafka and indexes them into Elasticsearch for long-term storage and analysis. This decoupled architecture enables: + +- **Historical queries**: Search for jobs, frames, or hosts by any attribute +- **Trend analysis**: Track metrics over time (job completion rates, failure patterns) +- **Capacity planning**: Analyze resource utilization patterns +- **Debugging**: Investigate issues by examining historical event sequences + +Elasticsearch indices are organized by event type and time-based partitioning for efficient querying. + +![Elasticsearch](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png) + +### Metrics collection (Prometheus) + +Cuebot exposes a `/metrics` endpoint compatible with Prometheus. 
Key metrics include: + +![Prometheus Metrics Interface](/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png) + +**Job and frame metrics:** +- `cue_frames_completed_total` - Counter of completed frames by state +- `cue_jobs_completed_total` - Counter of completed jobs by show +- `cue_frame_runtime_seconds` - Histogram of frame execution times +- `cue_frame_memory_bytes` - Histogram of frame memory usage + +**Queue metrics:** +- `cue_dispatch_waiting_total` - Tasks waiting in dispatch queue +- `cue_booking_waiting_total` - Tasks waiting in booking queue +- `cue_report_executed_total` - Host reports processed + +## Event types + +### Job events + +Job events capture the complete lifecycle of rendering jobs: + +- **JOB_CREATED**: A new job was submitted to the queue +- **JOB_STARTED**: The job began executing (first frame dispatched) +- **JOB_FINISHED**: All frames completed successfully +- **JOB_KILLED**: The job was manually terminated +- **JOB_PAUSED**: The job was paused +- **JOB_RESUMED**: The job was resumed from paused state + +### Frame events + +Frame events track individual frame execution: + +- **FRAME_STARTED**: A frame began rendering on a host +- **FRAME_COMPLETED**: A frame finished successfully +- **FRAME_FAILED**: A frame failed with an error +- **FRAME_RETRIED**: A failed frame was retried +- **FRAME_EATEN**: A frame was marked as complete without rendering + +### Host events + +Host events monitor render node status: + +- **HOST_UP**: A host came online +- **HOST_DOWN**: A host went offline +- **HOST_LOCKED**: A host was locked for maintenance +- **HOST_UNLOCKED**: A host was unlocked +- **HOST_NIMBY_LOCKED**: A host entered NIMBY mode +- **HOST_NIMBY_UNLOCKED**: A host exited NIMBY mode + +## Configuration + +### Cuebot configuration + +Enable Kafka and Prometheus through Cuebot properties: + +```properties +# Kafka event publishing +monitoring.kafka.enabled=true +monitoring.kafka.bootstrap.servers=kafka:9092 + +# Prometheus metrics 
+metrics.prometheus.collector=true +``` + +### kafka-es-indexer configuration + +The standalone Rust indexer (`rust/crates/kafka-es-indexer/`) is configured via environment variables or CLI arguments: + +```bash +# Using environment variables +export KAFKA_BOOTSTRAP_SERVERS=kafka:9092 +export ELASTICSEARCH_URL=http://elasticsearch:9200 +kafka-es-indexer + +# Or using CLI arguments +kafka-es-indexer \ + --kafka-servers kafka:9092 \ + --elasticsearch-url http://elasticsearch:9200 \ + --index-prefix opencue +``` + +Each component can be enabled or disabled independently based on your infrastructure needs. + +## What's next? + +- [Quick start: Setting up monitoring](/docs/quick-starts/quick-start-monitoring/) - Deploy the monitoring stack +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Configure dashboards and alerts +- [Monitoring developer guide](/docs/developer-guide/monitoring-development/) - Extend and customize the monitoring system diff --git a/docs/_docs/concepts/spi-case-study.md b/docs/_docs/concepts/spi-case-study.md index a2499eb2c..4d30eebb7 100644 --- a/docs/_docs/concepts/spi-case-study.md +++ b/docs/_docs/concepts/spi-case-study.md @@ -1,6 +1,6 @@ --- title: "OpenCue Sony Pictures Imageworks case study" -nav_order: 17 +nav_order: 19 parent: Concepts layout: default linkTitle: "OpenCue Sony Pictures Imageworks case study" diff --git a/docs/_docs/concepts/versioning.md b/docs/_docs/concepts/versioning.md index 84ad232a5..339b437b2 100644 --- a/docs/_docs/concepts/versioning.md +++ b/docs/_docs/concepts/versioning.md @@ -1,6 +1,6 @@ --- title: "Versioning" -nav_order: 12 +nav_order: 13 parent: Concepts layout: default linkTitle: "Versioning" diff --git a/docs/_docs/developer-guide/contributing.md b/docs/_docs/developer-guide/contributing.md index c7b383c12..8e461f47f 100644 --- a/docs/_docs/developer-guide/contributing.md +++ b/docs/_docs/developer-guide/contributing.md @@ -2,7 +2,7 @@ title: "Contributing to OpenCue" 
linkTitle: "Contributing to OpenCue" parent: "Developer Guide" -nav_order: 81 +nav_order: 87 layout: default date: 2020-05-04 description: > diff --git a/docs/_docs/developer-guide/cuecmd-development.md b/docs/_docs/developer-guide/cuecmd-development.md index 34a8fc7b3..f58ad4262 100644 --- a/docs/_docs/developer-guide/cuecmd-development.md +++ b/docs/_docs/developer-guide/cuecmd-development.md @@ -1,6 +1,6 @@ --- title: "Cuecmd Development Guide" -nav_order: 85 +nav_order: 91 parent: "Developer Guide" layout: default date: 2025-10-02 diff --git a/docs/_docs/developer-guide/cuecommander-technical-reference.md b/docs/_docs/developer-guide/cuecommander-technical-reference.md index b836b4d70..a47db19dd 100644 --- a/docs/_docs/developer-guide/cuecommander-technical-reference.md +++ b/docs/_docs/developer-guide/cuecommander-technical-reference.md @@ -2,7 +2,7 @@ title: "CueCommander Technical Reference" layout: default parent: "Developer Guide" -nav_order: 84 +nav_order: 90 linkTitle: "CueCommander Technical Reference" date: 2025-01-13 description: > diff --git a/docs/_docs/developer-guide/cuenimby-development.md b/docs/_docs/developer-guide/cuenimby-development.md index f3019c3c3..5fe5587c0 100644 --- a/docs/_docs/developer-guide/cuenimby-development.md +++ b/docs/_docs/developer-guide/cuenimby-development.md @@ -1,6 +1,6 @@ --- title: "CueNIMBY development guide" -nav_order: 86 +nav_order: 92 parent: Developer Guide layout: default linkTitle: "CueNIMBY development" diff --git a/docs/_docs/developer-guide/cuetopia-technical-reference.md b/docs/_docs/developer-guide/cuetopia-technical-reference.md index 6f70ed588..3c9438417 100644 --- a/docs/_docs/developer-guide/cuetopia-technical-reference.md +++ b/docs/_docs/developer-guide/cuetopia-technical-reference.md @@ -2,7 +2,7 @@ title: "Cuetopia Technical Reference" layout: default parent: "Developer Guide" -nav_order: 83 +nav_order: 89 linkTitle: "Cuetopia Technical Reference" date: 2025-01-07 description: > diff --git 
a/docs/_docs/developer-guide/cueweb-development.md b/docs/_docs/developer-guide/cueweb-development.md index ee9b0982a..9358c2e27 100644 --- a/docs/_docs/developer-guide/cueweb-development.md +++ b/docs/_docs/developer-guide/cueweb-development.md @@ -2,7 +2,7 @@ layout: default title: CueWeb Development parent: Developer Guide -nav_order: 89 +nav_order: 95 --- # CueWeb Development Guide diff --git a/docs/_docs/developer-guide/filter-development.md b/docs/_docs/developer-guide/filter-development.md index 26965cfc0..6aa50e749 100644 --- a/docs/_docs/developer-guide/filter-development.md +++ b/docs/_docs/developer-guide/filter-development.md @@ -1,6 +1,6 @@ --- title: "Filter Development" -nav_order: 87 +nav_order: 93 parent: Developer Guide layout: default date: 2025-10-15 diff --git a/docs/_docs/developer-guide/hybrid-rqd-setup.md b/docs/_docs/developer-guide/hybrid-rqd-setup.md index ab4fed94e..8638de5ca 100644 --- a/docs/_docs/developer-guide/hybrid-rqd-setup.md +++ b/docs/_docs/developer-guide/hybrid-rqd-setup.md @@ -1,6 +1,6 @@ --- title: "Hybrid RQD Setup for Testing" -nav_order: 90 +nav_order: 96 parent: "Developer Guide" layout: default date: 2025-10-29 diff --git a/docs/_docs/developer-guide/index.md b/docs/_docs/developer-guide/index.md index 6e70a1988..d04957f8d 100644 --- a/docs/_docs/developer-guide/index.md +++ b/docs/_docs/developer-guide/index.md @@ -1,6 +1,6 @@ --- title: "Developer Guide" -nav_order: 80 +nav_order: 86 has_children: true layout: default linkTitle: "Developer Guide" diff --git a/docs/_docs/developer-guide/monitoring-development.md b/docs/_docs/developer-guide/monitoring-development.md new file mode 100644 index 000000000..ae3ed294a --- /dev/null +++ b/docs/_docs/developer-guide/monitoring-development.md @@ -0,0 +1,437 @@ +--- +title: "Monitoring system development" +nav_order: 97 +parent: Developer Guide +layout: default +linkTitle: "Monitoring development" +date: 2024-11-24 +description: > + Extend and customize the OpenCue monitoring 
system +--- + +# Monitoring system development + +### Extend and customize the OpenCue monitoring system + +--- + +This guide explains how to extend, customize, and develop against the OpenCue monitoring system. + +## Architecture overview + +The monitoring system uses a decoupled architecture with Cuebot publishing events to Kafka and a standalone Rust-based indexer consuming events for Elasticsearch storage: + +``` +┌────────────────────────────────────────────────────────────────────────────┐ +│ Cuebot │ +│ │ +│ ┌─────────────┐ ┌─────────────────────┐ │ +│ │ Service │────>│ KafkaEventPublisher │──────────> Kafka │ +│ │ Layer │ └─────────────────────┘ │ │ +│ └─────────────┘ │ │ │ +│ │ v │ │ +│ └─────────────>┌──────────────┐ │ │ +│ │ Prometheus │ │ │ +│ │ Metrics │ │ │ +│ └──────────────┘ │ │ +└────────────────────────────────────────────────────────────│───────────────┘ + │ + v +┌────────────────────────────────────────────────────────────────────────────┐ +│ kafka-es-indexer (Rust) │ +│ │ +│ ┌───────────────────┐ ┌─────────────────────────┐ │ +│ │ Kafka Consumer │────────>│ Elasticsearch Client │ │ +│ │ (rdkafka) │ │ (bulk indexing) │ │ +│ └───────────────────┘ └─────────────────────────┘ │ +│ │ │ +└────────────────────────────────────────────│───────────────────────────────┘ + v + Elasticsearch +``` + +**Data flow:** +1. **Service Layer** (e.g., FrameCompleteHandler, HostReportHandler) generates events and calls KafkaEventPublisher +2. **KafkaEventPublisher** serializes events as JSON and publishes them to Kafka topics +3. **kafka-es-indexer** (standalone Rust service) consumes events from Kafka topics +4. **kafka-es-indexer** bulk indexes events into Elasticsearch for historical storage +5. 
**Prometheus Metrics** are updated directly by the Service Layer and KafkaEventPublisher (for queue metrics) + +### Key components + +| Component | Location | Purpose | +|-----------|----------|---------| +| `KafkaEventPublisher` | `com.imageworks.spcue.monitoring` | Publishes events to Kafka | +| `MonitoringEventBuilder` | `com.imageworks.spcue.monitoring` | Builds event payloads | +| `PrometheusMetricsCollector` | `com.imageworks.spcue` | Exposes Prometheus metrics | +| `kafka-es-indexer` | `rust/crates/kafka-es-indexer/` | Consumes Kafka, indexes to Elasticsearch | + +### Why a separate indexer? + +The Kafka-to-Elasticsearch indexer is implemented as a standalone Rust service rather than within Cuebot for several reasons: + +- **Decoupling**: Cuebot focuses on core scheduling; indexing is a separate concern +- **Scalability**: The indexer can be scaled independently from Cuebot +- **Reliability**: Kafka buffering ensures events are not lost if Elasticsearch is temporarily unavailable +- **Performance**: Rust provides efficient resource usage for high-throughput event processing +- **Operational flexibility**: The indexer can be updated, restarted, or replayed without affecting Cuebot + +## Adding new event types + +### Step 1: Define the event type + +Add the new event type to the `MonitoringEventType` enum: + +```java +// MonitoringEventType.java +public enum MonitoringEventType { + // Existing types... 
+    JOB_CREATED,
+    JOB_STARTED,
+    JOB_FINISHED,
+
+    // Add new type
+    JOB_PRIORITY_CHANGED
+}
+```
+
+### Step 2: Create the event builder method
+
+Add a builder method in `MonitoringEventBuilder`:
+
+```java
+// MonitoringEventBuilder.java
+public static MonitoringEvent buildJobPriorityChangedEvent(
+        JobDetail job, int oldPriority, int newPriority) {
+
+    MonitoringEvent.Builder builder = MonitoringEvent.newBuilder()
+        .setEventType(MonitoringEventType.JOB_PRIORITY_CHANGED)
+        .setTimestamp(Instant.now().toString())
+        .setJobId(job.id)
+        .setJobName(job.name)
+        .setShowName(job.showName);
+
+    // Add custom fields
+    builder.putMetadata("oldPriority", String.valueOf(oldPriority));
+    builder.putMetadata("newPriority", String.valueOf(newPriority));
+
+    return builder.build();
+}
+```
+
+### Step 3: Publish the event
+
+Call the Kafka publisher from the service layer. Note that the builder
+method defined in Step 2 is static, so it is called directly on
+`MonitoringEventBuilder` rather than on an injected instance:
+
+```java
+// JobManagerService.java
+@Autowired
+private KafkaEventPublisher kafkaEventPublisher;
+
+public void setJobPriority(JobInterface job, int priority) {
+    int oldPriority = jobDao.getJobPriority(job);
+    jobDao.updatePriority(job, priority);
+
+    // Publish monitoring event
+    try {
+        JobDetail detail = jobDao.getJobDetail(job.getJobId());
+        MonitoringEvent event = MonitoringEventBuilder.buildJobPriorityChangedEvent(
+            detail, oldPriority, priority);
+        kafkaEventPublisher.publishJobEvent(event);
+    } catch (Exception e) {
+        logger.warn("Failed to publish job priority event: {}", e.getMessage());
+    }
+}
+```
+
+### Step 4: Add Kafka topic (if needed)
+
+If the event requires a new topic, add it to `KafkaEventPublisher`:
+
+```java
+// KafkaEventPublisher.java
+private static final String TOPIC_JOB_EVENTS = "opencue.job.events";
+private static final String TOPIC_JOB_ADMIN_EVENTS = "opencue.job.admin.events"; // New topic
+
+private String getTopicForEvent(MonitoringEventType type) {
+    switch (type) {
+        case JOB_PRIORITY_CHANGED:
+            return 
TOPIC_JOB_ADMIN_EVENTS; + // ... existing mappings + default: + return TOPIC_JOB_EVENTS; + } +} +``` + +## Adding new Prometheus metrics + +### Counter metrics + +```java +// PrometheusMetrics.java +private static final Counter jobPriorityChanges = Counter.build() + .name("cue_job_priority_changes_total") + .help("Total number of job priority changes") + .labelNames("show", "direction") + .register(); + +public static void incrementJobPriorityChange(String show, boolean increased) { + String direction = increased ? "increased" : "decreased"; + jobPriorityChanges.labels(show, direction).inc(); +} +``` + +### Histogram metrics + +```java +private static final Histogram frameQueueTime = Histogram.build() + .name("cue_frame_queue_time_seconds") + .help("Time frames spend waiting in queue") + .labelNames("show") + .buckets(1, 5, 15, 30, 60, 300, 900, 1800, 3600) + .register(); + +public static void observeFrameQueueTime(String show, double seconds) { + frameQueueTime.labels(show).observe(seconds); +} +``` + +### Gauge metrics + +```java +private static final Gauge activeJobs = Gauge.build() + .name("cue_active_jobs") + .help("Number of currently active jobs") + .labelNames("show", "state") + .register(); + +public static void setActiveJobs(String show, String state, int count) { + activeJobs.labels(show, state).set(count); +} +``` + +## Customizing Elasticsearch indexing + +The `kafka-es-indexer` service handles all Elasticsearch indexing. It automatically routes events to indices based on the Kafka topic name. + +### Index templates + +Create custom index templates for new event types. 
Note that events use snake_case field names and include a `header` object: + +```json +{ + "index_patterns": ["opencue-job-admin-*"], + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + }, + "mappings": { + "properties": { + "header": { + "properties": { + "event_id": { "type": "keyword" }, + "event_type": { "type": "keyword" }, + "timestamp": { "type": "date", "format": "epoch_millis" }, + "source_cuebot": { "type": "keyword" }, + "correlation_id": { "type": "keyword" } + } + }, + "job_id": { "type": "keyword" }, + "job_name": { "type": "keyword" }, + "show": { "type": "keyword" }, + "old_priority": { "type": "integer" }, + "new_priority": { "type": "integer" }, + "user": { "type": "keyword" } + } + } +} +``` + +### Index naming convention + +The kafka-es-indexer creates daily indices using the pattern: + +``` +{topic-name-converted}-YYYY-MM-DD +``` + +For example: +- `opencue.job.events` → `opencue-job-events-2024-11-29` +- `opencue.frame.events` → `opencue-frame-events-2024-11-29` + +## Testing + +### Unit testing event builders + +```java +@Test +public void testBuildJobPriorityChangedEvent() { + JobDetail job = createTestJob(); + + MonitoringEvent event = MonitoringEventBuilder + .buildJobPriorityChangedEvent(job, 50, 100); + + assertEquals(MonitoringEventType.JOB_PRIORITY_CHANGED, + event.getEventType()); + assertEquals("50", event.getMetadataMap().get("oldPriority")); + assertEquals("100", event.getMetadataMap().get("newPriority")); +} +``` + +### Integration testing with embedded Kafka + +```java +@EmbeddedKafka(partitions = 1, topics = {"opencue.job.events"}) +public class KafkaEventPublisherIntegrationTest { + + @Autowired + private EmbeddedKafkaBroker embeddedKafka; + + @Autowired + private KafkaEventPublisher publisher; + + @Test + public void testPublishEvent() { + MonitoringEvent event = createTestEvent(); + publisher.publishEvent(event); + + // Verify event was published + ConsumerRecord record = + 
KafkaTestUtils.getSingleRecord(consumer, "opencue.job.events"); + assertNotNull(record); + } +} +``` + +## Configuration reference + +### Kafka configuration + +| Property | Default | Description | +|----------|---------|-------------| +| `monitoring.kafka.enabled` | `false` | Enable Kafka publishing | +| `monitoring.kafka.bootstrap.servers` | `localhost:9092` | Kafka broker addresses | +| `monitoring.kafka.queue.capacity` | `1000` | Event queue size | +| `monitoring.kafka.batch.size` | `100` | Batch size for publishing | +| `monitoring.kafka.linger.ms` | `100` | Time to wait before sending batch | +| `monitoring.kafka.acks` | `1` | Required acknowledgments | + +### kafka-es-indexer configuration + +The kafka-es-indexer is configured via command-line arguments, environment variables, or a YAML config file: + +| CLI Argument | Env Variable | Default | Description | +|--------------|--------------|---------|-------------| +| `--kafka-servers` | `KAFKA_BOOTSTRAP_SERVERS` | `localhost:9092` | Kafka broker addresses | +| `--kafka-group-id` | `KAFKA_GROUP_ID` | `opencue-elasticsearch-indexer` | Consumer group ID | +| `--elasticsearch-url` | `ELASTICSEARCH_URL` | `http://localhost:9200` | Elasticsearch URL | +| `--index-prefix` | `ELASTICSEARCH_INDEX_PREFIX` | `opencue` | Elasticsearch index prefix | +| `--log-level` | `LOG_LEVEL` | `info` | Log level (debug, info, warn, error) | +| `--config` | - | - | Path to YAML config file | + +The indexer automatically subscribes to all OpenCue Kafka topics: +- `opencue.job.events` +- `opencue.layer.events` +- `opencue.frame.events` +- `opencue.host.events` +- `opencue.proc.events` + +Example with CLI arguments: + +```bash +kafka-es-indexer \ + --kafka-servers kafka:9092 \ + --kafka-group-id opencue-elasticsearch-indexer \ + --elasticsearch-url http://elasticsearch:9200 \ + --index-prefix opencue \ + --log-level info +``` + +Example with environment variables: + +```bash +export KAFKA_BOOTSTRAP_SERVERS=kafka:9092 +export 
KAFKA_GROUP_ID=opencue-elasticsearch-indexer
+export ELASTICSEARCH_URL=http://elasticsearch:9200
+export ELASTICSEARCH_INDEX_PREFIX=opencue
+kafka-es-indexer
+```
+
+### Prometheus configuration
+
+| Property | Default | Description |
+|----------|---------|-------------|
+| `metrics.prometheus.collector` | `false` | Enable Prometheus metrics |
+| `metrics.prometheus.endpoint` | `/metrics` | Metrics endpoint path |
+
+## Debugging
+
+### Enable debug logging in Cuebot
+
+Add a logger for the monitoring package to `log4j2.xml`:
+
+```xml
+<Logger name="com.imageworks.spcue.monitoring" level="debug"/>
+```
+
+### Verify Kafka connectivity
+
+```bash
+# Check if events are being published
+kafka-console-consumer --bootstrap-server kafka:9092 \
+  --topic opencue.job.events --from-beginning
+
+# Check consumer group lag
+kafka-consumer-groups --bootstrap-server kafka:9092 \
+  --group opencue-elasticsearch-indexer --describe
+```
+
+### Debugging kafka-es-indexer
+
+```bash
+# View indexer logs
+docker logs opencue-kafka-es-indexer
+
+# Check indexer help
+docker exec opencue-kafka-es-indexer kafka-es-indexer --help
+
+# Verify Elasticsearch indices are being created
+curl -s "http://localhost:9200/_cat/indices/opencue-*?v"
+
+# Check event counts in Elasticsearch
+curl -s "http://localhost:9200/opencue-job-events-*/_count"
+curl -s "http://localhost:9200/opencue-frame-events-*/_count"
+```
+
+## Best practices
+
+### Event design
+
+- Keep events immutable and self-contained
+- Include all relevant context in the event payload
+- Use consistent naming conventions for event types
+- Version event schemas for backward compatibility
+
+### Performance
+
+- Use bounded queues to prevent memory exhaustion
+- Batch events when possible for better throughput
+- Monitor queue sizes and dropped events
+- Consider event sampling for high-frequency events
+
+### Reliability
+
+- Handle Kafka unavailability gracefully
+- Implement retry logic with exponential backoff
+- Log dropped events for debugging
+- Use idempotent consumers for Elasticsearch indexing
+
+## What's next? 
+ +- [Render farm monitoring concepts](/docs/concepts/render-farm-monitoring/) - Understand the monitoring architecture +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Configure dashboards and alerts +- [Contributing to OpenCue](/docs/developer-guide/contributing/) - Submit your changes diff --git a/docs/_docs/developer-guide/rest-gateway-development.md b/docs/_docs/developer-guide/rest-gateway-development.md index 54b8a45ba..25b8cd66a 100644 --- a/docs/_docs/developer-guide/rest-gateway-development.md +++ b/docs/_docs/developer-guide/rest-gateway-development.md @@ -1,6 +1,6 @@ --- title: "REST Gateway Development" -nav_order: 88 +nav_order: 94 parent: Developer Guide layout: default linkTitle: "Developing the OpenCue REST Gateway" diff --git a/docs/_docs/developer-guide/sandbox-testing.md b/docs/_docs/developer-guide/sandbox-testing.md index 7b22b0d29..616c4a55a 100644 --- a/docs/_docs/developer-guide/sandbox-testing.md +++ b/docs/_docs/developer-guide/sandbox-testing.md @@ -1,6 +1,6 @@ --- title: "Using the OpenCue Sandbox for Testing" -nav_order: 82 +nav_order: 88 parent: "Developer Guide" layout: default date: 2025-08-06 diff --git a/docs/_docs/getting-started/checking-out-the-source-code.md b/docs/_docs/getting-started/checking-out-the-source-code.md index 29ceac0a1..4973b4587 100644 --- a/docs/_docs/getting-started/checking-out-the-source-code.md +++ b/docs/_docs/getting-started/checking-out-the-source-code.md @@ -1,6 +1,6 @@ --- title: "Checking out the source code" -nav_order: 22 +nav_order: 24 parent: Getting Started layout: default linkTitle: "Checking out the source code" diff --git a/docs/_docs/getting-started/deploying-cuebot.md b/docs/_docs/getting-started/deploying-cuebot.md index af0b75992..3f18450cd 100644 --- a/docs/_docs/getting-started/deploying-cuebot.md +++ b/docs/_docs/getting-started/deploying-cuebot.md @@ -1,6 +1,6 @@ --- title: "Deploying Cuebot" -nav_order: 20 +nav_order: 22 parent: Getting Started layout: 
default linkTitle: "Deploying Cuebot" diff --git a/docs/_docs/getting-started/deploying-cueweb.md b/docs/_docs/getting-started/deploying-cueweb.md index c7dc81d24..65b90be01 100644 --- a/docs/_docs/getting-started/deploying-cueweb.md +++ b/docs/_docs/getting-started/deploying-cueweb.md @@ -2,7 +2,7 @@ layout: default title: Deploying CueWeb parent: Getting Started -nav_order: 28 +nav_order: 30 --- # Deploying CueWeb diff --git a/docs/_docs/getting-started/deploying-monitoring.md b/docs/_docs/getting-started/deploying-monitoring.md new file mode 100644 index 000000000..3566940c6 --- /dev/null +++ b/docs/_docs/getting-started/deploying-monitoring.md @@ -0,0 +1,362 @@ +--- +title: "Deploying the monitoring stack" +nav_order: 31 +parent: Getting Started +layout: default +linkTitle: "Deploying monitoring" +date: 2024-11-24 +description: > + Deploy the OpenCue render farm monitoring stack for production environments +--- + +# Deploying the monitoring stack + +### Deploy the OpenCue render farm monitoring stack for production environments + +--- + +This guide explains how to deploy the OpenCue monitoring stack components for production use. The monitoring system provides real-time metrics, event streaming, and historical data storage for your render farm. + +## Overview + +The OpenCue monitoring system consists of: + +| Component | Purpose | Required | +|-----------|---------|----------| +| **Kafka** | Event streaming for job, frame, and host events | Optional | +| **kafka-es-indexer** | Standalone Rust service that indexes Kafka events to Elasticsearch | Optional (required for ES) | +| **Elasticsearch** | Historical event storage and analysis | Optional | +| **Prometheus** | Real-time metrics collection | Optional | + +Each component can be enabled independently based on your monitoring requirements. 
+ +## System requirements + +### Kafka cluster + +- **Memory**: Minimum 4GB RAM per broker +- **Storage**: SSD recommended, size depends on retention period +- **Network**: Low-latency connection to Cuebot + +### Elasticsearch + +- **Memory**: Minimum 4GB RAM (8GB+ recommended for production) +- **Storage**: SSD recommended, plan for ~1KB per event +- **JVM**: Heap size should be 50% of available RAM (max 32GB) + +### Prometheus + +- **Memory**: 2GB minimum, scales with number of metrics +- **Storage**: SSD recommended, ~2 bytes per sample + +## Before you begin + +Ensure you have: + +- A working Cuebot deployment (see [Deploying Cuebot](/docs/getting-started/deploying-cuebot/)) +- Docker and Docker Compose (for containerized deployment) +- Network connectivity between Cuebot and monitoring services + +## Deployment options + +### Option 1: Docker Compose (recommended for testing) + +Use the provided Docker Compose file for a complete monitoring stack: + +```bash +cd /path/to/OpenCue +docker compose -f sandbox/docker-compose.monitoring-full.yml up -d +``` + +This starts all monitoring services with default configurations suitable for development and testing. + +### Option 2: Production deployment + +For production environments, deploy each component separately with appropriate configurations. + +#### Deploying Kafka + +1. Set up a Kafka cluster with Zookeeper (or use KRaft mode for Kafka 3.x+): + + ```bash + # Example using Docker + docker run -d --name zookeeper \ + -p 2181:2181 \ + confluentinc/cp-zookeeper:7.4.0 + + docker run -d --name kafka \ + -p 9092:9092 \ + -e KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181 \ + -e KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092 \ + confluentinc/cp-kafka:7.4.0 + ``` + +2. 
Create the required topics: + + ```bash + kafka-topics --bootstrap-server kafka:9092 --create \ + --topic opencue.job.events --partitions 3 --replication-factor 1 + + kafka-topics --bootstrap-server kafka:9092 --create \ + --topic opencue.frame.events --partitions 6 --replication-factor 1 + + kafka-topics --bootstrap-server kafka:9092 --create \ + --topic opencue.host.events --partitions 3 --replication-factor 1 + + kafka-topics --bootstrap-server kafka:9092 --create \ + --topic opencue.layer.events --partitions 3 --replication-factor 1 + + kafka-topics --bootstrap-server kafka:9092 --create \ + --topic opencue.proc.events --partitions 3 --replication-factor 1 + ``` + +#### Deploying Elasticsearch + +1. Deploy Elasticsearch: + + ```bash + docker run -d --name elasticsearch \ + -p 9200:9200 \ + -e discovery.type=single-node \ + -e xpack.security.enabled=false \ + -e "ES_JAVA_OPTS=-Xms4g -Xmx4g" \ + docker.elastic.co/elasticsearch/elasticsearch:8.8.0 + ``` + +2. Create index templates for OpenCue events: + + ```bash + curl -X PUT "localhost:9200/_index_template/opencue-events" \ + -H "Content-Type: application/json" \ + -d '{ + "index_patterns": ["opencue-*"], + "template": { + "settings": { + "number_of_shards": 1, + "number_of_replicas": 0 + }, + "mappings": { + "properties": { + "eventType": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "jobId": { "type": "keyword" }, + "jobName": { "type": "keyword" }, + "showName": { "type": "keyword" } + } + } + } + }' + ``` + +#### Deploying kafka-es-indexer + +The `kafka-es-indexer` is a standalone Rust service that consumes events from Kafka and indexes them into Elasticsearch. It runs separately from Cuebot. + +1. Build the Docker image (from OpenCue repository root): + + ```bash + cd rust + docker build -f crates/kafka-es-indexer/Dockerfile -t opencue/kafka-es-indexer . + ``` + +2. 
Run the indexer: + + ```bash + docker run -d --name kafka-es-indexer \ + --network your-network \ + -e KAFKA_BOOTSTRAP_SERVERS=kafka:9092 \ + -e KAFKA_GROUP_ID=opencue-elasticsearch-indexer \ + -e ELASTICSEARCH_URL=http://elasticsearch:9200 \ + -e ELASTICSEARCH_INDEX_PREFIX=opencue \ + opencue/kafka-es-indexer + ``` + + Or with CLI arguments: + + ```bash + docker run -d --name kafka-es-indexer \ + --network your-network \ + opencue/kafka-es-indexer \ + --kafka-servers kafka:9092 \ + --kafka-group-id opencue-elasticsearch-indexer \ + --elasticsearch-url http://elasticsearch:9200 \ + --index-prefix opencue + ``` + +3. Verify the indexer is running: + + ```bash + docker logs kafka-es-indexer + ``` + + You should see log messages indicating successful connection to Kafka and Elasticsearch. + +#### Deploying Prometheus + +1. Create a Prometheus configuration file (`prometheus.yml`): + + ```yaml + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: 'cuebot' + static_configs: + - targets: ['cuebot-host:8080'] + metrics_path: /metrics + ``` + +2. Deploy Prometheus: + + ```bash + docker run -d --name prometheus \ + -p 9090:9090 \ + -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \ + prom/prometheus:v2.45.0 + ``` + +#### Deploying Grafana (optional) + +1. Deploy Grafana for visualization: + + ```bash + docker run -d --name grafana \ + -p 3000:3000 \ + -e GF_SECURITY_ADMIN_PASSWORD=admin \ + grafana/grafana:10.0.0 + ``` + +2. Configure Prometheus as a data source in Grafana. + +3. Import the OpenCue dashboard from `sandbox/config/grafana/dashboards/opencue-monitoring.json`. + +## Configuring Cuebot + +Enable monitoring in Cuebot by adding configuration properties. Note that Elasticsearch indexing is handled by the standalone `kafka-es-indexer` service, not Cuebot. 
+ +### Using command-line arguments + +```bash +java -jar cuebot.jar \ + --datasource.cue-data-source.jdbc-url=jdbc:postgresql://db-host/cuebot \ + --datasource.cue-data-source.username=cuebot \ + --datasource.cue-data-source.password= \ + --monitoring.kafka.enabled=true \ + --monitoring.kafka.bootstrap.servers=kafka-host:9092 \ + --metrics.prometheus.collector=true +``` + +### Using environment variables + +```bash +export MONITORING_KAFKA_ENABLED=true +export MONITORING_KAFKA_BOOTSTRAP_SERVERS=kafka-host:9092 +export METRICS_PROMETHEUS_COLLECTOR=true +``` + +### Using application properties + +Add to `application.properties` or `opencue.properties`: + +```properties +# Kafka event publishing +monitoring.kafka.enabled=true +monitoring.kafka.bootstrap.servers=kafka-host:9092 + +# Prometheus metrics +metrics.prometheus.collector=true +``` + +## Verifying the deployment + +### Check Kafka topics + +```bash +kafka-topics --bootstrap-server kafka-host:9092 --list +``` + +Expected output includes: +``` +opencue.frame.events +opencue.host.events +opencue.job.events +opencue.layer.events +opencue.proc.events +``` + +### Check Prometheus targets + +Open Prometheus at `http://prometheus-host:9090/targets` and verify the Cuebot target shows status `UP`. + +### Check Cuebot metrics + +```bash +curl -s http://localhost:8080/metrics | grep -E "^cue_" +``` + +**Note:** Replace localhost with the Cuebot hostname or IP. 
+ +You should see metrics like: +``` +cue_frames_completed_total +cue_dispatch_waiting_total +cue_host_reports_received_total +``` + +### Check Elasticsearch indices + +```bash +curl http://elasticsearch-host:9200/_cat/indices/opencue-* +``` + +## Security considerations + +### Kafka security + +For production deployments, configure: + +- **SSL/TLS encryption** for data in transit +- **SASL authentication** for client authentication +- **ACLs** to restrict topic access + +### Elasticsearch security + +Enable X-Pack security features: + +- **Authentication** for API access +- **TLS** for transport and HTTP layers +- **Role-based access control** for indices + +### Prometheus security + +- Use **basic authentication** or **OAuth** for the web UI +- Configure **TLS** for scrape endpoints +- Use **network policies** to restrict access + +## Troubleshooting + +### Cuebot fails to connect to Kafka + +1. Verify Kafka is running: `kafka-broker-api-versions --bootstrap-server kafka-host:9092` +2. Check network connectivity from Cuebot to Kafka +3. Verify the bootstrap servers configuration matches your Kafka deployment + +### Events not appearing in Elasticsearch + +1. Check kafka-es-indexer logs: `docker logs kafka-es-indexer` +2. Verify Elasticsearch is healthy: `curl http://elasticsearch-host:9200/_cluster/health` +3. Verify kafka-es-indexer is connected to Kafka and consuming messages +4. Check that indices are being created: `curl http://elasticsearch-host:9200/_cat/indices/opencue-*` + +### Prometheus not scraping metrics + +1. Verify the metrics endpoint is accessible: `curl http://cuebot-host:8080/metrics` +2. Check Prometheus configuration for correct target address +3. Review Prometheus logs for scrape errors + +## What's next? 
+ +- [Render farm monitoring concepts](/docs/concepts/render-farm-monitoring/) - Understand the monitoring architecture +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Configure dashboards and alerts +- [Monitoring reference](/docs/reference/monitoring-reference/) - Complete configuration reference diff --git a/docs/_docs/getting-started/deploying-rest-gateway.md b/docs/_docs/getting-started/deploying-rest-gateway.md index 7b1d9baa8..a0ac7f97c 100644 --- a/docs/_docs/getting-started/deploying-rest-gateway.md +++ b/docs/_docs/getting-started/deploying-rest-gateway.md @@ -1,6 +1,6 @@ --- title: "Deploying OpenCue REST Gateway" -nav_order: 27 +nav_order: 29 parent: Getting Started layout: default linkTitle: "Deploying REST Gateway" diff --git a/docs/_docs/getting-started/deploying-rqd.md b/docs/_docs/getting-started/deploying-rqd.md index f51f875ed..4b299c79f 100644 --- a/docs/_docs/getting-started/deploying-rqd.md +++ b/docs/_docs/getting-started/deploying-rqd.md @@ -1,6 +1,6 @@ --- title: "Deploying RQD" -nav_order: 21 +nav_order: 23 parent: Getting Started layout: default linkTitle: "Deploying RQD" diff --git a/docs/_docs/getting-started/index.md b/docs/_docs/getting-started/index.md index 9df323c7e..e02a89a5e 100644 --- a/docs/_docs/getting-started/index.md +++ b/docs/_docs/getting-started/index.md @@ -1,7 +1,7 @@ --- layout: default title: Getting Started -nav_order: 18 +nav_order: 20 has_children: true permalink: /docs/getting-started --- diff --git a/docs/_docs/getting-started/installing-cueadmin.md b/docs/_docs/getting-started/installing-cueadmin.md index b4b81d317..120828dcc 100644 --- a/docs/_docs/getting-started/installing-cueadmin.md +++ b/docs/_docs/getting-started/installing-cueadmin.md @@ -1,6 +1,6 @@ --- title: "Installing CueAdmin" -nav_order: 24 +nav_order: 26 parent: Getting Started layout: default linkTitle: "Installing CueAdmin" diff --git a/docs/_docs/getting-started/installing-cuegui.md 
b/docs/_docs/getting-started/installing-cuegui.md index 92aa3190d..5fab6ce58 100644 --- a/docs/_docs/getting-started/installing-cuegui.md +++ b/docs/_docs/getting-started/installing-cuegui.md @@ -1,6 +1,6 @@ --- title: "Installing CueGUI" -nav_order: 25 +nav_order: 27 parent: Getting Started layout: default linkTitle: "Installing CueGUI" diff --git a/docs/_docs/getting-started/installing-cuesubmit.md b/docs/_docs/getting-started/installing-cuesubmit.md index 00bceb4a2..89c6d3dd3 100644 --- a/docs/_docs/getting-started/installing-cuesubmit.md +++ b/docs/_docs/getting-started/installing-cuesubmit.md @@ -1,6 +1,6 @@ --- title: "Installing CueSubmit" -nav_order: 26 +nav_order: 28 parent: Getting Started layout: default linkTitle: "Installing CueSubmit" diff --git a/docs/_docs/getting-started/installing-pycue-and-pyoutline.md b/docs/_docs/getting-started/installing-pycue-and-pyoutline.md index 67cd0f7cf..c08264a49 100644 --- a/docs/_docs/getting-started/installing-pycue-and-pyoutline.md +++ b/docs/_docs/getting-started/installing-pycue-and-pyoutline.md @@ -1,6 +1,6 @@ --- title: "Installing PyCue and PyOutline" -nav_order: 23 +nav_order: 25 parent: Getting Started layout: default linkTitle: "Installing PyCue and PyOutline" diff --git a/docs/_docs/getting-started/setting-up-the-database.md b/docs/_docs/getting-started/setting-up-the-database.md index 1df46545f..d51581fe7 100644 --- a/docs/_docs/getting-started/setting-up-the-database.md +++ b/docs/_docs/getting-started/setting-up-the-database.md @@ -1,6 +1,6 @@ --- title: "Setting up the database" -nav_order: 19 +nav_order: 21 parent: Getting Started layout: default linkTitle: "Setting up the database" diff --git a/docs/_docs/other-guides/applying-database-migrations.md b/docs/_docs/other-guides/applying-database-migrations.md index 16e1ad4f9..3c9e798a4 100644 --- a/docs/_docs/other-guides/applying-database-migrations.md +++ b/docs/_docs/other-guides/applying-database-migrations.md @@ -2,7 +2,7 @@ title: "Applying 
database migrations" layout: default parent: Other Guides -nav_order: 44 +nav_order: 48 linkTitle: "Applying database migrations" date: 2019-08-22 description: > diff --git a/docs/_docs/other-guides/configuring-limits.md b/docs/_docs/other-guides/configuring-limits.md index df5214342..0a44c20ed 100644 --- a/docs/_docs/other-guides/configuring-limits.md +++ b/docs/_docs/other-guides/configuring-limits.md @@ -2,7 +2,7 @@ title: "Configuring limits" layout: default parent: Other Guides -nav_order: 42 +nav_order: 46 linkTitle: "Configuring limits" date: 2020-03-26 description: > diff --git a/docs/_docs/other-guides/configuring-opencue.md b/docs/_docs/other-guides/configuring-opencue.md index 965582c5b..f8401fa92 100644 --- a/docs/_docs/other-guides/configuring-opencue.md +++ b/docs/_docs/other-guides/configuring-opencue.md @@ -2,7 +2,7 @@ title: "Configuring OpenCue" layout: default parent: Other Guides -nav_order: 41 +nav_order: 45 linkTitle: "Configuring OpenCue" date: 2023-01-26 description: > diff --git a/docs/_docs/other-guides/containerized_frames.md b/docs/_docs/other-guides/containerized_frames.md index f739ff189..794fb5f5e 100644 --- a/docs/_docs/other-guides/containerized_frames.md +++ b/docs/_docs/other-guides/containerized_frames.md @@ -2,7 +2,7 @@ title: "Running RQD at Docker mode" layout: default parent: Other Guides -nav_order: 48 +nav_order: 52 linkTitle: "Running RQD at Docker mode" date: 2024-11-06 description: > diff --git a/docs/_docs/other-guides/cueweb.md b/docs/_docs/other-guides/cueweb.md index 19bc0896e..eef391a3e 100644 --- a/docs/_docs/other-guides/cueweb.md +++ b/docs/_docs/other-guides/cueweb.md @@ -2,7 +2,7 @@ title: "CueWeb System" layout: default parent: Other Guides -nav_order: 52 +nav_order: 56 linkTitle: "CueWeb system" date: 2025-02-04 description: > diff --git a/docs/_docs/other-guides/customizing-rqd.md b/docs/_docs/other-guides/customizing-rqd.md index ae25a3057..2f14cc8aa 100644 --- a/docs/_docs/other-guides/customizing-rqd.md 
+++ b/docs/_docs/other-guides/customizing-rqd.md @@ -2,7 +2,7 @@ title: "Customizing RQD rendering hosts" layout: default parent: Other Guides -nav_order: 43 +nav_order: 47 linkTitle: "Customizing RQD rendering hosts" date: 2019-12-10 description: > diff --git a/docs/_docs/other-guides/deploying-rest-gateway.md b/docs/_docs/other-guides/deploying-rest-gateway.md index 06c3fe336..5b0229ccf 100644 --- a/docs/_docs/other-guides/deploying-rest-gateway.md +++ b/docs/_docs/other-guides/deploying-rest-gateway.md @@ -1,6 +1,6 @@ --- title: "Deploying REST Gateway" -nav_order: 51 +nav_order: 55 parent: Other Guides layout: default linkTitle: "Deploying the OpenCue REST Gateway" diff --git a/docs/_docs/other-guides/desktop-rendering-control.md b/docs/_docs/other-guides/desktop-rendering-control.md index bafeea5f0..a709fe29c 100644 --- a/docs/_docs/other-guides/desktop-rendering-control.md +++ b/docs/_docs/other-guides/desktop-rendering-control.md @@ -1,6 +1,6 @@ --- title: "Desktop rendering control" -nav_order: 50 +nav_order: 54 parent: Other Guides layout: default linkTitle: "Desktop rendering control" diff --git a/docs/_docs/other-guides/framelogging-with-loki.md b/docs/_docs/other-guides/framelogging-with-loki.md index a6a355a57..a53d01a85 100644 --- a/docs/_docs/other-guides/framelogging-with-loki.md +++ b/docs/_docs/other-guides/framelogging-with-loki.md @@ -2,7 +2,7 @@ title: "Configuring OpenCue with Loki for framelogs" layout: default parent: Other Guides -nav_order: 49 +nav_order: 53 linkTitle: "Configuring OpenCue with Loki for framelogs" date: 2024-11-27 description: > diff --git a/docs/_docs/other-guides/index.md b/docs/_docs/other-guides/index.md index e97b093cf..b2b1d7247 100644 --- a/docs/_docs/other-guides/index.md +++ b/docs/_docs/other-guides/index.md @@ -1,7 +1,7 @@ --- layout: default title: Other Guides -nav_order: 40 +nav_order: 44 has_children: true permalink: /docs/other-guides --- diff --git 
a/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md b/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md index 2faae1717..8cbb60f4f 100644 --- a/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md +++ b/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md @@ -2,7 +2,7 @@ title: "Monitoring with Prometheus, Loki, and Grafana" layout: default parent: Other Guides -nav_order: 47 +nav_order: 51 linkTitle: "Monitoring with Prometheus, Loki, and Grafana" date: 2021-08-01 description: > diff --git a/docs/_docs/other-guides/troubleshooting-deployment.md b/docs/_docs/other-guides/troubleshooting-deployment.md index 5656c0d5b..c0059e746 100644 --- a/docs/_docs/other-guides/troubleshooting-deployment.md +++ b/docs/_docs/other-guides/troubleshooting-deployment.md @@ -2,7 +2,7 @@ title: "Troubleshooting deployment" layout: default parent: Other Guides -nav_order: 45 +nav_order: 49 linkTitle: "Troubleshooting deployment" date: 2019-02-22 description: > diff --git a/docs/_docs/other-guides/troubleshooting-rendering.md b/docs/_docs/other-guides/troubleshooting-rendering.md index 46b98e7c2..0db6d1591 100644 --- a/docs/_docs/other-guides/troubleshooting-rendering.md +++ b/docs/_docs/other-guides/troubleshooting-rendering.md @@ -2,7 +2,7 @@ title: "Troubleshooting rendering" layout: default parent: Other Guides -nav_order: 46 +nav_order: 50 linkTitle: "Troubleshooting rendering" date: 2019-02-22 description: > diff --git a/docs/_docs/quick-starts/quick-start-monitoring.md b/docs/_docs/quick-starts/quick-start-monitoring.md new file mode 100644 index 000000000..fa643723a --- /dev/null +++ b/docs/_docs/quick-starts/quick-start-monitoring.md @@ -0,0 +1,318 @@ +--- +title: "Quick start for OpenCue monitoring stack" +nav_order: 9 +parent: Quick Starts +layout: default +linkTitle: "Quick start for monitoring" +date: 2024-11-24 +description: > + Deploy the OpenCue monitoring stack with Kafka, Elasticsearch, 
Prometheus, and Grafana +--- + +# Quick start for OpenCue monitoring stack + +### Deploy the OpenCue monitoring stack + +--- + +This guide walks you through deploying the OpenCue monitoring stack, which provides real-time metrics, event streaming, and historical data storage for your render farm. + +## Before you begin + +Ensure you have the following: + +- A working OpenCue sandbox environment (see [Using the OpenCue Sandbox for Testing](/docs/developer-guide/sandbox-testing/)) +- Docker and Docker Compose installed +- At least 8GB of available RAM for the monitoring services + +## Monitoring stack components + +The monitoring stack includes: + +| Component | Purpose | Port | +|-----------|---------|------| +| **Kafka** | Event streaming | 9092 | +| **Zookeeper** | Kafka coordination | 2181 | +| **kafka-es-indexer** | Kafka to Elasticsearch indexer (Rust) | - | +| **Elasticsearch** | Historical data storage | 9200 | +| **Prometheus** | Metrics collection | 9090 | +| **Grafana** | Dashboards and visualization | 3000 | +| **Kafka UI** | Kafka topic browser | 8090 | +| **Kibana** | Elasticsearch visualization | 5601 | + +## Step-by-step setup + +### Step 1: Start the monitoring stack + +From the OpenCue repository root, start the full monitoring stack: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml up -d +``` + +This command starts all monitoring services along with Cuebot configured to publish events. + +Wait for all services to become healthy: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml ps +``` + +All containers should show status `Up` or `healthy`. + +### Step 2: Verify Kafka topics + +Check that Kafka topics were created: + +```bash +docker exec opencue-kafka kafka-topics --bootstrap-server localhost:29092 --list +``` + +You should see: + +``` +opencue.frame.events +opencue.host.events +opencue.job.events +opencue.layer.events +opencue.proc.events +``` + +### Step 3: Access Grafana + +1. 
Open Grafana at [http://localhost:3000](http://localhost:3000) +2. Log in with: + - Username: `admin` + - Password: `admin` +3. Navigate to **Dashboards** to find the pre-configured OpenCue monitoring dashboard + +![OpenCue Monitoring Grafana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png) + +### Step 4: Verify Prometheus metrics + +1. Open Prometheus at [http://localhost:9090](http://localhost:9090) +2. Navigate to **Status** > **Targets** +3. Verify that the `cuebot` target shows status `UP` + +![Prometheus Metrics Interface](/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png) + +You can also query metrics directly: + +```bash +curl -s http://localhost:8080/metrics | grep cue_ +``` + +### Step 5: Browse Kafka events + +1. Open Kafka UI at [http://localhost:8090](http://localhost:8090) +2. Click on the `opencue` cluster +3. Browse topics to see events as they are published + +![Kafka UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +## Testing the monitoring system + +### Generate test events + +Submit a test job to generate monitoring events. 
+ +**Option A: Using cuecmd** + +```bash +# Create a command file +echo "echo Hello from monitoring test" > /tmp/test_commands.txt + +# Submit the job +cuecmd /tmp/test_commands.txt --show testing --job-name monitoring_test +``` + +**Option B: Using PyOutline** + +```bash +# Install pycue if not already installed +pip install ./pycue ./pyoutline + +# Submit a test job +python -c " +import outline +from outline.modules.shell import Shell + +ol = outline.Outline('monitoring_test_job', shot='testshot', show='testing') +layer = Shell('test_layer', command=['/bin/echo', 'Hello from monitoring test'], range='1-1') +ol.add_layer(layer) +outline.cuerun.launch(ol, use_pycuerun=False) +" +``` + +### View events in real-time + +Watch Kafka events as jobs execute: + +```bash +docker exec opencue-kafka kafka-console-consumer \ + --bootstrap-server localhost:29092 \ + --topic opencue.frame.events \ + --from-beginning +``` + +Or use the Python consumer script (requires lz4 for decompression): + +```bash +pip install kafka-python lz4 +python sandbox/monitor_events.py +``` + +### Query Prometheus metrics + +Open Prometheus at [http://localhost:9090](http://localhost:9090) and try these queries: + +- `cue_frames_completed_total` - Completed frames by state +- `cue_jobs_completed_total` - Completed jobs by show +- `rate(cue_host_reports_received_total[5m])` - Host report rate + +## Grafana dashboard panels + +The pre-configured dashboard includes: + +- **Frames Completed (5m)**: Real-time frame completion by state (DEAD, SUCCEEDED, WAITING) +- **Jobs Completed by Show (5m)**: Jobs completed per show +- **Frame Runtime Distribution**: P50 and P95 frame execution times +- **Frame Memory Usage Distribution**: Memory consumption distribution +- **Host Reports Received (5m)**: Host reporting activity by facility + +## Accessing monitoring components + +### Grafana - Dashboards and Visualization + +**URL:** [http://localhost:3000](http://localhost:3000) + +**Login:** admin / admin + 
+Grafana provides pre-configured dashboards for monitoring your render farm: + +1. Navigate to **Dashboards** > **OpenCue Monitoring Dashboard** +2. View real-time metrics for frames, jobs, and hosts +3. Create custom dashboards using Prometheus as the data source + +### Prometheus - Metrics Collection + +**URL:** [http://localhost:9090](http://localhost:9090) + +Prometheus collects and stores time-series metrics from Cuebot: + +1. Navigate to **Status** > **Targets** to verify Cuebot is being scraped +2. Use the **Graph** tab to query metrics: + - `cue_frames_completed_total` - Frames by state + - `cue_jobs_completed_total` - Jobs by show + - `cue_host_reports_received_total` - Host reports received +3. Navigate to **Status** > **Configuration** to view scrape settings + +### Kafka UI - Event Stream Browser + +**URL:** [http://localhost:8090](http://localhost:8090) + +Kafka UI allows you to browse event topics and messages: + +1. Click on the **opencue** cluster +2. Navigate to **Topics** to see all event topics: + - `opencue.frame.events` - Frame lifecycle events + - `opencue.job.events` - Job lifecycle events + - `opencue.layer.events` - Layer lifecycle events + - `opencue.host.events` - Host status events + - `opencue.proc.events` - Proc allocation events +3. 
Click on a topic and select **Messages** to view events in real-time + +### Elasticsearch - Historical Data Storage + +**URL:** [http://localhost:9200](http://localhost:9200) + +Elasticsearch stores historical event data for analysis: + +```bash +# Check cluster health +curl http://localhost:9200/_cluster/health?pretty + +# List indices +curl http://localhost:9200/_cat/indices?v + +# Count events +curl http://localhost:9200/opencue-*/_count + +# Search for failed frames +curl -X GET "http://localhost:9200/opencue-*/_search?pretty" -H 'Content-Type: application/json' -d' +{ + "query": { + "match": { + "header.event_type": "FRAME_FAILED" + } + }, + "size": 5 +}' +``` + +### Kibana - Elasticsearch Visualization + +**URL:** [http://localhost:5601](http://localhost:5601) + +Kibana provides a UI for exploring Elasticsearch data: + +1. Navigate to **Management** > **Stack Management** > **Index Patterns** +2. Create an index pattern: `opencue-*` +3. Select `header.timestamp` as the time field (format: epoch_millis) +4. Navigate to **Discover** to explore events +5. 
Use KQL queries: + - `header.event_type: "FRAME_FAILED"` - Find failed frames + - `job_name: "test*"` - Find events for jobs matching pattern + - `header.event_type: "FRAME_COMPLETED" AND run_time > 3600` - Long-running frames + +![Kibana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png) + +![Kibana Dev Tools](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png) + +## Stopping the monitoring stack + +To stop all monitoring services: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml down +``` + +To stop and remove all data volumes: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml down -v +``` + +## Troubleshooting + +### Cuebot fails to start + +Check Cuebot logs for errors: + +```bash +docker logs opencue-cuebot +``` + +Common issues: +- Kafka not ready: Ensure Zookeeper and Kafka are healthy before Cuebot starts +- Elasticsearch connection: Verify Elasticsearch is accessible + +### No metrics in Prometheus + +1. Verify Cuebot exposes metrics: `curl http://localhost:8080/metrics` +2. Check Prometheus targets: Navigate to **Status** > **Targets** in Prometheus +3. Verify the Prometheus configuration file mounts correctly + +### Kafka topics not created + +Topics are auto-created when Cuebot publishes the first event. If topics are missing: + +1. Check Cuebot logs for Kafka connection errors +2. Verify Kafka is healthy: `docker logs opencue-kafka` +3. Ensure `KAFKA_AUTO_CREATE_TOPICS_ENABLE` is set to `true` + +## What's next? 
+ +- [Render farm monitoring concepts](/docs/concepts/render-farm-monitoring/) - Learn about the monitoring architecture +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Configure alerts and custom dashboards +- [Monitoring developer guide](/docs/developer-guide/monitoring-development/) - Extend the monitoring system diff --git a/docs/_docs/reference/CueGUI-app.md b/docs/_docs/reference/CueGUI-app.md index 78ab17095..5a8481b8c 100644 --- a/docs/_docs/reference/CueGUI-app.md +++ b/docs/_docs/reference/CueGUI-app.md @@ -2,7 +2,7 @@ title: "CueGUI app" layout: default parent: Reference -nav_order: 54 +nav_order: 58 linkTitle: "CueGUI app" date: 2019-02-22 description: > diff --git a/docs/_docs/reference/commands/cueadmin.md b/docs/_docs/reference/commands/cueadmin.md index 3fdac8266..ab37f361d 100644 --- a/docs/_docs/reference/commands/cueadmin.md +++ b/docs/_docs/reference/commands/cueadmin.md @@ -2,7 +2,7 @@ title: "cueadmin command" layout: default parent: Reference -nav_order: 55 +nav_order: 59 linkTitle: "cueadmin command" date: 2025-08-11 description: > diff --git a/docs/_docs/reference/commands/pycuerun.md b/docs/_docs/reference/commands/pycuerun.md index 130afa594..805ec0820 100644 --- a/docs/_docs/reference/commands/pycuerun.md +++ b/docs/_docs/reference/commands/pycuerun.md @@ -2,7 +2,7 @@ title: "pycuerun command" layout: default parent: Reference -nav_order: 56 +nav_order: 60 linkTitle: "pycuerun command" date: 2019-05-23 description: > diff --git a/docs/_docs/reference/cuecommander-technical-reference.md b/docs/_docs/reference/cuecommander-technical-reference.md index ef9623f9c..28c8734e3 100644 --- a/docs/_docs/reference/cuecommander-technical-reference.md +++ b/docs/_docs/reference/cuecommander-technical-reference.md @@ -2,7 +2,7 @@ title: "CueCommander Technical Reference" layout: default parent: Reference -nav_order: 58 +nav_order: 62 linkTitle: "CueCommander Technical Reference" date: 2025-01-13 description: > diff --git 
a/docs/_docs/reference/filter-actions-reference.md b/docs/_docs/reference/filter-actions-reference.md index b4619f197..993c4d4a4 100644 --- a/docs/_docs/reference/filter-actions-reference.md +++ b/docs/_docs/reference/filter-actions-reference.md @@ -1,6 +1,6 @@ --- title: "Filter Actions Reference" -nav_order: 64 +nav_order: 68 parent: Reference layout: default date: 2025-10-15 diff --git a/docs/_docs/reference/index.md b/docs/_docs/reference/index.md index 4b9219661..70b953a78 100644 --- a/docs/_docs/reference/index.md +++ b/docs/_docs/reference/index.md @@ -1,7 +1,7 @@ --- layout: default title: Reference -nav_order: 53 +nav_order: 57 has_children: true permalink: /docs/reference --- diff --git a/docs/_docs/reference/monitoring-reference.md b/docs/_docs/reference/monitoring-reference.md new file mode 100644 index 000000000..4f2e9ea50 --- /dev/null +++ b/docs/_docs/reference/monitoring-reference.md @@ -0,0 +1,471 @@ +--- +title: "Monitoring system reference" +nav_order: 70 +parent: Reference +layout: default +linkTitle: "Monitoring reference" +date: 2024-11-24 +description: > + Complete reference for the OpenCue monitoring system +--- + +# Monitoring system reference + +### Complete reference for the OpenCue monitoring system + +--- + +This reference provides comprehensive documentation for all monitoring system components, configuration options, and APIs. 
+ +## Component access + +| Component | Purpose | URL | Port | +|-----------|---------|-----|------| +| **Grafana** | Dashboards and visualization | [http://localhost:3000](http://localhost:3000) | 3000 | +| **Prometheus** | Metrics collection and querying | [http://localhost:9090](http://localhost:9090) | 9090 | +| **Kafka UI** | Event stream browser | [http://localhost:8090](http://localhost:8090) | 8090 | +| **Kibana** | Elasticsearch visualization | [http://localhost:5601](http://localhost:5601) | 5601 | +| **Elasticsearch** | Historical data storage | [http://localhost:9200](http://localhost:9200) | 9200 | +| **Kafka** | Event streaming broker | localhost:9092 | 9092 | +| **Zookeeper** | Kafka coordination | localhost:2181 | 2181 | +| **Cuebot Metrics** | Prometheus metrics endpoint | [http://localhost:8080/metrics](http://localhost:8080/metrics) | 8080 | + +### Prometheus Metrics Interface + +![Prometheus Metrics Interface](/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png) + +### OpenCue Monitoring Grafana Dashboard + +![OpenCue Monitoring Grafana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png) + + +## Kafka topics + +### Topic specifications + +| Topic | Partition Key | Description | +|-------|---------------|-------------| +| `opencue.job.events` | `jobId` | Job lifecycle events | +| `opencue.layer.events` | `layerId` | Layer state changes | +| `opencue.frame.events` | `frameId` | Frame execution events | +| `opencue.host.events` | `hostId` | Host state changes | +| `opencue.proc.events` | `procId` | Process allocation events | + +![Kafka UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +### Event types + +#### Job events + +| Event Type | Description | Trigger | +|------------|-------------|---------| +| `JOB_CREATED` | Job submitted to queue | Job submission | +| `JOB_STARTED` | First frame dispatched | Frame dispatch | +| `JOB_FINISHED` | All frames 
complete | Last frame completion | +| `JOB_KILLED` | Job manually terminated | User action | +| `JOB_PAUSED` | Job paused | User action | +| `JOB_RESUMED` | Job resumed | User action | + +#### Layer events + +| Event Type | Description | Trigger | +|------------|-------------|---------| +| `LAYER_STARTED` | First frame of layer dispatched | Frame dispatch | +| `LAYER_FINISHED` | All frames in layer complete | Last frame completion | + +#### Frame events + +| Event Type | Description | Trigger | +|------------|-------------|---------| +| `FRAME_STARTED` | Frame began rendering | RQD reports start | +| `FRAME_COMPLETED` | Frame finished successfully | RQD reports completion | +| `FRAME_FAILED` | Frame failed with error | RQD reports failure | +| `FRAME_RETRIED` | Failed frame requeued | Automatic retry | +| `FRAME_EATEN` | Frame marked complete without rendering | User action | + +#### Host events + +| Event Type | Description | Trigger | +|------------|-------------|---------| +| `HOST_UP` | Host came online | RQD registration | +| `HOST_DOWN` | Host went offline | Heartbeat timeout | +| `HOST_LOCKED` | Host locked for maintenance | User action | +| `HOST_UNLOCKED` | Host unlocked | User action | +| `HOST_NIMBY_LOCKED` | Host entered NIMBY mode | NIMBY activation | +| `HOST_NIMBY_UNLOCKED` | Host exited NIMBY mode | NIMBY deactivation | + +#### Proc events + +| Event Type | Description | Trigger | +|------------|-------------|---------| +| `PROC_ASSIGNED` | Process allocated to frame | Dispatch | +| `PROC_UNASSIGNED` | Process deallocated | Frame completion/failure | + +## Event payload schemas + +All events include a `header` field with common metadata, plus event-specific fields at the top level. 
+ +### Event header + +```json +{ + "header": { + "event_id": "f533d84a-1586-4980-8c5e-3443376425c9", + "event_type": "FRAME_COMPLETED", + "timestamp": "1764097486229", + "source_cuebot": "cuebot-01", + "correlation_id": "fa7bbb9a-cae1-4f6b-a50a-88a9ac349d24" + } +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `event_id` | string | Unique identifier for this event | +| `event_type` | string | Type of event (e.g., FRAME_COMPLETED, JOB_FINISHED) | +| `timestamp` | string | Unix timestamp in milliseconds | +| `source_cuebot` | string | Hostname of the Cuebot that generated the event | +| `correlation_id` | string | ID linking related events (typically the job ID) | + +### Job event payload + +```json +{ + "header": { + "event_id": "550e8400-e29b-41d4-a716-446655440000", + "event_type": "JOB_FINISHED", + "timestamp": "1732446600000", + "source_cuebot": "cuebot-01", + "correlation_id": "550e8400-e29b-41d4-a716-446655440000" + }, + "job_id": "550e8400-e29b-41d4-a716-446655440000", + "job_name": "show-shot-user_render_v001", + "show": "show", + "show_id": "550e8400-e29b-41d4-a716-446655440001", + "facility": "cloud", + "group_name": "render", + "user": "artist", + "state": "FINISHED", + "is_paused": false, + "is_auto_eat": false, + "start_time": "1732443000000", + "stop_time": "1732446600000", + "frame_count": 100, + "layer_count": 2, + "pending_frames": 0, + "running_frames": 0, + "dead_frames": 0, + "succeeded_frames": 100 +} +``` + +### Frame event payload + +```json +{ + "header": { + "event_id": "f533d84a-1586-4980-8c5e-3443376425c9", + "event_type": "FRAME_COMPLETED", + "timestamp": "1764097486229", + "source_cuebot": "cuebot-01", + "correlation_id": "fa7bbb9a-cae1-4f6b-a50a-88a9ac349d24" + }, + "frame_id": "fa18c460-0e92-49e1-8d6a-e26473ac2708", + "frame_name": "0001-render", + "frame_number": 1, + "layer_id": "53ec9034-b16b-4cc2-9eec-05f68b1848bf", + "layer_name": "render", + "job_id": "fa7bbb9a-cae1-4f6b-a50a-88a9ac349d24", + 
"job_name": "show-shot-user_render_v001", + "show": "show", + "state": "SUCCEEDED", + "previous_state": "RUNNING", + "exit_status": 0, + "exit_signal": 0, + "retry_count": 0, + "dispatch_order": 0, + "start_time": "1764097475839", + "stop_time": "1764097486233", + "run_time": 3600, + "llu_time": "1764097476", + "max_rss": "8589934592", + "used_memory": "8589934592", + "reserved_memory": "262144", + "max_gpu_memory": "0", + "used_gpu_memory": "0", + "reserved_gpu_memory": "0", + "num_cores": 8, + "num_gpus": 0, + "host_name": "render-node-01", + "resource_id": "357516fe-4d34-447f-b1cd-41779102b6e3", + "checkpoint_state": "DISABLED", + "checkpoint_count": 0, + "total_core_time": 0, + "total_gpu_time": 0, + "reason": "", + "killed_by": "" +} +``` + +## Prometheus metrics + +### Job and frame metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_jobs_completed_total` | Counter | `show` | Total jobs completed | +| `cue_frames_completed_total` | Counter | `state`, `show` | Total frames completed | +| `cue_frame_runtime_seconds` | Histogram | `show` | Frame execution time distribution | +| `cue_frame_memory_bytes` | Histogram | `show` | Frame memory usage distribution | +| `cue_frame_kill_failure_counter_total` | Counter | - | Frames that failed to be killed | + +### Queue metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_dispatch_waiting_total` | Gauge | - | Tasks waiting in dispatch queue | +| `cue_dispatch_threads_total` | Gauge | - | Active dispatch threads | +| `cue_dispatch_executed_total` | Gauge | - | Dispatch tasks executed | +| `cue_dispatch_rejected_total` | Gauge | - | Dispatch tasks rejected | +| `cue_dispatch_remaining_capacity_total` | Gauge | - | Dispatch queue remaining capacity | +| `cue_booking_waiting_total` | Gauge | - | Tasks waiting in booking queue | +| `cue_booking_threads_total` | Gauge | - | Active booking threads | +| 
`cue_booking_remaining_capacity_total` | Gauge | - | Booking queue remaining capacity | +| `cue_manage_waiting_total` | Gauge | - | Tasks waiting in manage queue | +| `cue_report_executed_total` | Gauge | - | Host reports processed | +| `cue_report_rejected_total` | Gauge | - | Host reports rejected | + +### Host metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_host_reports_received_total` | Counter | `facility` | Host reports received | + +### Query metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_find_jobs_by_show_count_total` | Counter | - | FIND_JOBS_BY_SHOW query count | +| `cue_booking_durations_histogram_in_millis` | Histogram | - | Booking step durations | + +## Configuration reference + +### Kafka configuration + +```properties +# Enable/disable Kafka event publishing +monitoring.kafka.enabled=false + +# Kafka broker addresses (comma-separated for multiple brokers) +monitoring.kafka.bootstrap.servers=localhost:9092 + +# Event queue configuration +monitoring.kafka.queue.capacity=1000 + +# Producer configuration +monitoring.kafka.batch.size=16384 +monitoring.kafka.linger.ms=100 +monitoring.kafka.buffer.memory=33554432 +monitoring.kafka.acks=1 +monitoring.kafka.retries=3 +monitoring.kafka.retry.backoff.ms=100 + +# Compression +monitoring.kafka.compression.type=lz4 +``` + +### kafka-es-indexer configuration + +Elasticsearch indexing is handled by the standalone `kafka-es-indexer` service (located at `rust/crates/kafka-es-indexer/`). 
It can be configured via environment variables or CLI arguments: + +| CLI Argument | Env Variable | Default | Description | +|--------------|--------------|---------|-------------| +| `--kafka-servers` | `KAFKA_BOOTSTRAP_SERVERS` | `localhost:9092` | Kafka broker addresses | +| `--kafka-group-id` | `KAFKA_GROUP_ID` | `opencue-elasticsearch-indexer` | Consumer group ID | +| `--elasticsearch-url` | `ELASTICSEARCH_URL` | `http://localhost:9200` | Elasticsearch URL | +| `--index-prefix` | `ELASTICSEARCH_INDEX_PREFIX` | `opencue` | Elasticsearch index prefix | +| `--log-level` | `LOG_LEVEL` | `info` | Log level (debug, info, warn, error) | + +Example using environment variables: + +```bash +export KAFKA_BOOTSTRAP_SERVERS=kafka:9092 +export ELASTICSEARCH_URL=http://elasticsearch:9200 +export ELASTICSEARCH_INDEX_PREFIX=opencue +kafka-es-indexer +``` + +Example using CLI arguments: + +```bash +kafka-es-indexer \ + --kafka-servers kafka:9092 \ + --elasticsearch-url http://elasticsearch:9200 \ + --index-prefix opencue +``` + +### Prometheus configuration + +```properties +# Enable/disable Prometheus metrics endpoint +metrics.prometheus.collector=false + +# Histogram bucket configuration +metrics.prometheus.frame.runtime.buckets=1,5,10,30,60,300,600,1800,3600,7200 +metrics.prometheus.frame.memory.buckets=1073741824,2147483648,4294967296,8589934592,17179869184 +``` + +## Elasticsearch indices + +### Index naming convention + +``` +{prefix}-{event-category}-{date} +``` + +Examples: +- `opencue-job-events-2024.11.24` +- `opencue-frame-events-2024.11.24` +- `opencue-host-events-2024.11.24` + +### Index mappings + +#### Job events index + +```json +{ + "mappings": { + "properties": { + "eventType": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "source": { "type": "keyword" }, + "jobId": { "type": "keyword" }, + "jobName": { "type": "keyword" }, + "showName": { "type": "keyword" }, + "facilityName": { "type": "keyword" }, + "userName": { "type": "keyword" }, + 
"state": { "type": "keyword" }, + "frameCount": { "type": "integer" }, + "layerCount": { "type": "integer" }, + "runtime": { "type": "long" } + } + } +} +``` + +#### Frame events index + +```json +{ + "mappings": { + "properties": { + "eventType": { "type": "keyword" }, + "timestamp": { "type": "date" }, + "frameId": { "type": "keyword" }, + "frameName": { "type": "keyword" }, + "jobId": { "type": "keyword" }, + "jobName": { "type": "keyword" }, + "layerName": { "type": "keyword" }, + "showName": { "type": "keyword" }, + "state": { "type": "keyword" }, + "exitStatus": { "type": "integer" }, + "runtime": { "type": "long" }, + "maxRss": { "type": "long" }, + "host": { "type": "keyword" }, + "coresUsed": { "type": "integer" } + } + } +} +``` + +## gRPC monitoring service + +### Service definition + +```protobuf +service MonitoringInterface { + // Get current monitoring configuration + rpc GetMonitoringConfig(MonitoringConfigRequest) + returns (MonitoringConfigResponse); + + // Get monitoring statistics + rpc GetMonitoringStats(MonitoringStatsRequest) + returns (MonitoringStatsResponse); + + // Query historical events + rpc QueryEvents(QueryEventsRequest) + returns (QueryEventsResponse); + + // Stream real-time events + rpc StreamEvents(StreamEventsRequest) + returns (stream MonitoringEvent); +} +``` + +### Message definitions + +```protobuf +message MonitoringEvent { + MonitoringEventType event_type = 1; + string timestamp = 2; + string source = 3; + string job_id = 4; + string job_name = 5; + string show_name = 6; + map metadata = 7; +} + +enum MonitoringEventType { + JOB_CREATED = 0; + JOB_STARTED = 1; + JOB_FINISHED = 2; + JOB_KILLED = 3; + JOB_PAUSED = 4; + JOB_RESUMED = 5; + FRAME_STARTED = 10; + FRAME_COMPLETED = 11; + FRAME_FAILED = 12; + FRAME_RETRIED = 13; + HOST_UP = 20; + HOST_DOWN = 21; + HOST_LOCKED = 22; + HOST_UNLOCKED = 23; +} +``` + +## Docker compose reference + +### Full monitoring stack + +The `docker-compose.monitoring-full.yml` includes: + +| 
Service | Image | Ports | +|---------|-------|-------| +| zookeeper | confluentinc/cp-zookeeper:7.4.0 | 2181 | +| kafka | confluentinc/cp-kafka:7.4.0 | 9092, 29092 | +| kafka-ui | provectuslabs/kafka-ui:latest | 8090 | +| kafka-es-indexer | opencue/kafka-es-indexer | - | +| elasticsearch | elasticsearch:8.8.0 | 9200, 9300 | +| kibana | kibana:8.8.0 | 5601 | +| prometheus | prom/prometheus:v2.45.0 | 9090 | +| grafana | grafana/grafana:10.0.0 | 3000 | + +### Environment variables + +| Variable | Service | Description | +|----------|---------|-------------| +| `KAFKA_BROKER_ID` | kafka | Unique broker identifier | +| `KAFKA_ZOOKEEPER_CONNECT` | kafka | Zookeeper connection string | +| `KAFKA_AUTO_CREATE_TOPICS_ENABLE` | kafka | Enable automatic topic creation | +| `KAFKA_BOOTSTRAP_SERVERS` | kafka-es-indexer | Kafka broker addresses | +| `ELASTICSEARCH_URL` | kafka-es-indexer | Elasticsearch URL | +| `ELASTICSEARCH_INDEX_PREFIX` | kafka-es-indexer | Elasticsearch index prefix | +| `ES_JAVA_OPTS` | elasticsearch | JVM options | +| `GF_SECURITY_ADMIN_USER` | grafana | Admin username | +| `GF_SECURITY_ADMIN_PASSWORD` | grafana | Admin password | + +## What's next? 
+ +- [Render farm monitoring concepts](/docs/concepts/render-farm-monitoring/) - Understand the monitoring architecture +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Configure dashboards and alerts +- [Monitoring developer guide](/docs/developer-guide/monitoring-development/) - Extend the monitoring system diff --git a/docs/_docs/reference/rest-api-reference.md b/docs/_docs/reference/rest-api-reference.md index 6299f6b45..cfcb1e252 100644 --- a/docs/_docs/reference/rest-api-reference.md +++ b/docs/_docs/reference/rest-api-reference.md @@ -2,7 +2,7 @@ layout: default title: OpenCue REST API Reference parent: Reference -nav_order: 65 +nav_order: 69 --- # OpenCue REST API Reference diff --git a/docs/_docs/reference/rust-rqd.md b/docs/_docs/reference/rust-rqd.md index 466a023d0..84dae355e 100644 --- a/docs/_docs/reference/rust-rqd.md +++ b/docs/_docs/reference/rust-rqd.md @@ -1,6 +1,6 @@ --- title: "Rust RQD" -nav_order: 57 +nav_order: 61 parent: Reference layout: default linkTitle: "Rust RQD" diff --git a/docs/_docs/reference/tools/cueadmin.md b/docs/_docs/reference/tools/cueadmin.md index b02110b9b..be1d4f923 100644 --- a/docs/_docs/reference/tools/cueadmin.md +++ b/docs/_docs/reference/tools/cueadmin.md @@ -1,6 +1,6 @@ --- title: "CueAdmin - CLI Administration Tool" -nav_order: 60 +nav_order: 64 parent: "Command Line Tools" grand_parent: "Reference" layout: default diff --git a/docs/_docs/reference/tools/cuecmd.md b/docs/_docs/reference/tools/cuecmd.md index 43a28aa9f..cda2ebd4c 100644 --- a/docs/_docs/reference/tools/cuecmd.md +++ b/docs/_docs/reference/tools/cuecmd.md @@ -1,6 +1,6 @@ --- title: "Cuecmd - Command Execution Tool" -nav_order: 62 +nav_order: 66 parent: "Command Line Tools" grand_parent: "Reference" layout: default diff --git a/docs/_docs/reference/tools/cueman.md b/docs/_docs/reference/tools/cueman.md index df0fb35cf..235f67c49 100644 --- a/docs/_docs/reference/tools/cueman.md +++ b/docs/_docs/reference/tools/cueman.md 
@@ -1,6 +1,6 @@ --- title: "Cueman - CLI Job Management Tool" -nav_order: 61 +nav_order: 65 parent: "Command Line Tools" grand_parent: "Reference" layout: default diff --git a/docs/_docs/reference/tools/cuenimby.md b/docs/_docs/reference/tools/cuenimby.md index 3c650cafb..a8545b56e 100644 --- a/docs/_docs/reference/tools/cuenimby.md +++ b/docs/_docs/reference/tools/cuenimby.md @@ -1,6 +1,6 @@ --- title: "CueNIMBY - NIMBY CLI and System Tray Application" -nav_order: 63 +nav_order: 67 parent: "Command Line Tools" grand_parent: "Reference" layout: default diff --git a/docs/_docs/reference/tools/index.md b/docs/_docs/reference/tools/index.md index c8dc604b7..1fb5f6915 100644 --- a/docs/_docs/reference/tools/index.md +++ b/docs/_docs/reference/tools/index.md @@ -1,6 +1,6 @@ --- title: "Command Line Tools" -nav_order: 59 +nav_order: 63 parent: "Reference" has_children: true layout: default diff --git a/docs/_docs/tutorials/cueadmin-tutorial.md b/docs/_docs/tutorials/cueadmin-tutorial.md index fc88e2dfb..e9d5b3c96 100644 --- a/docs/_docs/tutorials/cueadmin-tutorial.md +++ b/docs/_docs/tutorials/cueadmin-tutorial.md @@ -1,6 +1,6 @@ --- title: "CueAdmin Tutorial" -nav_order: 72 +nav_order: 77 parent: "Tutorials" layout: default date: 2025-08-11 diff --git a/docs/_docs/tutorials/cuecmd-tutorial.md b/docs/_docs/tutorials/cuecmd-tutorial.md index 95324bef7..bfeeaef30 100644 --- a/docs/_docs/tutorials/cuecmd-tutorial.md +++ b/docs/_docs/tutorials/cuecmd-tutorial.md @@ -1,6 +1,6 @@ --- title: "Cuecmd Tutorial" -nav_order: 74 +nav_order: 79 parent: Tutorials layout: default date: 2025-10-02 diff --git a/docs/_docs/tutorials/cueman-tutorial.md b/docs/_docs/tutorials/cueman-tutorial.md index d124569e3..39f4dff07 100644 --- a/docs/_docs/tutorials/cueman-tutorial.md +++ b/docs/_docs/tutorials/cueman-tutorial.md @@ -1,6 +1,6 @@ --- title: "Cueman Tutorial" -nav_order: 73 +nav_order: 78 parent: "Tutorials" layout: default date: 2025-08-06 diff --git 
a/docs/_docs/tutorials/cuenimby-tutorial.md b/docs/_docs/tutorials/cuenimby-tutorial.md index 0a3e154e1..3040b0293 100644 --- a/docs/_docs/tutorials/cuenimby-tutorial.md +++ b/docs/_docs/tutorials/cuenimby-tutorial.md @@ -1,6 +1,6 @@ --- title: "CueNIMBY tutorial" -nav_order: 75 +nav_order: 80 parent: Tutorials layout: default linkTitle: "CueNIMBY tutorial" diff --git a/docs/_docs/tutorials/cueweb-tutorial.md b/docs/_docs/tutorials/cueweb-tutorial.md index c2e4af214..8eb6aea9b 100644 --- a/docs/_docs/tutorials/cueweb-tutorial.md +++ b/docs/_docs/tutorials/cueweb-tutorial.md @@ -1,6 +1,6 @@ --- title: "CueWeb Tutorial" -nav_order: 78 +nav_order: 83 parent: Tutorials layout: default linkTitle: "Getting Started with CueWeb" diff --git a/docs/_docs/tutorials/dcc-integration.md b/docs/_docs/tutorials/dcc-integration.md index defebef97..30e651c10 100644 --- a/docs/_docs/tutorials/dcc-integration.md +++ b/docs/_docs/tutorials/dcc-integration.md @@ -2,7 +2,7 @@ title: "DCC Integration Tutorial" layout: default parent: Tutorials -nav_order: 79 +nav_order: 85 linkTitle: "DCC Integration Tutorial" date: 2025-01-29 description: > diff --git a/docs/_docs/tutorials/filter-tutorial.md b/docs/_docs/tutorials/filter-tutorial.md index 91ae68519..0f878f734 100644 --- a/docs/_docs/tutorials/filter-tutorial.md +++ b/docs/_docs/tutorials/filter-tutorial.md @@ -1,6 +1,6 @@ --- title: "Filter Tutorial" -nav_order: 76 +nav_order: 81 parent: Tutorials layout: default date: 2025-10-15 diff --git a/docs/_docs/tutorials/getting-started-tutorial.md b/docs/_docs/tutorials/getting-started-tutorial.md index 69fa1a44a..a7fd74515 100644 --- a/docs/_docs/tutorials/getting-started-tutorial.md +++ b/docs/_docs/tutorials/getting-started-tutorial.md @@ -2,7 +2,7 @@ title: "Getting Started with OpenCue" layout: default parent: Tutorials -nav_order: 67 +nav_order: 72 linkTitle: "Getting Started with OpenCue" date: 2025-01-29 description: > diff --git a/docs/_docs/tutorials/index.md 
b/docs/_docs/tutorials/index.md index 31b1faecd..b907d3a73 100644 --- a/docs/_docs/tutorials/index.md +++ b/docs/_docs/tutorials/index.md @@ -1,7 +1,7 @@ --- layout: default title: Tutorials -nav_order: 66 +nav_order: 71 has_children: true permalink: /docs/tutorials --- diff --git a/docs/_docs/tutorials/managing-jobs-frames.md b/docs/_docs/tutorials/managing-jobs-frames.md index 4dffea090..caae5805a 100644 --- a/docs/_docs/tutorials/managing-jobs-frames.md +++ b/docs/_docs/tutorials/managing-jobs-frames.md @@ -2,7 +2,7 @@ title: "Managing Jobs and Frames" layout: default parent: Tutorials -nav_order: 70 +nav_order: 75 linkTitle: "Managing Jobs and Frames" date: 2025-01-29 description: > diff --git a/docs/_docs/tutorials/monitoring-tutorial.md b/docs/_docs/tutorials/monitoring-tutorial.md new file mode 100644 index 000000000..c13e7b3c3 --- /dev/null +++ b/docs/_docs/tutorials/monitoring-tutorial.md @@ -0,0 +1,432 @@ +--- +title: "Monitoring tutorial" +nav_order: 84 +parent: Tutorials +layout: default +linkTitle: "Monitoring tutorial" +date: 2024-11-24 +description: > + Build custom monitoring dashboards and alerts for your OpenCue render farm +--- + +# Monitoring tutorial + +### Build custom monitoring dashboards and alerts for your OpenCue render farm + +--- + +This tutorial walks you through setting up monitoring for your OpenCue render farm, creating custom Grafana dashboards, and configuring alerts. 
+ +## Prerequisites + +- OpenCue sandbox environment running (see [Using the OpenCue Sandbox for Testing](/docs/developer-guide/sandbox-testing/)) +- Monitoring stack deployed (see [Quick start for monitoring](/docs/quick-starts/quick-start-monitoring/)) +- Basic familiarity with Prometheus and Grafana + +## Monitoring stack components + +| Component | Purpose | URL | Port | +|-----------|---------|-----|------| +| **Grafana** | Dashboards and visualization | [http://localhost:3000](http://localhost:3000) | 3000 | +| **Prometheus** | Metrics collection | [http://localhost:9090](http://localhost:9090) | 9090 | +| **Kafka UI** | Event stream browser | [http://localhost:8090](http://localhost:8090) | 8090 | +| **Kibana** | Elasticsearch visualization | [http://localhost:5601](http://localhost:5601) | 5601 | +| **Elasticsearch** | Historical data storage | [http://localhost:9200](http://localhost:9200) | 9200 | +| **Kafka** | Event streaming | localhost:9092 | 9092 | +| **kafka-es-indexer** | Kafka to Elasticsearch indexer | - | - | +| **Zookeeper** | Kafka coordination | localhost:2181 | 2181 | + +### Grafana: OpenCue Monitoring Grafana Dashboard + +![OpenCue Monitoring Grafana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png) + +### Prometheus Metrics Interface + +![Prometheus Metrics Interface](/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png) + +### UI for Apache Kafka + +![UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +### Elasticsearch Kibana - Dev Tools + +![Kibana](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png) + +### Elasticsearch + +![Elasticsearch](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png) + +## Tutorial goals + +By the end of this tutorial, you will: + +1. Create a custom Grafana dashboard for job monitoring +2. Build a Prometheus alert for failed frames +3. 
Set up a Kafka consumer to process events +4. Query historical data in Elasticsearch + +## Part 1: Creating a custom Grafana dashboard + +### Step 1: Access Grafana + +1. Open Grafana at [http://localhost:3000](http://localhost:3000) +2. Log in with username `admin` and password `admin` +3. Click **Dashboards** in the left menu + +### Step 2: Create a new dashboard + +1. Click **New** > **New Dashboard** +2. Click **Add visualization** +3. Select **Prometheus** as the data source + +### Step 3: Add a frame completion panel + +Create a time series panel showing frame completions: + +1. In the Query tab, enter: + ```promql + sum(increase(cue_frames_completed_total[5m])) by (state) + ``` + +2. Configure the panel: + - Title: "Frames Completed by State (5m)" + - Legend: `{{state}}` + - Unit: `short` + +3. Click **Apply** + +### Step 4: Add a job queue panel + +Add a gauge showing pending work: + +1. Click **Add** > **Visualization** +2. Select **Prometheus** as the data source +3. Enter the query: + ```promql + cue_dispatch_waiting_total + ``` + +4. Change visualization to **Gauge** +5. Configure: + - Title: "Dispatch Queue Size" + - Thresholds: 0 (green), 100 (yellow), 500 (red) + +6. Click **Apply** + +### Step 5: Add a host report panel + +Create a panel showing host activity: + +1. Click **Add** > **Visualization** +2. Enter the query: + ```promql + sum(increase(cue_host_reports_received_total[5m])) by (facility) + ``` + +3. Configure: + - Title: "Host Reports by Facility" + - Visualization: Time series + +4. Click **Apply** + +### Step 6: Save the dashboard + +1. Click the save icon (or Ctrl+S) +2. Name: "My OpenCue Dashboard" +3. Click **Save** + +## Part 2: Creating Prometheus alerts + +### Step 1: Create an alert rule + +1. In Grafana, go to **Alerting** > **Alert rules** +2. Click **New alert rule** + +### Step 2: Configure the alert condition + +1. Name: "High Frame Failure Rate" +2. 
In Query section: + ```promql + rate(cue_frames_completed_total{state="DEAD"}[5m]) > 0.1 + ``` + +3. Set condition: + - Threshold: IS ABOVE 0.1 + - For: 5m + +### Step 3: Add alert details + +1. Add summary: + ``` + Frame failure rate is {{ $value }} per second + ``` + +2. Add description: + ``` + The render farm is experiencing elevated frame failures. + Check host health and job configurations. + ``` + +3. Click **Save and exit** + +### Step 4: Create a notification contact point + +1. Go to **Alerting** > **Contact points** +2. Click **Add contact point** +3. Configure for your notification method (email, Slack, etc.) + +## Part 3: Building a Kafka event consumer + +### Step 1: Create a Python consumer + +Create a file `monitor_events.py`: + +```python +#!/usr/bin/env python3 +""" +Simple Kafka consumer for OpenCue monitoring events. +""" + +from kafka import KafkaConsumer +import json +from datetime import datetime + +# Connect to Kafka +# Note: The cuebot producer uses lz4 compression, so the lz4 library must be installed +consumer = KafkaConsumer( + 'opencue.frame.events', + 'opencue.job.events', + bootstrap_servers=['localhost:9092'], + value_deserializer=lambda m: json.loads(m.decode('utf-8')), + auto_offset_reset='earliest', + group_id='tutorial-consumer' +) + +print("Listening for OpenCue events...") +print("-" * 60) + +for message in consumer: + event = message.value + + # Events have a 'header' field containing event metadata + header = event.get('header', {}) + event_type = header.get('event_type', 'UNKNOWN') + timestamp = header.get('timestamp', '') + + # Convert timestamp from milliseconds to readable format + if timestamp: + try: + dt = datetime.fromtimestamp(int(timestamp) / 1000) + timestamp = dt.strftime('%Y-%m-%d %H:%M:%S') + except (ValueError, OSError): + pass + + # Format output based on event type + if event_type.startswith('FRAME_'): + job_name = event.get('job_name', 'N/A') + frame_name = event.get('frame_name', 'N/A') + state = 
event.get('state', 'N/A') + print(f"[{timestamp}] {event_type}") + print(f" Job: {job_name}") + print(f" Frame: {frame_name}") + print(f" State: {state}") + if event_type == 'FRAME_COMPLETED': + runtime = event.get('run_time', 0) + print(f" Runtime: {runtime}s") + elif event_type == 'FRAME_FAILED': + exit_status = event.get('exit_status', -1) + print(f" Exit Status: {exit_status}") + print() + + elif event_type.startswith('JOB_'): + job_name = event.get('job_name', 'N/A') + show_name = event.get('show', 'N/A') + print(f"[{timestamp}] {event_type}") + print(f" Job: {job_name}") + print(f" Show: {show_name}") + print() +``` + +### Step 2: Install dependencies + +```bash +pip install kafka-python lz4 +``` + +### Step 3: Run the consumer + +```bash +python monitor_events.py +``` + +### Step 4: Generate events + +In another terminal, submit a test job. You can use either cuecmd or PyOutline: + +**Option A: Using cuecmd** + +```bash +# Create a command file +echo "echo Hello from monitoring test" > /tmp/test_commands.txt + +# Submit the job +cuecmd /tmp/test_commands.txt --show testing --job-name monitoring_test +``` + +**Option B: Using PyOutline** + +```bash +python -c " +import outline +from outline.modules.shell import Shell + +ol = outline.Outline('monitoring_test_$RANDOM', shot='testshot', show='testing') +layer = Shell('test_layer', command=['/bin/echo', 'Hello from monitoring test'], range='1-1') +ol.add_layer(layer) +outline.cuerun.launch(ol, use_pycuerun=False) +" +``` + +Watch the consumer output as events flow through Kafka. + +## Part 4: Querying Elasticsearch + +![Kibana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png) + +### Step 1: Access Kibana + +1. Open Kibana at [http://localhost:5601](http://localhost:5601) +2. Navigate to **Management** > **Stack Management** > **Index Patterns** + +### Step 2: Create an index pattern + +1. Click **Create index pattern** +2. Enter pattern: `opencue-*` +3. 
Select `header.timestamp` as the time field (format: epoch_millis) +4. Click **Create index pattern** + +### Step 3: Explore events + +1. Navigate to **Discover** +2. Select the `opencue-*` index pattern +3. Set the time range to include your test events + +### Step 4: Run KQL queries + +![Kibana Dev Tools](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png) + +Try these example queries: + +``` +# Find all failed frames +header.event_type: "FRAME_FAILED" + +# Find events for a specific job +job_name: "test*" + +# Find frames that took longer than 1 hour +header.event_type: "FRAME_COMPLETED" AND run_time > 3600 + +# Find host down events +header.event_type: "HOST_DOWN" +``` + +### Step 5: Create a visualization + +1. Navigate to **Visualize Library** +2. Click **Create visualization** +3. Select **Lens** +4. Drag `header.event_type` to the visualization +5. Create a pie chart of event types + +## Part 5: Building a failure tracking dashboard + +Let's create a comprehensive failure tracking dashboard. + +### Step 1: Create failure rate panel + +In Grafana, create a new panel: + +```promql +sum(rate(cue_frames_completed_total{state="DEAD"}[1h])) by (show) +/ sum(rate(cue_frames_completed_total[1h])) by (show) +* 100 +``` + +Configure: +- Title: "Frame Failure Rate by Show (%)" +- Unit: `percent (0-100)` + +### Step 2: Create retry tracking panel + +```promql +sum(increase(cue_frames_completed_total{state="DEAD"}[24h])) by (show) +``` + +Configure: +- Title: "Failed Frames (24h)" +- Visualization: Bar gauge + +### Step 3: Create host health panel + +```promql +sum(up{job="cuebot"}) +``` + +Configure: +- Title: "Cuebot Health" +- Visualization: Stat +- Color mode: Background +- Thresholds: 0 (red), 1 (green) + +### Step 4: Organize the dashboard + +1. Arrange panels in a logical layout +2. Add row headers: "Farm Health", "Job Metrics", "Failures" +3. Set dashboard refresh rate to 30s +4. 
Save the dashboard + +## Challenge exercises + +### Exercise 1: Memory usage alert + +Create an alert that fires when the 95th percentile of frame memory exceeds 16GB: + +```promql +histogram_quantile(0.95, sum(rate(cue_frame_memory_bytes_bucket[5m])) by (le)) +> 17179869184 +``` + +### Exercise 2: Capacity planning query + +Build a Grafana panel showing peak usage times: + +```promql +max_over_time(cue_dispatch_threads_total[1d]) +``` + +### Exercise 3: Custom Kafka processor + +Extend the Python consumer to: +- Track frame failure rates per show +- Send Slack notifications for high failure rates +- Write metrics to a time-series database + +## Cleanup + +To stop the monitoring stack: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml down +``` + +To preserve your Grafana dashboards, export them first: +1. Open the dashboard +2. Click the share icon +3. Select **Export** > **Save to file** + +## What's next? + +- [Monitoring user guide](/docs/user-guides/render-farm-monitoring-guide/) - Advanced configuration +- [Monitoring developer guide](/docs/developer-guide/monitoring-development/) - Extend the system +- [Monitoring reference](/docs/reference/monitoring-reference/) - Complete API reference diff --git a/docs/_docs/tutorials/multi-layer-jobs.md b/docs/_docs/tutorials/multi-layer-jobs.md index c633d651a..cf61290bb 100644 --- a/docs/_docs/tutorials/multi-layer-jobs.md +++ b/docs/_docs/tutorials/multi-layer-jobs.md @@ -2,7 +2,7 @@ title: "Creating Multi-Layer Jobs" layout: default parent: Tutorials -nav_order: 71 +nav_order: 76 linkTitle: "Creating Multi-Layer Jobs" date: 2025-01-29 description: > diff --git a/docs/_docs/tutorials/rest-api-tutorial.md b/docs/_docs/tutorials/rest-api-tutorial.md index 3091c1728..616f0574a 100644 --- a/docs/_docs/tutorials/rest-api-tutorial.md +++ b/docs/_docs/tutorials/rest-api-tutorial.md @@ -1,6 +1,6 @@ --- title: "REST API Tutorial" -nav_order: 77 +nav_order: 82 parent: Tutorials layout: default linkTitle: "Getting 
Started with OpenCue REST API" diff --git a/docs/_docs/tutorials/submitting-first-job.md b/docs/_docs/tutorials/submitting-first-job.md index a063898d1..faa533ca8 100644 --- a/docs/_docs/tutorials/submitting-first-job.md +++ b/docs/_docs/tutorials/submitting-first-job.md @@ -2,7 +2,7 @@ title: "Submitting Your First Job" layout: default parent: Tutorials -nav_order: 68 +nav_order: 73 linkTitle: "Submitting Your First Job" date: 2025-01-29 description: > diff --git a/docs/_docs/tutorials/using-cuegui.md b/docs/_docs/tutorials/using-cuegui.md index d6899c251..55017ae2d 100644 --- a/docs/_docs/tutorials/using-cuegui.md +++ b/docs/_docs/tutorials/using-cuegui.md @@ -2,7 +2,7 @@ title: "Using CueGUI for Job Monitoring" layout: default parent: Tutorials -nav_order: 69 +nav_order: 74 linkTitle: "Using CueGUI for Job Monitoring" date: 2025-01-29 description: > diff --git a/docs/_docs/user-guides/adding-removing-limits.md b/docs/_docs/user-guides/adding-removing-limits.md index 50e732cfb..4545de09e 100644 --- a/docs/_docs/user-guides/adding-removing-limits.md +++ b/docs/_docs/user-guides/adding-removing-limits.md @@ -1,6 +1,6 @@ --- title: "Adding or removing limits" -nav_order: 30 +nav_order: 33 parent: User Guides layout: default linkTitle: "Adding or removing limits" diff --git a/docs/_docs/user-guides/cuecommander-administration-guide.md b/docs/_docs/user-guides/cuecommander-administration-guide.md index 3a439e622..3e6c3092c 100644 --- a/docs/_docs/user-guides/cuecommander-administration-guide.md +++ b/docs/_docs/user-guides/cuecommander-administration-guide.md @@ -2,7 +2,7 @@ title: "CueGUI: CueCommander Administration System" layout: default parent: User Guides -nav_order: 34 +nav_order: 37 linkTitle: "CueCommander Administration Guide" date: 2025-01-13 description: > diff --git a/docs/_docs/user-guides/cuenimby-user-guide.md b/docs/_docs/user-guides/cuenimby-user-guide.md index 60d81767b..a83a0278c 100644 --- a/docs/_docs/user-guides/cuenimby-user-guide.md +++ 
b/docs/_docs/user-guides/cuenimby-user-guide.md @@ -1,6 +1,6 @@ --- title: "CueNIMBY User Guide" -nav_order: 37 +nav_order: 40 parent: User Guides layout: default linkTitle: "CueNIMBY user guide" diff --git a/docs/_docs/user-guides/cuetopia-monitoring-guide.md b/docs/_docs/user-guides/cuetopia-monitoring-guide.md index 1ee835af6..95c75ae16 100644 --- a/docs/_docs/user-guides/cuetopia-monitoring-guide.md +++ b/docs/_docs/user-guides/cuetopia-monitoring-guide.md @@ -2,7 +2,7 @@ title: "CueGUI: Cuetopia Monitoring System" layout: default parent: User Guides -nav_order: 33 +nav_order: 36 linkTitle: "Cuetopia Monitoring Guide" date: 2025-01-07 description: > diff --git a/docs/_docs/user-guides/cueweb-user-guide.md b/docs/_docs/user-guides/cueweb-user-guide.md index 363238804..5e5017621 100644 --- a/docs/_docs/user-guides/cueweb-user-guide.md +++ b/docs/_docs/user-guides/cueweb-user-guide.md @@ -2,7 +2,7 @@ layout: default title: CueWeb User Guide parent: User Guides -nav_order: 39 +nav_order: 42 --- # CueWeb User Guide diff --git a/docs/_docs/user-guides/index.md b/docs/_docs/user-guides/index.md index a8e58e0ef..df850f30b 100644 --- a/docs/_docs/user-guides/index.md +++ b/docs/_docs/user-guides/index.md @@ -1,7 +1,7 @@ --- layout: default title: User Guides -nav_order: 29 +nav_order: 32 has_children: true permalink: /docs/user-guides --- diff --git a/docs/_docs/user-guides/monitoring-jobs.md b/docs/_docs/user-guides/monitoring-jobs.md index 295ea21c6..93437aee5 100644 --- a/docs/_docs/user-guides/monitoring-jobs.md +++ b/docs/_docs/user-guides/monitoring-jobs.md @@ -1,6 +1,6 @@ --- title: "Monitoring jobs" -nav_order: 32 +nav_order: 35 parent: User Guides layout: default linkTitle: "Monitoring your jobs" diff --git a/docs/_docs/user-guides/render-farm-monitoring-guide.md b/docs/_docs/user-guides/render-farm-monitoring-guide.md new file mode 100644 index 000000000..543c6cb6e --- /dev/null +++ b/docs/_docs/user-guides/render-farm-monitoring-guide.md @@ -0,0 +1,390 @@ 
+--- +title: "Render farm monitoring guide" +nav_order: 43 +parent: User Guides +layout: default +linkTitle: "Render farm monitoring" +date: 2024-11-24 +description: > + Configure and use the OpenCue render farm monitoring system +--- + +# Render farm monitoring guide + +### Configure and use the OpenCue render farm monitoring system + +--- + +This guide explains how to use the OpenCue monitoring system to track render farm operations, create custom dashboards, and set up alerts. + +## Overview + +The OpenCue monitoring system provides three ways to observe your render farm: + +1. **Real-time metrics** via Prometheus and Grafana +2. **Event streaming** via Kafka +3. **Historical analysis** via Elasticsearch and Kibana + +## Monitoring stack components + +| Component | Purpose | URL | +|-----------|---------|-----| +| **Grafana** | Dashboards and visualization | [http://localhost:3000](http://localhost:3000) | +| **Prometheus** | Metrics collection | [http://localhost:9090](http://localhost:9090) | +| **Kafka UI** | Event stream browser | [http://localhost:8090](http://localhost:8090) | +| **Elasticsearch** | Historical data storage | [http://localhost:9200](http://localhost:9200) | +| **Kibana** | Elasticsearch visualization | [http://localhost:5601](http://localhost:5601) | +| **Kafka** | Event streaming (internal) | localhost:9092 | +| **kafka-es-indexer** | Kafka to Elasticsearch indexer (Rust) | - | +| **Zookeeper** | Kafka coordination (internal) | localhost:2181 | + + +### Grafana: OpenCue Monitoring Grafana Dashboard + +![OpenCue Monitoring Grafana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png) + +### Prometheus Metrics Interface + +![Prometheus Metrics Interface](/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png) + +### UI for Apache Kafka + +![UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +### Elasticsearch Kibana - Dev Tools + 
+![Kibana](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png) + +### Elasticsearch + +![Elasticsearch](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png) + +## Configuring Cuebot for monitoring + +### Enabling Kafka event publishing + +Add these properties to your Cuebot configuration: + +```properties +# Enable Kafka event publishing +monitoring.kafka.enabled=true +monitoring.kafka.bootstrap.servers=your-kafka-host:9092 + +# Optional: Configure event queue +monitoring.kafka.queue.capacity=1000 +monitoring.kafka.batch.size=100 +``` + +Or pass them as command-line arguments: + +```bash +java -jar cuebot.jar \ + --monitoring.kafka.enabled=true \ + --monitoring.kafka.bootstrap.servers=kafka:9092 +``` + +### Enabling Elasticsearch storage + +Elasticsearch indexing is handled by the standalone `kafka-es-indexer` service (located in `rust/crates/kafka-es-indexer/`), not Cuebot. The indexer consumes events from Kafka and bulk indexes them into Elasticsearch. + +Using environment variables: + +```bash +export KAFKA_BOOTSTRAP_SERVERS=kafka:9092 +export ELASTICSEARCH_URL=http://elasticsearch:9200 +kafka-es-indexer +``` + +Or using CLI arguments: + +```bash +kafka-es-indexer \ + --kafka-servers kafka:9092 \ + --elasticsearch-url http://elasticsearch:9200 \ + --index-prefix opencue +``` + +### Enabling Prometheus metrics + +```properties +# Enable Prometheus metrics endpoint +metrics.prometheus.collector=true +``` + +The metrics endpoint is available at `http://cuebot-host:8080/metrics`. + +## Using Grafana dashboards + +### Accessing the dashboard + +1. Open Grafana at your configured URL (default: `http://localhost:3000`) +2. 
Navigate to **Dashboards** > **OpenCue Monitoring Dashboard** + +### Dashboard panels + +The pre-configured dashboard includes: + +#### Frame metrics + +| Panel | Description | Metric | +|-------|-------------|--------| +| Frames Completed (5m) | Frames completed in 5 minutes by state | `increase(cue_frames_completed_total[5m])` | +| Frame Runtime Distribution | P50 and P95 frame execution times | `histogram_quantile(0.95, cue_frame_runtime_seconds_bucket)` | +| Frame Memory Usage Distribution | Memory consumption distribution | `histogram_quantile(0.95, cue_frame_memory_bytes_bucket)` | + +#### Job metrics + +| Panel | Description | Metric | +|-------|-------------|--------| +| Jobs Completed by Show (5m) | Jobs completed per show in 5 minutes | `increase(cue_jobs_completed_total[5m])` | + +#### System health + +| Panel | Description | Metric | +|-------|-------------|--------| +| Host Reports Received (5m) | Reports received from render hosts | `increase(cue_host_reports_received_total[5m])` | + +### Creating custom panels + +To create a custom panel: + +1. Click **Add** > **Visualization** +2. Select **Prometheus** as the data source +3. Enter your PromQL query +4. Configure visualization options + +Example queries: + +```promql +# Average frame runtime by show (last hour) +avg(rate(cue_frame_runtime_seconds_sum[1h])) by (show) + / avg(rate(cue_frame_runtime_seconds_count[1h])) by (show) + +# Failed frame rate +rate(cue_frames_completed_total{state="DEAD"}[5m]) + +# Queue saturation +cue_dispatch_waiting_total / cue_dispatch_remaining_capacity_total +``` + +### Setting up alerts + +To create an alert in Grafana: + +1. Edit a panel or create a new one +2. Click the **Alert** tab +3. Configure alert conditions + +Example alert: High frame failure rate + +```yaml +Alert name: High Frame Failure Rate +Condition: rate(cue_frames_completed_total{state="DEAD"}[5m]) > 0.1 +For: 5m +Message: "Frame failure rate is elevated. Check job configurations and host health." 
+``` + +Example alert: Cuebot down + +```yaml +Alert name: Cuebot Down +Condition: up{job="cuebot"} == 0 +For: 1m +Message: "Cuebot is not responding to Prometheus scrapes." +``` + +## Using Kafka for event streaming + +![Kafka UI for Apache Kafka](/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png) + +### Viewing events + +Use the Kafka console consumer to view events: + +```bash +# View job events +kafka-console-consumer --bootstrap-server kafka:9092 \ + --topic opencue.job.events --from-beginning + +# View frame events (latest only) +kafka-console-consumer --bootstrap-server kafka:9092 \ + --topic opencue.frame.events +``` + +### Event format + +Events are published as JSON messages with a header containing metadata and fields at the top level: + +```json +{ + "header": { + "event_id": "f533d84a-1586-4980-8c5e-3443376425c9", + "event_type": "FRAME_COMPLETED", + "timestamp": "1764097486229", + "source_cuebot": "cuebot-01", + "correlation_id": "fa7bbb9a-cae1-4f6b-a50a-88a9ac349d24" + }, + "frame_id": "fa18c460-0e92-49e1-8d6a-e26473ac2708", + "frame_name": "0001-render", + "frame_number": 1, + "layer_id": "53ec9034-b16b-4cc2-9eec-05f68b1848bf", + "layer_name": "render", + "job_id": "fa7bbb9a-cae1-4f6b-a50a-88a9ac349d24", + "job_name": "show-shot-user_render", + "show": "show", + "state": "SUCCEEDED", + "previous_state": "RUNNING", + "exit_status": 0, + "run_time": 3600, + "max_rss": "8589934592", + "host_name": "render-node-01", + "num_cores": 8, + "num_gpus": 0 +} +``` + +### Integrating with external systems + +Kafka events can be consumed by external systems for: + +- **Custom alerting**: Build alerts based on specific job or frame conditions +- **Cost tracking**: Calculate render costs based on resource usage +- **Capacity planning**: Analyze usage patterns for infrastructure planning +- **Reporting**: Generate custom reports on render farm utilization + +Example Python consumer: + +```python +from kafka import KafkaConsumer +import 
json + +# Note: lz4 library required for decompression (pip install kafka-python lz4) +consumer = KafkaConsumer( + 'opencue.frame.events', + bootstrap_servers=['kafka:9092'], + value_deserializer=lambda m: json.loads(m.decode('utf-8')) +) + +for message in consumer: + event = message.value + header = event.get('header', {}) + if header.get('event_type') == 'FRAME_FAILED': + print(f"Frame failed: {event.get('frame_name')}") + # Send alert, update database, etc. +``` + +## Using Elasticsearch for historical analysis + +![Kibana Dashboard](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png) + +### Querying events in Kibana + +1. Open Kibana at your configured URL (default: `http://localhost:5601`) +2. Navigate to **Discover** +3. Select the `opencue-*` index pattern +4. Use KQL to search events + +Example queries: + +``` +# Find all failed frames for a job +header.event_type: "FRAME_FAILED" AND job_name: "myshow*" + +# Find frames that took longer than 1 hour +header.event_type: "FRAME_COMPLETED" AND run_time > 3600 + +# Find host down events +header.event_type: "HOST_DOWN" AND host_name: "render-*" + +# Find all events for a specific show +show: "testing" +``` + +![Kibana Dev Tools](/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png) + +### Creating visualizations + +In Kibana, you can create: + +- **Time series**: Frame completion over time +- **Pie charts**: Frame states distribution +- **Data tables**: Top failing jobs or layers +- **Metrics**: Average frame runtime + +### Retention and cleanup + +Configure Elasticsearch index lifecycle management (ILM) to manage data retention: + +```json +{ + "policy": { + "phases": { + "hot": { + "actions": { + "rollover": { + "max_age": "7d", + "max_size": "50gb" + } + } + }, + "delete": { + "min_age": "30d", + "actions": { + "delete": {} + } + } + } + } +} +``` + +## Prometheus metrics reference + +### Job metrics + +| Metric | Type | Labels | 
Description | +|--------|------|--------|-------------| +| `cue_jobs_completed_total` | Counter | show | Total jobs completed | +| `cue_frames_completed_total` | Counter | state, show | Total frames completed | +| `cue_frame_runtime_seconds` | Histogram | show | Frame execution time | +| `cue_frame_memory_bytes` | Histogram | show | Frame memory usage | + +### Queue metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_dispatch_waiting_total` | Gauge | - | Dispatch queue size | +| `cue_dispatch_threads_total` | Gauge | - | Active dispatch threads | +| `cue_booking_waiting_total` | Gauge | - | Booking queue size | +| `cue_report_executed_total` | Gauge | - | Host reports processed | + +### Host metrics + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `cue_host_reports_received_total` | Counter | facility | Host reports received | + +## Best practices + +### Dashboard organization + +- Create separate dashboards for operations, capacity planning, and debugging +- Use template variables to filter by show, facility, or time range +- Set appropriate refresh intervals (5s for real-time, 1m for overview) + +### Alert tuning + +- Start with conservative thresholds and adjust based on baseline +- Use `for` clauses to avoid alerting on transient spikes +- Include runbook links in alert messages + +### Data retention + +- Keep high-resolution metrics for 2-4 weeks +- Downsample older data for long-term trends +- Archive raw events to cold storage if needed for compliance + +## What's next? 
+ +- [Monitoring developer guide](/docs/developer-guide/monitoring-development/) - Extend and customize the monitoring system +- [Render farm monitoring concepts](/docs/concepts/render-farm-monitoring/) - Understand the monitoring architecture diff --git a/docs/_docs/user-guides/submitting-jobs.md b/docs/_docs/user-guides/submitting-jobs.md index 9754dffd1..3ec8ae390 100644 --- a/docs/_docs/user-guides/submitting-jobs.md +++ b/docs/_docs/user-guides/submitting-jobs.md @@ -1,6 +1,6 @@ --- title: "Submitting jobs" -nav_order: 31 +nav_order: 34 parent: User Guides layout: default linkTitle: "Submitting jobs" diff --git a/docs/_docs/user-guides/using-cuecmd.md b/docs/_docs/user-guides/using-cuecmd.md index 0f23354bd..0465f3f04 100644 --- a/docs/_docs/user-guides/using-cuecmd.md +++ b/docs/_docs/user-guides/using-cuecmd.md @@ -1,6 +1,6 @@ --- title: "Cuecmd User Guide" -nav_order: 36 +nav_order: 39 parent: "User Guides" layout: default date: 2025-10-02 diff --git a/docs/_docs/user-guides/using-filters.md b/docs/_docs/user-guides/using-filters.md index 0c020e133..802063398 100644 --- a/docs/_docs/user-guides/using-filters.md +++ b/docs/_docs/user-guides/using-filters.md @@ -1,6 +1,6 @@ --- title: "Using Filters" -nav_order: 35 +nav_order: 38 parent: User Guides layout: default date: 2025-10-15 diff --git a/docs/_docs/user-guides/using-rest-api.md b/docs/_docs/user-guides/using-rest-api.md index 21aaf9f50..daccdb06a 100644 --- a/docs/_docs/user-guides/using-rest-api.md +++ b/docs/_docs/user-guides/using-rest-api.md @@ -1,6 +1,6 @@ --- title: "Cue REST API User Guide" -nav_order: 38 +nav_order: 41 parent: User Guides layout: default linkTitle: "Using the OpenCue REST API" diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png new file mode 100644 index 000000000..88a38ebcd Binary files /dev/null and 
b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png new file mode 100644 index 000000000..6195d0518 Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard1.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard2.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard2.png new file mode 100644 index 000000000..b4119b334 Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dashboard2.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png new file mode 100644 index 000000000..9822bf77c Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_elasticsearch_kibana_dev_tools.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png new file mode 100644 index 000000000..a034eb41c Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_grafana_chart.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png b/docs/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png new file mode 100644 index 000000000..73e7e55aa Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_prometheus.png differ diff --git a/docs/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png 
b/docs/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png new file mode 100644 index 000000000..ad4d1885c Binary files /dev/null and b/docs/assets/images/opencue_monitoring/opencue_monitoring_ui_for_apache_kafka.png differ diff --git a/docs/nav_order_index.txt b/docs/nav_order_index.txt index 46ff7d6e2..4d285f902 100644 --- a/docs/nav_order_index.txt +++ b/docs/nav_order_index.txt @@ -13,84 +13,92 @@ 6|/Users/rfigueiredo/github/OpenCue/docs/_docs/quick-starts/quick-start-cuecmd.md 7|/Users/rfigueiredo/github/OpenCue/docs/_docs/quick-starts/quick-start-cuenimby.md 8|/Users/rfigueiredo/github/OpenCue/docs/_docs/quick-starts/quick-start-cueweb.md -9|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/index.md -10|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/opencue-overview.md -11|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/glossary.md -12|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/versioning.md -13|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/nimby.md -14|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/filters-and-actions.md -15|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/command-execution.md -16|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/cueweb-rest-gateway.md -17|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/spi-case-study.md -18|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/index.md -19|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/setting-up-the-database.md -20|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-cuebot.md -21|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-rqd.md -22|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/checking-out-the-source-code.md -23|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-pycue-and-pyoutline.md -24|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cueadmin.md 
-25|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cuegui.md -26|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cuesubmit.md -27|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-rest-gateway.md -28|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-cueweb.md -29|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/index.md -30|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/adding-removing-limits.md -31|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/submitting-jobs.md -32|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/monitoring-jobs.md -33|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuetopia-monitoring-guide.md -34|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuecommander-administration-guide.md -35|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-filters.md -36|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-cuecmd.md -37|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuenimby-user-guide.md -38|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-rest-api.md -39|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cueweb-user-guide.md -40|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/index.md -41|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/configuring-opencue.md -42|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/configuring-limits.md -43|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/customizing-rqd.md -44|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/applying-database-migrations.md -45|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/troubleshooting-deployment.md -46|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/troubleshooting-rendering.md -47|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md 
-48|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/containerized_frames.md -49|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/framelogging-with-loki.md -50|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/desktop-rendering-control.md -51|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/deploying-rest-gateway.md -52|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/cueweb.md -53|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/index.md -54|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/CueGUI-app.md -55|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/commands/cueadmin.md -56|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/commands/pycuerun.md -57|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/rust-rqd.md -58|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/cuecommander-technical-reference.md -59|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/index.md -60|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cueadmin.md -61|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cueman.md -62|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cuecmd.md -63|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cuenimby.md -64|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/filter-actions-reference.md -65|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/rest-api-reference.md -66|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/index.md -67|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/getting-started-tutorial.md -68|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/submitting-first-job.md -69|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/using-cuegui.md -70|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/managing-jobs-frames.md -71|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/multi-layer-jobs.md -72|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueadmin-tutorial.md 
-73|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueman-tutorial.md -74|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cuecmd-tutorial.md -75|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cuenimby-tutorial.md -76|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/filter-tutorial.md -77|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/rest-api-tutorial.md -78|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueweb-tutorial.md -79|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/dcc-integration.md -80|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/index.md -81|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/contributing.md -82|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/sandbox-testing.md -83|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuetopia-technical-reference.md -84|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuecommander-technical-reference.md -85|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuecmd-development.md -86|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuenimby-development.md -87|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/filter-development.md -88|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/rest-gateway-development.md -89|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cueweb-development.md +9|/Users/rfigueiredo/github/OpenCue/docs/_docs/quick-starts/quick-start-monitoring.md +10|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/index.md +11|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/opencue-overview.md +12|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/glossary.md +13|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/versioning.md +14|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/nimby.md +15|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/filters-and-actions.md 
+16|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/command-execution.md +17|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/cueweb-rest-gateway.md +18|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/render-farm-monitoring.md +19|/Users/rfigueiredo/github/OpenCue/docs/_docs/concepts/spi-case-study.md +20|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/index.md +21|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/setting-up-the-database.md +22|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-cuebot.md +23|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-rqd.md +24|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/checking-out-the-source-code.md +25|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-pycue-and-pyoutline.md +26|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cueadmin.md +27|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cuegui.md +28|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/installing-cuesubmit.md +29|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-rest-gateway.md +30|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-cueweb.md +31|/Users/rfigueiredo/github/OpenCue/docs/_docs/getting-started/deploying-monitoring.md +32|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/index.md +33|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/adding-removing-limits.md +34|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/submitting-jobs.md +35|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/monitoring-jobs.md +36|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuetopia-monitoring-guide.md +37|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuecommander-administration-guide.md +38|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-filters.md 
+39|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-cuecmd.md +40|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cuenimby-user-guide.md +41|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/using-rest-api.md +42|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/cueweb-user-guide.md +43|/Users/rfigueiredo/github/OpenCue/docs/_docs/user-guides/render-farm-monitoring-guide.md +44|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/index.md +45|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/configuring-opencue.md +46|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/configuring-limits.md +47|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/customizing-rqd.md +48|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/applying-database-migrations.md +49|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/troubleshooting-deployment.md +50|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/troubleshooting-rendering.md +51|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/monitoring-with-prometheus-loki-and-grafana.md +52|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/containerized_frames.md +53|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/framelogging-with-loki.md +54|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/desktop-rendering-control.md +55|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/deploying-rest-gateway.md +56|/Users/rfigueiredo/github/OpenCue/docs/_docs/other-guides/cueweb.md +57|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/index.md +58|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/CueGUI-app.md +59|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/commands/cueadmin.md +60|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/commands/pycuerun.md +61|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/rust-rqd.md 
+62|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/cuecommander-technical-reference.md +63|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/index.md +64|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cueadmin.md +65|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cueman.md +66|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cuecmd.md +67|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/tools/cuenimby.md +68|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/filter-actions-reference.md +69|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/rest-api-reference.md +70|/Users/rfigueiredo/github/OpenCue/docs/_docs/reference/monitoring-reference.md +71|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/index.md +72|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/getting-started-tutorial.md +73|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/submitting-first-job.md +74|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/using-cuegui.md +75|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/managing-jobs-frames.md +76|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/multi-layer-jobs.md +77|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueadmin-tutorial.md +78|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueman-tutorial.md +79|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cuecmd-tutorial.md +80|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cuenimby-tutorial.md +81|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/filter-tutorial.md +82|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/rest-api-tutorial.md +83|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/cueweb-tutorial.md +84|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/monitoring-tutorial.md +85|/Users/rfigueiredo/github/OpenCue/docs/_docs/tutorials/dcc-integration.md +86|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/index.md 
+87|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/contributing.md +88|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/sandbox-testing.md +89|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuetopia-technical-reference.md +90|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuecommander-technical-reference.md +91|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuecmd-development.md +92|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cuenimby-development.md +93|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/filter-development.md +94|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/rest-gateway-development.md +95|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/cueweb-development.md +96|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/hybrid-rqd-setup.md +97|/Users/rfigueiredo/github/OpenCue/docs/_docs/developer-guide/monitoring-development.md diff --git a/proto/src/monitoring.proto b/proto/src/monitoring.proto new file mode 100644 index 000000000..ccb7c39bc --- /dev/null +++ b/proto/src/monitoring.proto @@ -0,0 +1,454 @@ + +syntax = "proto3"; +package monitoring; + +option java_package = "com.imageworks.spcue.grpc.monitoring"; +option java_multiple_files = true; + +option go_package = "opencue_gateway/gen/go"; + +import "job.proto"; +import "host.proto"; +import "report.proto"; + +// Monitoring Events for Render Farm Statistics +// These events are published to Kafka for downstream processing + +// -------- Enums -------- + +// Types of lifecycle events +enum EventType { + EVENT_TYPE_UNKNOWN = 0; + + // Job events + JOB_CREATED = 1; + JOB_STARTED = 2; + JOB_PAUSED = 3; + JOB_RESUMED = 4; + JOB_FINISHED = 5; + JOB_KILLED = 6; + + // Layer events + LAYER_CREATED = 10; + LAYER_STARTED = 11; + LAYER_COMPLETED = 12; + + // Frame events + FRAME_DISPATCHED = 20; + FRAME_STARTED = 21; + FRAME_COMPLETED = 22; + FRAME_FAILED = 23; + FRAME_RETRIED = 
24; + FRAME_KILLED = 25; + FRAME_EATEN = 26; + FRAME_CHECKPOINT = 27; + + // Host events + HOST_BOOT = 31; + HOST_STATE_CHANGED = 32; + HOST_LOCKED = 33; + HOST_UNLOCKED = 34; + + // Proc events + PROC_BOOKED = 40; + PROC_UNBOOKED = 41; + PROC_REDIRECTED = 42; +} + +// -------- Base Event Message -------- + +// Common header for all monitoring events +message EventHeader { + // Unique event ID (UUID) + string event_id = 1; + + // Event type + EventType event_type = 2; + + // Timestamp when event occurred (Unix epoch milliseconds) + int64 timestamp = 3; + + // Source cuebot instance that generated the event + string source_cuebot = 4; + + // Optional correlation ID for tracing related events + string correlation_id = 5; +} + +// -------- Job Events -------- + +message JobEvent { + EventHeader header = 1; + + // Embedded job data (uses composition instead of duplicating fields) + job.Job job = 2; + + // Event-specific fields + job.JobState previous_state = 3; + + // Kill/finish reason if applicable + string reason = 4; + string killed_by = 5; +} + +// -------- Layer Events -------- + +message LayerEvent { + EventHeader header = 1; + + // Embedded layer data + job.Layer layer = 2; + + // Context fields (not in Layer message) + string job_id = 3; + string job_name = 4; + string show = 5; +} + +// -------- Frame Events -------- + +message FrameEvent { + EventHeader header = 1; + + // Embedded frame data + job.Frame frame = 2; + + // Context fields (not in Frame message) + string layer_id = 3; + string job_id = 4; + string job_name = 5; + string show = 6; + + // Event-specific fields + job.FrameState previous_state = 7; + int32 exit_signal = 8; + int32 run_time = 9; + + // Resource allocation (not in Frame message) + int32 num_cores = 10; + int32 num_gpus = 11; + + // Host information + string host_name = 12; + string resource_id = 13; + + // Kill reason if applicable + string reason = 14; + string killed_by = 15; +} + +// -------- Host Events -------- + +message 
HostEvent { + EventHeader header = 1; + + // Embedded host data + host.Host host = 2; + + // Context fields (not in Host message) + string facility = 3; + + // Event-specific fields + host.HardwareState previous_state = 4; + host.LockState previous_lock_state = 5; + bool nimby_locked = 6; + + // Reason for state change if applicable + string reason = 7; +} + +// -------- Proc Events -------- + +message ProcEvent { + EventHeader header = 1; + + // Proc identification + string proc_id = 2; + string proc_name = 3; + string host_id = 4; + string host_name = 5; + + // Assignment information + string job_id = 6; + string job_name = 7; + string layer_id = 8; + string layer_name = 9; + string frame_id = 10; + string frame_name = 11; + string show = 12; + string group_name = 13; + + // Resource reservation + float reserved_cores = 14; + float reserved_gpus = 15; + int64 reserved_memory = 16; + int64 reserved_gpu_memory = 17; + + // Booking information + int32 dispatch_time = 18; + int32 booked_time = 19; + bool is_local_dispatch = 20; + bool is_unbooked = 21; + + // Redirect target if applicable + string redirect_target = 22; + + // Services + repeated string services = 23; +} + +// -------- Aggregate Statistics Event -------- + +// Periodic snapshot of farm-wide statistics +message FarmStatisticsEvent { + EventHeader header = 1; + + // Snapshot time + int64 snapshot_time = 2; + + // Job statistics + int32 total_jobs = 3; + int32 pending_jobs = 4; + int32 finished_jobs = 5; + + // Frame statistics + int64 total_frames = 6; + int64 waiting_frames = 7; + int64 running_frames = 8; + int64 succeeded_frames = 9; + int64 dead_frames = 10; + int64 eaten_frames = 11; + int64 depend_frames = 12; + + // Host statistics + int32 total_hosts = 13; + int32 up_hosts = 14; + int32 down_hosts = 15; + int32 repair_hosts = 16; + int32 nimby_locked_hosts = 17; + + // Resource utilization + float total_cores = 18; + float running_cores = 19; + float idle_cores = 20; + float total_gpus = 21; + 
float running_gpus = 22; + float idle_gpus = 23; + int64 total_memory = 24; + int64 used_memory = 25; + + // Per-show breakdown + repeated ShowStatistics show_stats = 26; +} + +message ShowStatistics { + string show = 1; + int32 pending_jobs = 2; + int64 running_frames = 3; + int64 waiting_frames = 4; + float reserved_cores = 5; + float reserved_gpus = 6; +} + +// -------- Services -------- + +// Service for querying historical monitoring data +service MonitoringInterface { + // Get historical job events + rpc GetJobHistory(GetJobHistoryRequest) returns (GetJobHistoryResponse); + + // Get historical frame events for a job + rpc GetFrameHistory(GetFrameHistoryRequest) returns (GetFrameHistoryResponse); + + // Get historical layer events for a job + rpc GetLayerHistory(GetLayerHistoryRequest) returns (GetLayerHistoryResponse); + + // Get historical host events + rpc GetHostHistory(GetHostHistoryRequest) returns (GetHostHistoryResponse); + + // Get aggregated statistics over a time range + rpc GetFarmStatistics(GetFarmStatisticsRequest) returns (GetFarmStatisticsResponse); + + // Get memory usage history for a layer + rpc GetLayerMemoryHistory(GetLayerMemoryHistoryRequest) returns (GetLayerMemoryHistoryResponse); +} + +// -------- Requests & Responses -------- + +// Time range for historical queries +message TimeRange { + int64 start_time = 1; // Unix epoch milliseconds + int64 end_time = 2; // Unix epoch milliseconds +} + +// Pagination for results +message Pagination { + int32 page = 1; + int32 page_size = 2; + int32 total_pages = 3; + int64 total_records = 4; +} + +// GetJobHistory +message GetJobHistoryRequest { + // Filter criteria + repeated string shows = 1; + repeated string users = 2; + repeated string shots = 3; + repeated string job_name_regex = 4; + repeated job.JobState states = 5; + TimeRange time_range = 6; + + // Pagination + int32 page = 7; + int32 page_size = 8; + int32 max_results = 9; +} + +message GetJobHistoryResponse { + repeated HistoricalJob 
jobs = 1; + Pagination pagination = 2; +} + +message HistoricalJob { + string id = 1; + string name = 2; + string show = 3; + string shot = 4; + string user = 5; + string facility = 6; + job.JobState final_state = 7; + int32 start_time = 8; + int32 stop_time = 9; + int32 priority = 10; + + // Final statistics + int32 total_frames = 11; + int32 succeeded_frames = 12; + int32 failed_frames = 13; + int64 total_core_seconds = 14; + int64 total_gpu_seconds = 15; + int64 max_rss = 16; +} + +// GetFrameHistory +message GetFrameHistoryRequest { + string job_id = 1; + string job_name = 2; + repeated string layer_names = 3; + repeated job.FrameState states = 4; + TimeRange time_range = 5; + + // Pagination + int32 page = 6; + int32 page_size = 7; +} + +message GetFrameHistoryResponse { + repeated HistoricalFrame frames = 1; + Pagination pagination = 2; +} + +message HistoricalFrame { + string id = 1; + string name = 2; + string layer_name = 3; + string job_name = 4; + string show = 5; + int32 frame_number = 6; + job.FrameState final_state = 7; + int32 exit_status = 8; + int32 retry_count = 9; + int32 start_time = 10; + int32 stop_time = 11; + int64 max_rss = 12; + string last_host = 13; + int32 total_core_time = 14; + int32 total_gpu_time = 15; +} + +// GetLayerHistory +message GetLayerHistoryRequest { + string job_id = 1; + string job_name = 2; + TimeRange time_range = 3; + + // Pagination + int32 page = 4; + int32 page_size = 5; +} + +message GetLayerHistoryResponse { + repeated HistoricalLayer layers = 1; + Pagination pagination = 2; +} + +message HistoricalLayer { + string id = 1; + string name = 2; + string job_name = 3; + string show = 4; + job.LayerType type = 5; + repeated string tags = 6; + repeated string services = 7; + int32 total_frames = 8; + int32 succeeded_frames = 9; + int32 failed_frames = 10; + int64 total_core_seconds = 11; + int64 total_gpu_seconds = 12; + int64 max_rss = 13; + int64 avg_frame_seconds = 14; +} + +// GetHostHistory +message 
GetHostHistoryRequest { + repeated string host_names = 1; + repeated string facilities = 2; + repeated string allocations = 3; + repeated host.HardwareState states = 4; + TimeRange time_range = 5; + + // Pagination + int32 page = 6; + int32 page_size = 7; +} + +message GetHostHistoryResponse { + repeated HostEvent events = 1; + Pagination pagination = 2; +} + +// GetFarmStatistics +message GetFarmStatisticsRequest { + TimeRange time_range = 1; + int32 interval_minutes = 2; // Aggregation interval + repeated string shows = 3; // Filter by shows +} + +message GetFarmStatisticsResponse { + repeated FarmStatisticsEvent statistics = 1; +} + +// GetLayerMemoryHistory +message GetLayerMemoryHistoryRequest { + string layer_name = 1; + repeated string shows = 2; + TimeRange time_range = 3; + int32 max_results = 4; +} + +message GetLayerMemoryHistoryResponse { + repeated LayerMemoryRecord records = 1; +} + +message LayerMemoryRecord { + string job_name = 1; + string layer_name = 2; + string show = 3; + int32 timestamp = 4; + int64 max_rss = 5; + int64 reserved_memory = 6; + int32 frame_count = 7; + int64 avg_frame_memory = 8; + int64 p95_frame_memory = 9; +} diff --git a/pycue/opencue/cuebot.py b/pycue/opencue/cuebot.py index 8d5c530d1..4de529e6f 100644 --- a/pycue/opencue/cuebot.py +++ b/pycue/opencue/cuebot.py @@ -58,6 +58,8 @@ from opencue_proto import subscription_pb2_grpc from opencue_proto import task_pb2 from opencue_proto import task_pb2_grpc +from opencue_proto import monitoring_pb2 +from opencue_proto import monitoring_pb2_grpc from opencue.exception import ConnectionException from opencue.exception import CueException import opencue.config @@ -107,6 +109,7 @@ class Cuebot(object): 'layer': job_pb2, 'limit': limit_pb2, 'matcher': filter_pb2, + 'monitoring': monitoring_pb2, 'owner': host_pb2, 'proc': host_pb2, 'renderPartition': renderPartition_pb2, @@ -132,6 +135,7 @@ class Cuebot(object): 'layer': job_pb2_grpc.LayerInterfaceStub, 'limit': 
limit_pb2_grpc.LimitInterfaceStub, 'matcher': filter_pb2_grpc.MatcherInterfaceStub, + 'monitoring': monitoring_pb2_grpc.MonitoringInterfaceStub, 'owner': host_pb2_grpc.OwnerInterfaceStub, 'proc': host_pb2_grpc.ProcInterfaceStub, 'renderPartition': renderPartition_pb2_grpc.RenderPartitionInterfaceStub, diff --git a/pycue/opencue/wrappers/monitoring.py b/pycue/opencue/wrappers/monitoring.py new file mode 100644 index 000000000..86c39d9ce --- /dev/null +++ b/pycue/opencue/wrappers/monitoring.py @@ -0,0 +1,649 @@ +# Copyright Contributors to the OpenCue Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Monitoring and Historical Data Access Wrappers. + +This module provides access to historical render farm statistics including +job history, frame history, layer memory usage, and host metrics over time. 
+""" + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +# Import monitoring_pb2 - this module should only be used when monitoring is available +try: + from opencue_proto import monitoring_pb2 +except ImportError: + monitoring_pb2 = None + +from opencue.cuebot import Cuebot +from opencue import util + + +class HistoricalJob: + """Represents a historical job record from the monitoring system.""" + + def __init__(self, data): + self.data = data + + @property + def id(self): + """Returns the job ID.""" + return self.data.id + + @property + def name(self): + """Returns the job name.""" + return self.data.name + + @property + def show(self): + """Returns the show name.""" + return self.data.show + + @property + def shot(self): + """Returns the shot name.""" + return self.data.shot + + @property + def user(self): + """Returns the username who submitted the job.""" + return self.data.user + + @property + def facility(self): + """Returns the facility name.""" + return self.data.facility + + @property + def finalState(self): + """Returns the final job state.""" + return self.data.final_state + + @property + def startTime(self): + """Returns the job start time as Unix timestamp.""" + return self.data.start_time + + @property + def stopTime(self): + """Returns the job stop time as Unix timestamp.""" + return self.data.stop_time + + @property + def priority(self): + """Returns the job priority.""" + return self.data.priority + + @property + def totalFrames(self): + """Returns the total number of frames.""" + return self.data.total_frames + + @property + def succeededFrames(self): + """Returns the number of succeeded frames.""" + return self.data.succeeded_frames + + @property + def failedFrames(self): + """Returns the number of failed frames.""" + return self.data.failed_frames + + @property + def totalCoreSeconds(self): + """Returns total core-seconds consumed.""" + return self.data.total_core_seconds + + 
@property + def totalGpuSeconds(self): + """Returns total GPU-seconds consumed.""" + return self.data.total_gpu_seconds + + @property + def maxRss(self): + """Returns maximum RSS memory usage in KB.""" + return self.data.max_rss + + def __repr__(self): + return f"HistoricalJob({self.name})" + + +class HistoricalFrame: + """Represents a historical frame record from the monitoring system.""" + + def __init__(self, data): + self.data = data + + @property + def id(self): + """Returns the frame ID.""" + return self.data.id + + @property + def name(self): + """Returns the frame name.""" + return self.data.name + + @property + def layerName(self): + """Returns the layer name.""" + return self.data.layer_name + + @property + def jobName(self): + """Returns the job name.""" + return self.data.job_name + + @property + def show(self): + """Returns the show name.""" + return self.data.show + + @property + def frameNumber(self): + """Returns the frame number.""" + return self.data.frame_number + + @property + def finalState(self): + """Returns the final frame state.""" + return self.data.final_state + + @property + def exitStatus(self): + """Returns the exit status code.""" + return self.data.exit_status + + @property + def retryCount(self): + """Returns the number of retries.""" + return self.data.retry_count + + @property + def startTime(self): + """Returns the start time as Unix timestamp.""" + return self.data.start_time + + @property + def stopTime(self): + """Returns the stop time as Unix timestamp.""" + return self.data.stop_time + + @property + def maxRss(self): + """Returns maximum RSS memory usage in KB.""" + return self.data.max_rss + + @property + def lastHost(self): + """Returns the last host that ran this frame.""" + return self.data.last_host + + @property + def totalCoreTime(self): + """Returns total core time in seconds.""" + return self.data.total_core_time + + @property + def totalGpuTime(self): + """Returns total GPU time in seconds.""" + return 
self.data.total_gpu_time + + def __repr__(self): + return f"HistoricalFrame({self.name})" + + +class HistoricalLayer: + """Represents a historical layer record from the monitoring system.""" + + def __init__(self, data): + self.data = data + + @property + def id(self): + """Returns the layer ID.""" + return self.data.id + + @property + def name(self): + """Returns the layer name.""" + return self.data.name + + @property + def jobName(self): + """Returns the job name.""" + return self.data.job_name + + @property + def show(self): + """Returns the show name.""" + return self.data.show + + @property + def layerType(self): + """Returns the layer type.""" + return self.data.type + + @property + def tags(self): + """Returns the layer tags.""" + return list(self.data.tags) + + @property + def services(self): + """Returns the layer services.""" + return list(self.data.services) + + @property + def totalFrames(self): + """Returns the total number of frames.""" + return self.data.total_frames + + @property + def succeededFrames(self): + """Returns the number of succeeded frames.""" + return self.data.succeeded_frames + + @property + def failedFrames(self): + """Returns the number of failed frames.""" + return self.data.failed_frames + + @property + def totalCoreSeconds(self): + """Returns total core-seconds consumed.""" + return self.data.total_core_seconds + + @property + def totalGpuSeconds(self): + """Returns total GPU-seconds consumed.""" + return self.data.total_gpu_seconds + + @property + def maxRss(self): + """Returns maximum RSS memory usage in KB.""" + return self.data.max_rss + + @property + def avgFrameSeconds(self): + """Returns average frame render time in seconds.""" + return self.data.avg_frame_seconds + + def __repr__(self): + return f"HistoricalLayer({self.name})" + + +class LayerMemoryRecord: + """Represents a historical layer memory usage record.""" + + def __init__(self, data): + self.data = data + + @property + def jobName(self): + """Returns the job 
name.""" + return self.data.job_name + + @property + def layerName(self): + """Returns the layer name.""" + return self.data.layer_name + + @property + def show(self): + """Returns the show name.""" + return self.data.show + + @property + def timestamp(self): + """Returns the timestamp as Unix epoch.""" + return self.data.timestamp + + @property + def maxRss(self): + """Returns maximum RSS memory usage in KB.""" + return self.data.max_rss + + @property + def reservedMemory(self): + """Returns reserved memory in KB.""" + return self.data.reserved_memory + + @property + def frameCount(self): + """Returns the number of frames in this record.""" + return self.data.frame_count + + @property + def avgFrameMemory(self): + """Returns average frame memory usage in KB.""" + return self.data.avg_frame_memory + + @property + def p95FrameMemory(self): + """Returns 95th percentile frame memory usage in KB.""" + return self.data.p95_frame_memory + + def __repr__(self): + return f"LayerMemoryRecord({self.layerName}@{self.timestamp})" + + +class FarmStatistics: + """Represents a snapshot of farm-wide statistics.""" + + def __init__(self, data): + self.data = data + + @property + def snapshotTime(self): + """Returns the timestamp of this snapshot.""" + return self.data.snapshot_time + + @property + def totalJobs(self): + """Returns total number of jobs.""" + return self.data.total_jobs + + @property + def pendingJobs(self): + """Returns number of pending jobs.""" + return self.data.pending_jobs + + @property + def finishedJobs(self): + """Returns number of finished jobs.""" + return self.data.finished_jobs + + @property + def totalFrames(self): + """Returns total number of frames.""" + return self.data.total_frames + + @property + def waitingFrames(self): + """Returns number of waiting frames.""" + return self.data.waiting_frames + + @property + def runningFrames(self): + """Returns number of running frames.""" + return self.data.running_frames + + @property + def 
succeededFrames(self): + """Returns number of succeeded frames.""" + return self.data.succeeded_frames + + @property + def deadFrames(self): + """Returns number of dead frames.""" + return self.data.dead_frames + + @property + def totalHosts(self): + """Returns total number of hosts.""" + return self.data.total_hosts + + @property + def upHosts(self): + """Returns number of hosts in UP state.""" + return self.data.up_hosts + + @property + def downHosts(self): + """Returns number of hosts in DOWN state.""" + return self.data.down_hosts + + @property + def totalCores(self): + """Returns total cores in the farm.""" + return self.data.total_cores + + @property + def runningCores(self): + """Returns number of cores currently running frames.""" + return self.data.running_cores + + @property + def idleCores(self): + """Returns number of idle cores.""" + return self.data.idle_cores + + @property + def showStats(self): + """Returns per-show statistics.""" + return [ShowStatistics(s) for s in self.data.show_stats] + + def __repr__(self): + return f"FarmStatistics({self.snapshotTime})" + + +class ShowStatistics: + """Represents per-show statistics within a farm snapshot.""" + + def __init__(self, data): + self.data = data + + @property + def show(self): + """Returns the show name.""" + return self.data.show + + @property + def pendingJobs(self): + """Returns number of pending jobs for this show.""" + return self.data.pending_jobs + + @property + def runningFrames(self): + """Returns number of running frames for this show.""" + return self.data.running_frames + + @property + def waitingFrames(self): + """Returns number of waiting frames for this show.""" + return self.data.waiting_frames + + @property + def reservedCores(self): + """Returns reserved cores for this show.""" + return self.data.reserved_cores + + @property + def reservedGpus(self): + """Returns reserved GPUs for this show.""" + return self.data.reserved_gpus + + def __repr__(self): + return 
f"ShowStatistics({self.show})" + + +class TimeRange: + """Represents a time range for historical queries.""" + + def __init__(self, start_time, end_time): + """ + Create a time range. + + :param start_time: Start time as Unix epoch milliseconds + :param end_time: End time as Unix epoch milliseconds + """ + self.start_time = start_time + self.end_time = end_time + + def toProto(self): + """Convert to protobuf TimeRange message.""" + return monitoring_pb2.TimeRange( + start_time=self.start_time, + end_time=self.end_time + ) + + +# API Functions for Historical Data Access + +@util.grpcExceptionParser +def getJobHistory(shows=None, users=None, shots=None, job_name_regex=None, + states=None, time_range=None, page=1, page_size=100, max_results=1000): + """ + Query historical job records with optional filters. + + :param shows: List of show names to filter by + :param users: List of usernames to filter by + :param shots: List of shot names to filter by + :param job_name_regex: Regular expression pattern for job names + :param states: List of job states to filter by + :param time_range: TimeRange object for time-based filtering + :param page: Page number for pagination (1-indexed) + :param page_size: Number of results per page + :param max_results: Maximum total results to return + :rtype: list[HistoricalJob] + :return: List of HistoricalJob objects + """ + request = monitoring_pb2.GetJobHistoryRequest( + shows=shows or [], + users=users or [], + shots=shots or [], + job_name_regex=job_name_regex or [], + states=states or [], + page=page, + page_size=page_size, + max_results=max_results + ) + + if time_range: + request.time_range.CopyFrom(time_range.toProto()) + + response = Cuebot.getStub('monitoring').GetJobHistory( + request, timeout=Cuebot.Timeout) + + return [HistoricalJob(j) for j in response.jobs] + + +@util.grpcExceptionParser +def getFrameHistory(job_id=None, job_name=None, layer_names=None, states=None, + time_range=None, page=1, page_size=100): + """ + Query 
historical frame records for a specific job. + + :param job_id: Job ID to query (required if job_name not provided) + :param job_name: Job name to query (required if job_id not provided) + :param layer_names: List of layer names to filter by + :param states: List of frame states to filter by + :param time_range: TimeRange object for time-based filtering + :param page: Page number for pagination (1-indexed) + :param page_size: Number of results per page + :rtype: list[HistoricalFrame] + :return: List of HistoricalFrame objects + """ + request = monitoring_pb2.GetFrameHistoryRequest( + job_id=job_id or "", + job_name=job_name or "", + layer_names=layer_names or [], + states=states or [], + page=page, + page_size=page_size + ) + + if time_range: + request.time_range.CopyFrom(time_range.toProto()) + + response = Cuebot.getStub('monitoring').GetFrameHistory( + request, timeout=Cuebot.Timeout) + + return [HistoricalFrame(f) for f in response.frames] + + +@util.grpcExceptionParser +def getLayerHistory(job_id=None, job_name=None, time_range=None, page=1, page_size=100): + """ + Query historical layer records for a specific job. 
+ + :param job_id: Job ID to query (required if job_name not provided) + :param job_name: Job name to query (required if job_id not provided) + :param time_range: TimeRange object for time-based filtering + :param page: Page number for pagination (1-indexed) + :param page_size: Number of results per page + :rtype: list[HistoricalLayer] + :return: List of HistoricalLayer objects + """ + request = monitoring_pb2.GetLayerHistoryRequest( + job_id=job_id or "", + job_name=job_name or "", + page=page, + page_size=page_size + ) + + if time_range: + request.time_range.CopyFrom(time_range.toProto()) + + response = Cuebot.getStub('monitoring').GetLayerHistory( + request, timeout=Cuebot.Timeout) + + return [HistoricalLayer(layer) for layer in response.layers] + + +@util.grpcExceptionParser +def getLayerMemoryHistory(layer_name, shows=None, time_range=None, max_results=1000): + """ + Query historical memory usage for a specific layer type. + This is useful for memory prediction based on historical data. + + :param layer_name: Layer name pattern to query + :param shows: List of show names to filter by + :param time_range: TimeRange object for time-based filtering + :param max_results: Maximum number of records to return + :rtype: list[LayerMemoryRecord] + :return: List of LayerMemoryRecord objects + """ + request = monitoring_pb2.GetLayerMemoryHistoryRequest( + layer_name=layer_name, + shows=shows or [], + max_results=max_results + ) + + if time_range: + request.time_range.CopyFrom(time_range.toProto()) + + response = Cuebot.getStub('monitoring').GetLayerMemoryHistory( + request, timeout=Cuebot.Timeout) + + return [LayerMemoryRecord(r) for r in response.records] + + +@util.grpcExceptionParser +def getFarmStatistics(time_range=None, interval_minutes=60, shows=None): + """ + Query aggregated farm statistics over a time range. 
+ + :param time_range: TimeRange object for time-based filtering + :param interval_minutes: Aggregation interval in minutes + :param shows: List of show names to filter by + :rtype: list[FarmStatistics] + :return: List of FarmStatistics snapshots + """ + request = monitoring_pb2.GetFarmStatisticsRequest( + interval_minutes=interval_minutes, + shows=shows or [] + ) + + if time_range: + request.time_range.CopyFrom(time_range.toProto()) + + response = Cuebot.getStub('monitoring').GetFarmStatistics( + request, timeout=Cuebot.Timeout) + + return [FarmStatistics(s) for s in response.statistics] diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 1730e927e..8328cdb04 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -68,6 +68,56 @@ dependencies = [ "winapi", ] +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + [[package]] name = "anyhow" version = "1.0.99" @@ -80,6 +130,19 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" +[[package]] +name = "async-compression" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93c1f86859c1af3d514fa19e8323147ff10ea98684e6c7b307912509f50e67b2" +dependencies = [ + "compression-codecs", + "compression-core", + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "async-stream" version = "0.1.2" @@ -144,8 +207,8 @@ dependencies = [ "axum-core", "bytes", "futures-util", - "http", - "http-body", + "http 1.3.1", + "http-body 1.0.1", "http-body-util", "itoa", "matchit", @@ -155,7 +218,7 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper", + "sync_wrapper 1.0.2", "tower", "tower-layer", "tower-service", @@ -169,13 +232,13 @@ checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" dependencies = [ "bytes", "futures-core", - "http", - "http-body", + "http 1.3.1", + "http-body 1.0.1", "http-body-util", "mime", "pin-project-lite", "rustversion", - "sync_wrapper", + "sync_wrapper 1.0.2", "tower-layer", "tower-service", ] @@ -204,6 +267,12 @@ dependencies = [ "backtrace", ] +[[package]] +name = "base64" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b41b7ea54a0c9d92199de89e20e58d49f02f8e699814ef3fdf266f6f748d15c7" + [[package]] name = "base64" version = "0.21.7" @@ -261,9 +330,9 @@ dependencies = [ "futures-core", "futures-util", "hex", - "http", + "http 1.3.1", "http-body-util", - "hyper", + "hyper 1.6.0", "hyper-named-pipe", "hyper-util", "hyperlocal", @@ -290,7 +359,7 @@ checksum = 
"3f179cfbddb6e77a5472703d4b30436bff32929c0aa8a9008ecf23d1d3cdd0da" dependencies = [ "serde", "serde_repr", - "serde_with", + "serde_with 3.14.0", ] [[package]] @@ -353,7 +422,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-link", + "windows-link 0.1.3", ] [[package]] @@ -365,12 +434,84 @@ dependencies = [ "ansi_term", "atty", "bitflags 1.3.2", - "strsim", + "strsim 0.8.0", "textwrap 0.11.0", "unicode-width 0.1.14", "vec_map", ] +[[package]] +name = "clap" +version = "4.5.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim 0.11.1", +] + +[[package]] +name = "clap_derive" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.104", +] + +[[package]] +name = "clap_lex" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" + +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "compression-codecs" +version = "0.4.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "680dc087785c5230f8e8843e2e57ac7c1c90488b6a91b88caa265410568f441b" +dependencies = [ + "compression-core", + "flate2", + "memchr", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + [[package]] name = "config" version = "0.14.1" @@ -532,6 +673,41 @@ dependencies = [ "typenum", ] +[[package]] +name = "darling" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a01d95850c592940db9b8194bc39f4bc0e89dee5c4265e4b1807c34a9aba453c" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859d65a907b6852c9361e3185c862aae7fafd2887876799fa55f5f99dc40d610" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c972679f83bdf9c42bd905396b6c3588a843a17f0f16dfcfa3e2c5d57441835" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -646,6 +822,26 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "elasticsearch" +version = "8.5.0-alpha.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40d9bd57d914cc66ce878f098f63ed7b5d5b64c30644a5adb950b008f874a6c6" +dependencies = [ + "base64 0.11.0", + "bytes", + "dyn-clone", + "lazy_static", + "percent-encoding", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "serde_with 1.14.0", + "url", + "void", +] + 
[[package]] name = "encoding_rs" version = "0.8.35" @@ -705,6 +901,21 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -854,6 +1065,25 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap 2.10.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "h2" version = "0.4.11" @@ -865,7 +1095,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http", + "http 1.3.1", "indexmap 2.10.0", "slab", "tokio", @@ -943,6 +1173,17 @@ dependencies = [ "digest", ] +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http" version = "1.3.1" @@ -954,6 +1195,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + [[package]] name = "http-body" version = "1.0.1" @@ -961,7 +1213,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http", + "http 1.3.1", ] [[package]] @@ -972,8 +1224,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" dependencies = [ "bytes", "futures-core", - "http", - "http-body", + "http 1.3.1", + "http-body 1.0.1", "pin-project-lite", ] @@ -1005,6 +1257,30 @@ dependencies = [ "serde", ] +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "hyper" version = "1.6.0" @@ -1014,9 +1290,9 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2", - "http", - "http-body", + "h2 0.4.11", + "http 1.3.1", + "http-body 1.0.1", "httparse", "httpdate", "itoa", @@ -1033,7 +1309,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" dependencies = [ "hex", - "hyper", + "hyper 1.6.0", "hyper-util", "pin-project-lite", "tokio", @@ -1047,13 +1323,26 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper", + "hyper 1.6.0", "hyper-util", "pin-project-lite", "tokio", "tower-service", ] +[[package]] +name = 
"hyper-tls" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" +dependencies = [ + "bytes", + "hyper 0.14.32", + "native-tls", + "tokio", + "tokio-native-tls", +] + [[package]] name = "hyper-util" version = "0.1.15" @@ -1064,9 +1353,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "http", - "http-body", - "hyper", + "http 1.3.1", + "http-body 1.0.1", + "hyper 1.6.0", "libc", "pin-project-lite", "socket2 0.5.10", @@ -1083,7 +1372,7 @@ checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" dependencies = [ "hex", "http-body-util", - "hyper", + "hyper 1.6.0", "hyper-util", "pin-project-lite", "tokio", @@ -1200,6 +1489,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -1254,6 +1549,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + [[package]] name = "ipnetwork" version = "0.20.0" @@ -1269,6 +1570,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45" +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.13.0" @@ -1314,6 +1621,26 @@ dependencies = [ "serde", ] +[[package]] +name = "kafka-es-indexer" +version = "0.1.5" +dependencies = [ + "anyhow", + "chrono", + "clap 4.5.53", + "config", + 
"elasticsearch", + "futures", + "rdkafka", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "tracing", + "tracing-subscriber", + "url", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -1326,6 +1653,18 @@ version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +[[package]] +name = "libz-sys" +version = "1.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15d118bbf3771060e7311cc7bb0545b01d08a8b4a7de949198dec1fa0ca1c0f7" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -1370,6 +1709,15 @@ dependencies = [ "core-foundation-sys", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "matchit" version = "0.8.4" @@ -1460,6 +1808,23 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "nix" version = "0.29.0" @@ -1521,6 +1886,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_enum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" +dependencies = [ + "num_enum_derive", + "rustversion", +] + +[[package]] +name = 
"num_enum_derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "object" version = "0.36.7" @@ -1531,26 +1918,86 @@ dependencies = [ ] [[package]] -name = "once_cell" -version = "1.21.3" +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "opencue-proto" +version = "0.1.5" +dependencies = [ + "prost", + "prost-types", + "rand 0.8.5", + "rmp", + "rmp-serde", + "serde", + "serde_derive", + "tonic", + "tonic-build", + "uuid", + "whoami", +] + +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.104", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-src" +version = "300.5.4+3.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a507b3792995dae9b0df8a1c1e3771e8418b7c2d9f0baeba32e6fe8b06c7cb72" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" - -[[package]] -name = "opencue-proto" -version = "0.1.5" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" dependencies = [ - "prost", - "prost-types", - "rand 0.8.5", - "rmp", - "rmp-serde", - "serde", - "serde_derive", - "tonic", - "tonic-build", - "uuid", - "whoami", + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", ] [[package]] @@ -1874,6 +2321,15 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit 0.23.5", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2053,6 +2509,38 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rdkafka" +version = "0.36.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1beea247b9a7600a81d4cc33f659ce1a77e1988323d7d2809c7ed1c21f4c316d" +dependencies = [ + "futures-channel", + "futures-util", + "libc", + "log", + "rdkafka-sys", + "serde", + "serde_derive", + "serde_json", + "slab", + "tokio", +] + +[[package]] +name = "rdkafka-sys" +version = "4.9.0+2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5230dca48bc354d718269f3e4353280e188b610f7af7e2fcf54b7a79d5802872" +dependencies = [ + "cmake", + "libc", + "libz-sys", + "num_enum", + "openssl-sys", + "pkg-config", +] + [[package]] name = "readkey" version = "0.2.2" @@ -2123,6 +2611,48 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 
+[[package]] +name = "reqwest" +version = "0.11.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62" +dependencies = [ + "async-compression", + "base64 0.21.7", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper-tls", + "ipnet", + "js-sys", + "log", + "mime", + "native-tls", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls-pemfile 1.0.4", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper 0.1.2", + "system-configuration", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg", +] + [[package]] name = "ring" version = "0.17.14" @@ -2186,8 +2716,8 @@ dependencies = [ "device_query", "futures", "futures-core", - "http", - "http-body", + "http 1.3.1", + "http-body 1.0.1", "http-body-util", "humantime", "humantime-serde", @@ -2239,6 +2769,15 @@ version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.0.7" @@ -2267,6 +2806,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64 0.21.7", +] + [[package]] name = "rustls-pemfile" version = "2.2.0" @@ -2308,6 +2856,15 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "schemars" version = "0.9.0" @@ -2338,20 +2895,68 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.9.1", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -2402,6 +3007,16 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678b5a069e50bf00ecd22d0cd8ddf7c236f68581b03db652061ed5eb13a312ff" +dependencies = [ + "serde", + "serde_with_macros", +] + [[package]] name = "serde_with" version = "3.14.0" @@ -2421,6 +3036,18 @@ dependencies = [ "time", ] +[[package]] +name = "serde_with_macros" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e182d6ec6f05393cc0e5ed1bf81ad6db3a8feedf8ee515ecdd369809bcce8082" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "sha2" version = "0.10.9" @@ -2517,13 +3144,25 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "structopt" version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" dependencies = [ - "clap", + "clap 2.34.0", "lazy_static", "structopt-derive", ] @@ -2590,6 +3229,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + [[package]] name = "sync_wrapper" version = "1.0.2" @@ -2621,6 +3266,27 @@ dependencies = [ "windows 0.57.0", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.20.0" @@ -2808,6 +3474,16 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-postgres" version = "0.7.13" @@ -2866,8 +3542,8 @@ checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", - "toml_datetime", - "toml_edit", + "toml_datetime 0.6.11", + "toml_edit 0.22.27", ] [[package]] @@ -2879,6 +3555,15 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" 
+dependencies = [ + "serde_core", +] + [[package]] name = "toml_edit" version = "0.22.27" @@ -2888,11 +3573,32 @@ dependencies = [ "indexmap 2.10.0", "serde", "serde_spanned", - "toml_datetime", + "toml_datetime 0.6.11", "toml_write", "winnow", ] +[[package]] +name = "toml_edit" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2ad0b7ae9cfeef5605163839cb9221f453399f15cfb5c10be9885fcf56611f9" +dependencies = [ + "indexmap 2.10.0", + "toml_datetime 0.7.3", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" +dependencies = [ + "winnow", +] + [[package]] name = "toml_write" version = "0.1.2" @@ -2909,11 +3615,11 @@ dependencies = [ "axum", "base64 0.22.1", "bytes", - "h2", - "http", - "http-body", + "h2 0.4.11", + "http 1.3.1", + "http-body 1.0.1", "http-body-util", - "hyper", + "hyper 1.6.0", "hyper-timeout", "hyper-util", "percent-encoding", @@ -2953,7 +3659,7 @@ dependencies = [ "indexmap 2.10.0", "pin-project-lite", "slab", - "sync_wrapper", + "sync_wrapper 1.0.2", "tokio", "tokio-util", "tower-layer", @@ -3037,18 +3743,35 @@ dependencies = [ "chrono", ] +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + [[package]] name = "tracing-subscriber" version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2054a14f5307d601f88daf0553e1cbf472acc4f2c51afab632431cdcd72124d5" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", + "tracing-serde", ] [[package]] @@ -3138,7 
+3861,7 @@ dependencies = [ "log", "percent-encoding", "rustls", - "rustls-pemfile", + "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", "serde_json", @@ -3154,7 +3877,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5b6cabebbecc4c45189ab06b52f956206cea7d8c8a20851c35a85cb169224cc" dependencies = [ "base64 0.22.1", - "http", + "http 1.3.1", "httparse", "log", ] @@ -3193,6 +3916,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.17.0" @@ -3211,6 +3940,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "vec_map" version = "0.8.2" @@ -3223,6 +3958,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + [[package]] name = "want" version = "0.3.1" @@ -3279,6 +4020,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + 
"once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.100" @@ -3402,7 +4156,7 @@ checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" dependencies = [ "windows-implement 0.60.0", "windows-interface 0.59.1", - "windows-link", + "windows-link 0.1.3", "windows-result 0.3.4", "windows-strings", ] @@ -3457,6 +4211,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-result" version = "0.1.2" @@ -3472,7 +4232,7 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" dependencies = [ - "windows-link", + "windows-link 0.1.3", ] [[package]] @@ -3481,7 +4241,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" dependencies = [ - "windows-link", + "windows-link 0.1.3", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", ] [[package]] @@ -3511,6 +4280,15 @@ dependencies = [ "windows-targets 0.53.2", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link 0.2.1", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -3698,13 +4476,23 @@ checksum = 
"271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] name = "winnow" -version = "0.7.11" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74c7b26e3480b707944fc872477815d29a8e429d2f93a1ce000f5fa84a15cbcd" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "wit-bindgen-rt" version = "0.39.0" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index c462f22a4..346dc1857 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["crates/opencue-proto", "crates/rqd", "crates/dummy-cuebot"] +members = ["crates/opencue-proto", "crates/rqd", "crates/dummy-cuebot", "crates/kafka-es-indexer"] resolver = "3" [workspace.package] diff --git a/rust/README.md b/rust/README.md index aa4c8659a..fa5d50bf5 100644 --- a/rust/README.md +++ b/rust/README.md @@ -6,6 +6,7 @@ Project crates: * rqd: rewrite of [OpenCue/rqd](https://github.com/AcademySoftwareFoundation/OpenCue/tree/master/rqd) * dummy-cuebot: A cli tool to interact with rqd's gRPC interface * opencue_proto: Wrapper around grpc's generated code for the project protobuf modules + * kafka-es-indexer: Kafka to Elasticsearch indexer for OpenCue monitoring events ## Build Instructions diff --git a/rust/crates/kafka-es-indexer/Cargo.toml b/rust/crates/kafka-es-indexer/Cargo.toml new file mode 100644 index 000000000..4ebdff58a --- /dev/null +++ b/rust/crates/kafka-es-indexer/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "kafka-es-indexer" +description = "OpenCue Kafka to Elasticsearch event indexer" +authors = ["OpenCue Contributors"] +edition = "2021" +version = "0.1.0" + +[[bin]] +name = "kafka-es-indexer" 
+path = "src/main.rs" + +[dependencies] +# Async runtime +tokio = { version = "1.45", features = ["full"] } +futures = "0.3" + +# Kafka +rdkafka = { version = "0.36", features = ["cmake-build", "ssl-vendored"] } + +# Elasticsearch (alpha versions are the norm for this crate) +elasticsearch = "8.5.0-alpha.1" + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" + +# Configuration +config = "0.14.0" +clap = { version = "4.5", features = ["derive", "env"] } + +# Logging +tracing = "0.1.40" +tracing-subscriber = { version = "0.3.20", features = ["env-filter", "json"] } + +# Error handling +thiserror = "1.0" +anyhow = "1.0" + +# Time +chrono = { version = "0.4", features = ["serde"] } + +# Utilities +url = "2.5" diff --git a/rust/crates/kafka-es-indexer/Dockerfile b/rust/crates/kafka-es-indexer/Dockerfile new file mode 100644 index 000000000..0aa3df7f6 --- /dev/null +++ b/rust/crates/kafka-es-indexer/Dockerfile @@ -0,0 +1,55 @@ +# Build stage +FROM rust:1.83-bookworm AS builder + +# Install build dependencies +RUN apt-get update && apt-get install -y \ + cmake \ + libssl-dev \ + pkg-config \ + libsasl2-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy only the Cargo.toml first for dependency caching +COPY crates/kafka-es-indexer/Cargo.toml ./Cargo.toml + +# Create dummy main.rs and build dependencies (for caching) +RUN mkdir -p src && \ + echo "fn main() {}" > src/main.rs && \ + cargo build --release 2>/dev/null || true && \ + rm -rf src && \ + rm -f target/release/kafka-es-indexer + +# Copy actual source and build +COPY crates/kafka-es-indexer/src ./src +RUN touch src/main.rs && cargo build --release + +# Runtime stage +FROM debian:bookworm-slim + +# Install runtime dependencies +RUN apt-get update && apt-get install -y \ + ca-certificates \ + libssl3 \ + libsasl2-2 \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Copy the binary +COPY --from=builder /app/target/release/kafka-es-indexer /usr/local/bin/ + +# 
Create non-root user +RUN useradd -r -s /bin/false indexer +USER indexer + +# Default environment variables +ENV KAFKA_BOOTSTRAP_SERVERS=kafka:9092 +ENV KAFKA_GROUP_ID=opencue-elasticsearch-indexer +ENV ELASTICSEARCH_URL=http://elasticsearch:9200 +ENV ELASTICSEARCH_INDEX_PREFIX=opencue +ENV LOG_LEVEL=info +ENV RUST_BACKTRACE=1 + +ENTRYPOINT ["kafka-es-indexer"] diff --git a/rust/crates/kafka-es-indexer/README.md b/rust/crates/kafka-es-indexer/README.md new file mode 100644 index 000000000..49def23e8 --- /dev/null +++ b/rust/crates/kafka-es-indexer/README.md @@ -0,0 +1,112 @@ +# OpenCue Kafka-Elasticsearch Indexer + +A Rust service that consumes OpenCue monitoring events from Kafka and indexes them into Elasticsearch for historical analysis. + +## Overview + +This service is the consumer side of the OpenCue monitoring pipeline: + +``` +Cuebot (Producer) -> Kafka -> kafka-es-indexer (Consumer) -> Elasticsearch +``` + +## Features + +- Consumes events from all OpenCue Kafka topics: + - `opencue.job.events` + - `opencue.layer.events` + - `opencue.frame.events` + - `opencue.host.events` + - `opencue.proc.events` +- Bulk indexing to Elasticsearch for efficiency +- Automatic index template creation with proper mappings +- Configurable via CLI arguments, environment variables, or config file +- Graceful shutdown with final flush + +## Usage + +### Command Line + +```bash +kafka-es-indexer \ + --kafka-servers localhost:9092 \ + --kafka-group-id opencue-elasticsearch-indexer \ + --elasticsearch-url http://localhost:9200 \ + --index-prefix opencue \ + --log-level info +``` + +### Environment Variables + +```bash +export KAFKA_BOOTSTRAP_SERVERS=localhost:9092 +export KAFKA_GROUP_ID=opencue-elasticsearch-indexer +export ELASTICSEARCH_URL=http://localhost:9200 +export ELASTICSEARCH_INDEX_PREFIX=opencue +export LOG_LEVEL=info + +kafka-es-indexer +``` + +### Configuration File + +```yaml +# config.yaml +kafka: + bootstrap_servers: "localhost:9092" + group_id: 
"opencue-elasticsearch-indexer" + auto_offset_reset: "earliest" + enable_auto_commit: true + auto_commit_interval_ms: 5000 + +elasticsearch: + url: "http://localhost:9200" + index_prefix: "opencue" + num_shards: 1 + num_replicas: 0 + bulk_size: 100 + flush_interval_ms: 5000 +``` + +```bash +kafka-es-indexer --config config.yaml +``` + +## Docker + +Build the Docker image: + +```bash +cd rust +docker build -f crates/kafka-es-indexer/Dockerfile -t opencue/kafka-es-indexer . +``` + +Run with Docker: + +```bash +docker run -d \ + -e KAFKA_BOOTSTRAP_SERVERS=kafka:9092 \ + -e ELASTICSEARCH_URL=http://elasticsearch:9200 \ + opencue/kafka-es-indexer +``` + +## Building + +```bash +cd rust +cargo build --release --package kafka-es-indexer +``` + +The binary will be at `target/release/kafka-es-indexer`. + +## Index Structure + +Events are indexed into daily indices with the pattern: + +``` +{prefix}-{event-type}-{date} +``` + +Examples: +- `opencue-job-events-2024.11.29` +- `opencue-frame-events-2024.11.29` diff --git a/rust/crates/kafka-es-indexer/src/config.rs b/rust/crates/kafka-es-indexer/src/config.rs new file mode 100644 index 000000000..58d5d8ca1 --- /dev/null +++ b/rust/crates/kafka-es-indexer/src/config.rs @@ -0,0 +1,228 @@ +// Copyright Contributors to the OpenCue Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under +// the License. + +//! Configuration for the Kafka-Elasticsearch indexer. 
+ +use serde::Deserialize; + +use crate::error::IndexerError; + +/// Top-level configuration +#[derive(Debug, Clone, Deserialize)] +pub struct Config { + pub kafka: KafkaConfig, + pub elasticsearch: ElasticsearchConfig, +} + +/// Kafka consumer configuration +#[derive(Debug, Clone, Deserialize)] +pub struct KafkaConfig { + /// Kafka bootstrap servers (comma-separated) + #[serde(default = "default_bootstrap_servers")] + pub bootstrap_servers: String, + + /// Consumer group ID + #[serde(default = "default_group_id")] + pub group_id: String, + + /// Auto offset reset policy + #[serde(default = "default_auto_offset_reset")] + pub auto_offset_reset: String, + + /// Enable auto commit + #[serde(default = "default_enable_auto_commit")] + pub enable_auto_commit: bool, + + /// Auto commit interval in milliseconds + #[serde(default = "default_auto_commit_interval")] + pub auto_commit_interval_ms: u32, + + /// Maximum poll records + #[serde(default = "default_max_poll_records")] + pub max_poll_records: u32, + + /// Session timeout in milliseconds + #[serde(default = "default_session_timeout")] + pub session_timeout_ms: u32, + + /// Topics to subscribe to + #[serde(default = "default_topics")] + pub topics: Vec, +} + +/// Elasticsearch client configuration +#[derive(Debug, Clone, Deserialize)] +pub struct ElasticsearchConfig { + /// Elasticsearch URL + #[serde(default = "default_elasticsearch_url")] + pub url: String, + + /// Username for authentication (optional) + pub username: Option, + + /// Password for authentication (optional) + pub password: Option, + + /// Index prefix + #[serde(default = "default_index_prefix")] + pub index_prefix: String, + + /// Number of shards for indices + #[serde(default = "default_num_shards")] + pub num_shards: u32, + + /// Number of replicas for indices + #[serde(default = "default_num_replicas")] + pub num_replicas: u32, + + /// Bulk indexing batch size + #[serde(default = "default_bulk_size")] + pub bulk_size: usize, + + /// Bulk indexing 
flush interval in milliseconds + #[serde(default = "default_flush_interval")] + pub flush_interval_ms: u64, +} + +// Default value functions +fn default_bootstrap_servers() -> String { + "localhost:9092".to_string() +} + +fn default_group_id() -> String { + "opencue-elasticsearch-indexer".to_string() +} + +fn default_auto_offset_reset() -> String { + "earliest".to_string() +} + +fn default_enable_auto_commit() -> bool { + true +} + +fn default_auto_commit_interval() -> u32 { + 5000 +} + +fn default_max_poll_records() -> u32 { + 500 +} + +fn default_session_timeout() -> u32 { + 30000 +} + +fn default_topics() -> Vec { + vec![ + "opencue.job.events".to_string(), + "opencue.layer.events".to_string(), + "opencue.frame.events".to_string(), + "opencue.host.events".to_string(), + "opencue.proc.events".to_string(), + ] +} + +fn default_elasticsearch_url() -> String { + "http://localhost:9200".to_string() +} + +fn default_index_prefix() -> String { + "opencue".to_string() +} + +fn default_num_shards() -> u32 { + 1 +} + +fn default_num_replicas() -> u32 { + 0 +} + +fn default_bulk_size() -> usize { + 100 +} + +fn default_flush_interval() -> u64 { + 5000 +} + +impl Config { + /// Load configuration from a file + pub fn from_file(path: &str) -> Result { + let settings = config::Config::builder() + .add_source(config::File::with_name(path)) + .add_source(config::Environment::with_prefix("INDEXER").separator("_")) + .build() + .map_err(|e| IndexerError::Config(e.to_string()))?; + + settings + .try_deserialize() + .map_err(|e| IndexerError::Config(e.to_string())) + } + + /// Create configuration from CLI arguments + pub fn from_args(args: &super::Args) -> Self { + Config { + kafka: KafkaConfig { + bootstrap_servers: args.kafka_servers.clone(), + group_id: args.kafka_group_id.clone(), + auto_offset_reset: default_auto_offset_reset(), + enable_auto_commit: default_enable_auto_commit(), + auto_commit_interval_ms: default_auto_commit_interval(), + max_poll_records: 
default_max_poll_records(), + session_timeout_ms: default_session_timeout(), + topics: default_topics(), + }, + elasticsearch: ElasticsearchConfig { + url: args.elasticsearch_url.clone(), + username: None, + password: None, + index_prefix: args.index_prefix.clone(), + num_shards: default_num_shards(), + num_replicas: default_num_replicas(), + bulk_size: default_bulk_size(), + flush_interval_ms: default_flush_interval(), + }, + } + } +} + +impl Default for KafkaConfig { + fn default() -> Self { + Self { + bootstrap_servers: default_bootstrap_servers(), + group_id: default_group_id(), + auto_offset_reset: default_auto_offset_reset(), + enable_auto_commit: default_enable_auto_commit(), + auto_commit_interval_ms: default_auto_commit_interval(), + max_poll_records: default_max_poll_records(), + session_timeout_ms: default_session_timeout(), + topics: default_topics(), + } + } +} + +impl Default for ElasticsearchConfig { + fn default() -> Self { + Self { + url: default_elasticsearch_url(), + username: None, + password: None, + index_prefix: default_index_prefix(), + num_shards: default_num_shards(), + num_replicas: default_num_replicas(), + bulk_size: default_bulk_size(), + flush_interval_ms: default_flush_interval(), + } + } +} diff --git a/rust/crates/kafka-es-indexer/src/consumer.rs b/rust/crates/kafka-es-indexer/src/consumer.rs new file mode 100644 index 000000000..5f151d85c --- /dev/null +++ b/rust/crates/kafka-es-indexer/src/consumer.rs @@ -0,0 +1,242 @@ +// Copyright Contributors to the OpenCue Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under +// the License. + +//! Kafka consumer for OpenCue monitoring events. + +use std::sync::Arc; +use std::time::Duration; + +use rdkafka::config::ClientConfig; +use rdkafka::consumer::{CommitMode, Consumer, StreamConsumer}; +use rdkafka::message::Message; +use tokio::sync::mpsc; +use tracing::{debug, error, info, warn}; + +use crate::config::KafkaConfig; +use crate::elasticsearch::ElasticsearchClient; +use crate::error::IndexerError; + +/// Kafka topic names for OpenCue events +pub const TOPIC_JOB_EVENTS: &str = "opencue.job.events"; +pub const TOPIC_LAYER_EVENTS: &str = "opencue.layer.events"; +pub const TOPIC_FRAME_EVENTS: &str = "opencue.frame.events"; +pub const TOPIC_HOST_EVENTS: &str = "opencue.host.events"; +pub const TOPIC_PROC_EVENTS: &str = "opencue.proc.events"; + +/// Event types for routing to appropriate indices +#[derive(Debug, Clone)] +pub enum EventType { + Job, + Layer, + Frame, + Host, + Proc, +} + +impl EventType { + /// Determine event type from Kafka topic name + pub fn from_topic(topic: &str) -> Option { + match topic { + TOPIC_JOB_EVENTS => Some(EventType::Job), + TOPIC_LAYER_EVENTS => Some(EventType::Layer), + TOPIC_FRAME_EVENTS => Some(EventType::Frame), + TOPIC_HOST_EVENTS => Some(EventType::Host), + TOPIC_PROC_EVENTS => Some(EventType::Proc), + _ => None, + } + } + + /// Get the index suffix for this event type + pub fn index_suffix(&self) -> &'static str { + match self { + EventType::Job => "job-events", + EventType::Layer => "layer-events", + EventType::Frame => "frame-events", + EventType::Host => "host-events", + EventType::Proc => "proc-events", + } + } +} + +/// A consumed event ready for indexing +#[derive(Debug)] +pub struct ConsumedEvent { + pub event_type: EventType, + pub event_id: Option, + pub payload: String, +} + +/// Kafka consumer for OpenCue monitoring events +pub struct EventConsumer { + consumer: StreamConsumer, + es_client: Arc, +} + 
+impl EventConsumer { + /// Create a new event consumer + pub fn new(config: &KafkaConfig, es_client: ElasticsearchClient) -> Result { + let consumer: StreamConsumer = ClientConfig::new() + .set("bootstrap.servers", &config.bootstrap_servers) + .set("group.id", &config.group_id) + .set("auto.offset.reset", &config.auto_offset_reset) + .set("enable.auto.commit", config.enable_auto_commit.to_string()) + .set( + "auto.commit.interval.ms", + config.auto_commit_interval_ms.to_string(), + ) + .set("session.timeout.ms", config.session_timeout_ms.to_string()) + .set("enable.partition.eof", "false") + .create()?; + + // Subscribe to all topics + let topics: Vec<&str> = config.topics.iter().map(|s| s.as_str()).collect(); + consumer.subscribe(&topics)?; + + info!(topics = ?topics, "Subscribed to Kafka topics"); + + Ok(Self { + consumer, + es_client: Arc::new(es_client), + }) + } + + /// Run the consumer loop + pub async fn run(self) -> Result<(), IndexerError> { + let (tx, mut rx) = mpsc::channel::(1000); + let es_client = self.es_client.clone(); + + // Spawn indexer task + let indexer_handle = tokio::spawn(async move { + let mut batch: Vec = Vec::with_capacity(100); + let mut last_flush = std::time::Instant::now(); + let flush_interval = Duration::from_secs(5); + + loop { + tokio::select! 
{ + event = rx.recv() => { + match event { + Some(e) => { + batch.push(e); + if batch.len() >= 100 || last_flush.elapsed() > flush_interval { + if let Err(e) = es_client.bulk_index(&batch).await { + error!(error = %e, "Failed to bulk index events"); + } + batch.clear(); + last_flush = std::time::Instant::now(); + } + } + None => { + // Channel closed, flush remaining + if !batch.is_empty() { + if let Err(e) = es_client.bulk_index(&batch).await { + error!(error = %e, "Failed to flush remaining events"); + } + } + break; + } + } + } + _ = tokio::time::sleep(flush_interval) => { + if !batch.is_empty() { + if let Err(e) = es_client.bulk_index(&batch).await { + error!(error = %e, "Failed to flush events on interval"); + } + batch.clear(); + last_flush = std::time::Instant::now(); + } + } + } + } + }); + + // Consumer loop + loop { + match self.consumer.recv().await { + Ok(message) => { + let topic = message.topic(); + let partition = message.partition(); + let offset = message.offset(); + + if let Some(payload) = message.payload() { + let payload_str = match std::str::from_utf8(payload) { + Ok(s) => s.to_string(), + Err(e) => { + warn!( + topic = topic, + partition = partition, + offset = offset, + error = %e, + "Invalid UTF-8 in message payload" + ); + continue; + } + }; + + if let Some(event_type) = EventType::from_topic(topic) { + // Extract event_id from JSON + let event_id = extract_event_id(&payload_str); + + debug!( + topic = topic, + partition = partition, + offset = offset, + event_id = ?event_id, + "Received event" + ); + + let event = ConsumedEvent { + event_type, + event_id, + payload: payload_str, + }; + + if tx.send(event).await.is_err() { + error!("Indexer task has stopped, shutting down consumer"); + break; + } + } else { + warn!(topic = topic, "Unknown topic, skipping message"); + } + } + + // Commit offset + if let Err(e) = self.consumer.commit_message(&message, CommitMode::Async) { + warn!(error = %e, "Failed to commit offset"); + } + } + Err(e) => { 
+ error!(error = %e, "Error receiving message from Kafka"); + // Sleep briefly before retrying + tokio::time::sleep(Duration::from_millis(100)).await; + } + } + } + + // Wait for indexer to finish + drop(tx); + indexer_handle.await.ok(); + + Ok(()) + } +} + +/// Extract event_id from JSON payload +fn extract_event_id(json: &str) -> Option<String> { + serde_json::from_str::<serde_json::Value>(json) + .ok() + .and_then(|v| { + v.get("header") + .and_then(|h| h.get("event_id")) + .and_then(|id| id.as_str()) + .map(|s| s.to_string()) + }) +} diff --git a/rust/crates/kafka-es-indexer/src/elasticsearch.rs b/rust/crates/kafka-es-indexer/src/elasticsearch.rs new file mode 100644 index 000000000..aa1cea68c --- /dev/null +++ b/rust/crates/kafka-es-indexer/src/elasticsearch.rs @@ -0,0 +1,430 @@ +// Copyright Contributors to the OpenCue Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under +// the License. + +//! Elasticsearch client for indexing OpenCue monitoring events. 
+ +use chrono::Utc; +use elasticsearch::http::request::JsonBody; +use elasticsearch::http::transport::{SingleNodeConnectionPool, TransportBuilder}; +use elasticsearch::indices::IndicesPutIndexTemplateParts; +use elasticsearch::{BulkParts, Elasticsearch}; +use serde_json::json; +use tracing::{debug, error, info, warn}; +use url::Url; + +use crate::config::ElasticsearchConfig; +use crate::consumer::ConsumedEvent; +use crate::error::IndexerError; + +/// Elasticsearch client wrapper for OpenCue event indexing +pub struct ElasticsearchClient { + client: Elasticsearch, + config: ElasticsearchConfig, +} + +impl ElasticsearchClient { + /// Create a new Elasticsearch client + pub async fn new(config: &ElasticsearchConfig) -> Result<Self, IndexerError> { + let url = Url::parse(&config.url) + .map_err(|e| IndexerError::Elasticsearch(format!("Invalid URL: {}", e)))?; + + let pool = SingleNodeConnectionPool::new(url); + let mut builder = TransportBuilder::new(pool); + + // Add authentication if provided + if let (Some(username), Some(password)) = (&config.username, &config.password) { + builder = builder.auth(elasticsearch::auth::Credentials::Basic( + username.clone(), + password.clone(), + )); + } + + let transport = builder + .build() + .map_err(|e| IndexerError::Elasticsearch(e.to_string()))?; + + let client = Elasticsearch::new(transport); + + // Verify connection + let info = client + .info() + .send() + .await + .map_err(|e| IndexerError::Elasticsearch(format!("Connection failed: {}", e)))?; + + if !info.status_code().is_success() { + return Err(IndexerError::Elasticsearch( + "Failed to connect to Elasticsearch".to_string(), + )); + } + + info!(url = %config.url, "Connected to Elasticsearch"); + + Ok(Self { + client, + config: config.clone(), + }) + } + + /// Create index templates for all event types + pub async fn create_index_templates(&self) -> Result<(), IndexerError> { + let event_types = vec![ + ("job-events", self.job_events_mapping()), + ("layer-events", 
self.layer_events_mapping()), + ("frame-events", self.frame_events_mapping()), + ("host-events", self.host_events_mapping()), + ("proc-events", self.proc_events_mapping()), + ]; + + for (event_type, mappings) in event_types { + self.create_index_template(event_type, mappings).await?; + } + + Ok(()) + } + + /// Create a single index template + async fn create_index_template( + &self, + event_type: &str, + mappings: serde_json::Value, + ) -> Result<(), IndexerError> { + let template_name = format!("{}-{}", self.config.index_prefix, event_type); + let index_pattern = format!("{}-{}-*", self.config.index_prefix, event_type); + + let body = json!({ + "index_patterns": [index_pattern], + "template": { + "settings": { + "number_of_shards": self.config.num_shards, + "number_of_replicas": self.config.num_replicas, + "index.mapping.total_fields.limit": 2000 + }, + "mappings": mappings + }, + "priority": 100, + "version": 1 + }); + + let response = self + .client + .indices() + .put_index_template(IndicesPutIndexTemplateParts::Name(&template_name)) + .body(body) + .send() + .await?; + + if response.status_code().is_success() { + info!(template = %template_name, "Index template created/updated"); + } else { + let error_body = response.text().await.unwrap_or_default(); + warn!( + template = %template_name, + error = %error_body, + "Failed to create index template" + ); + } + + Ok(()) + } + + /// Bulk index a batch of events + pub async fn bulk_index(&self, events: &[ConsumedEvent]) -> Result<(), IndexerError> { + if events.is_empty() { + return Ok(()); + } + + let date_suffix = Utc::now().format("%Y.%m.%d").to_string(); + let mut body: Vec<JsonBody<serde_json::Value>> = Vec::with_capacity(events.len() * 2); + + for event in events { + let index_name = format!( + "{}-{}-{}", + self.config.index_prefix, + event.event_type.index_suffix(), + date_suffix + ); + + // Index action + let action = if let Some(ref event_id) = event.event_id { + json!({ "index": { "_index": index_name, "_id": event_id } }) + } else 
{ + json!({ "index": { "_index": index_name } }) + }; + + body.push(action.into()); + + // Document + let doc: serde_json::Value = serde_json::from_str(&event.payload)?; + body.push(doc.into()); + } + + let response = self + .client + .bulk(BulkParts::None) + .body(body) + .send() + .await?; + + if response.status_code().is_success() { + let response_body: serde_json::Value = response.json().await?; + let errors = response_body.get("errors").and_then(|e| e.as_bool()).unwrap_or(false); + + if errors { + // Log individual errors + if let Some(items) = response_body.get("items").and_then(|i| i.as_array()) { + for item in items { + if let Some(error) = item.get("index").and_then(|i| i.get("error")) { + warn!(error = %error, "Bulk index error for item"); + } + } + } + } else { + debug!(count = events.len(), "Bulk indexed events"); + } + } else { + let error_body = response.text().await.unwrap_or_default(); + error!(error = %error_body, "Bulk index request failed"); + return Err(IndexerError::Elasticsearch(error_body)); + } + + Ok(()) + } + + /// Common header mapping shared by all event types + fn header_mapping() -> serde_json::Value { + json!({ + "properties": { + "event_id": { "type": "keyword" }, + "event_type": { "type": "keyword" }, + "timestamp": { "type": "date", "format": "epoch_millis" }, + "source_cuebot": { "type": "keyword" }, + "correlation_id": { "type": "keyword" } + } + }) + } + + /// Job events index mapping + fn job_events_mapping(&self) -> serde_json::Value { + json!({ + "properties": { + "header": Self::header_mapping(), + "job": { + "properties": { + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "show": { "type": "keyword" }, + "shot": { "type": "keyword" }, + "user": { "type": "keyword" }, + "state": { "type": "keyword" }, + "facility": { "type": "keyword" }, + "group": { "type": "keyword" }, + "priority": { "type": "integer" }, + "start_time": { "type": "date", "format": "epoch_millis" }, + "stop_time": { "type": "date", 
"format": "epoch_millis" }, + "is_paused": { "type": "boolean" }, + "is_auto_eat": { "type": "boolean" }, + "job_stats": { + "properties": { + "pending_frames": { "type": "integer" }, + "running_frames": { "type": "integer" }, + "dead_frames": { "type": "integer" }, + "eaten_frames": { "type": "integer" }, + "succeeded_frames": { "type": "integer" }, + "waiting_frames": { "type": "integer" }, + "depend_frames": { "type": "integer" }, + "total_frames": { "type": "integer" }, + "total_layers": { "type": "integer" }, + "reserved_cores": { "type": "float" }, + "reserved_gpus": { "type": "float" } + } + } + } + }, + "previous_state": { "type": "keyword" }, + "reason": { "type": "text" }, + "killed_by": { "type": "keyword" } + } + }) + } + + /// Layer events index mapping + fn layer_events_mapping(&self) -> serde_json::Value { + json!({ + "properties": { + "header": Self::header_mapping(), + "layer": { + "properties": { + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "type": { "type": "keyword" }, + "range": { "type": "keyword" }, + "chunk_size": { "type": "integer" }, + "min_cores": { "type": "float" }, + "max_cores": { "type": "float" }, + "min_memory": { "type": "long" }, + "min_gpus": { "type": "integer" }, + "min_gpu_memory": { "type": "long" }, + "is_threadable": { "type": "boolean" }, + "tags": { "type": "keyword" }, + "services": { "type": "keyword" }, + "layer_stats": { + "properties": { + "pending_frames": { "type": "integer" }, + "running_frames": { "type": "integer" }, + "dead_frames": { "type": "integer" }, + "eaten_frames": { "type": "integer" }, + "succeeded_frames": { "type": "integer" }, + "waiting_frames": { "type": "integer" }, + "depend_frames": { "type": "integer" }, + "total_frames": { "type": "integer" }, + "reserved_cores": { "type": "float" }, + "reserved_gpus": { "type": "float" }, + "max_rss": { "type": "long" }, + "total_core_seconds": { "type": "long" }, + "total_gpu_seconds": { "type": "long" }, + "rendered_frame_count": { 
"type": "integer" }, + "failed_frame_count": { "type": "integer" }, + "avg_frame_sec": { "type": "integer" }, + "low_frame_sec": { "type": "integer" }, + "high_frame_sec": { "type": "integer" } + } + } + } + }, + "job_id": { "type": "keyword" }, + "job_name": { "type": "keyword" }, + "show": { "type": "keyword" } + } + }) + } + + /// Frame events index mapping + fn frame_events_mapping(&self) -> serde_json::Value { + json!({ + "properties": { + "header": Self::header_mapping(), + "frame": { + "properties": { + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "number": { "type": "integer" }, + "state": { "type": "keyword" }, + "retry_count": { "type": "integer" }, + "exit_status": { "type": "integer" }, + "dispatch_order": { "type": "integer" }, + "start_time": { "type": "date", "format": "epoch_millis" }, + "stop_time": { "type": "date", "format": "epoch_millis" }, + "max_rss": { "type": "long" }, + "used_memory": { "type": "long" }, + "reserved_memory": { "type": "long" }, + "max_gpu_memory": { "type": "long" }, + "used_gpu_memory": { "type": "long" }, + "reserved_gpu_memory": { "type": "long" }, + "last_resource": { "type": "keyword" }, + "checkpoint_state": { "type": "keyword" }, + "checkpoint_count": { "type": "integer" }, + "total_core_time": { "type": "long" }, + "total_gpu_time": { "type": "long" }, + "llu_time": { "type": "date", "format": "epoch_millis" } + } + }, + "layer_id": { "type": "keyword" }, + "job_id": { "type": "keyword" }, + "job_name": { "type": "keyword" }, + "show": { "type": "keyword" }, + "previous_state": { "type": "keyword" }, + "exit_signal": { "type": "integer" }, + "run_time": { "type": "integer" }, + "num_cores": { "type": "integer" }, + "num_gpus": { "type": "integer" }, + "host_name": { "type": "keyword" }, + "resource_id": { "type": "keyword" }, + "reason": { "type": "text" }, + "killed_by": { "type": "keyword" } + } + }) + } + + /// Host events index mapping + fn host_events_mapping(&self) -> serde_json::Value { + 
json!({ + "properties": { + "header": Self::header_mapping(), + "host": { + "properties": { + "id": { "type": "keyword" }, + "name": { "type": "keyword" }, + "state": { "type": "keyword" }, + "lock_state": { "type": "keyword" }, + "nimby_enabled": { "type": "boolean" }, + "free_memory": { "type": "long" }, + "total_memory": { "type": "long" }, + "free_swap": { "type": "long" }, + "total_swap": { "type": "long" }, + "free_mcp": { "type": "long" }, + "total_mcp": { "type": "long" }, + "free_gpu_memory": { "type": "long" }, + "total_gpu_memory": { "type": "long" }, + "load": { "type": "integer" }, + "cores": { "type": "float" }, + "idle_cores": { "type": "float" }, + "gpus": { "type": "integer" }, + "idle_gpus": { "type": "integer" }, + "procs": { "type": "integer" }, + "boot_time": { "type": "date", "format": "epoch_millis" }, + "ping_time": { "type": "date", "format": "epoch_millis" }, + "tags": { "type": "keyword" }, + "alloc_name": { "type": "keyword" }, + "os": { "type": "keyword" } + } + }, + "facility": { "type": "keyword" }, + "previous_state": { "type": "keyword" }, + "previous_lock_state": { "type": "keyword" }, + "nimby_locked": { "type": "boolean" }, + "reason": { "type": "text" } + } + }) + } + + /// Proc events index mapping + fn proc_events_mapping(&self) -> serde_json::Value { + json!({ + "properties": { + "header": Self::header_mapping(), + "proc_id": { "type": "keyword" }, + "proc_name": { "type": "keyword" }, + "host_id": { "type": "keyword" }, + "host_name": { "type": "keyword" }, + "job_id": { "type": "keyword" }, + "job_name": { "type": "keyword" }, + "layer_id": { "type": "keyword" }, + "layer_name": { "type": "keyword" }, + "frame_id": { "type": "keyword" }, + "frame_name": { "type": "keyword" }, + "show": { "type": "keyword" }, + "group_name": { "type": "keyword" }, + "reserved_cores": { "type": "float" }, + "reserved_gpus": { "type": "float" }, + "reserved_memory": { "type": "long" }, + "reserved_gpu_memory": { "type": "long" }, + 
"dispatch_time": { "type": "date", "format": "epoch_millis" }, + "booked_time": { "type": "date", "format": "epoch_millis" }, + "is_local_dispatch": { "type": "boolean" }, + "is_unbooked": { "type": "boolean" }, + "redirect_target": { "type": "keyword" }, + "services": { "type": "keyword" } + } + }) + } +} diff --git a/rust/crates/kafka-es-indexer/src/error.rs b/rust/crates/kafka-es-indexer/src/error.rs new file mode 100644 index 000000000..bc71734bd --- /dev/null +++ b/rust/crates/kafka-es-indexer/src/error.rs @@ -0,0 +1,45 @@ +// Copyright Contributors to the OpenCue Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under +// the License. + +//! Error types for the Kafka-Elasticsearch indexer. 
+ +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum IndexerError { + #[error("Configuration error: {0}")] + Config(String), + + #[error("Kafka error: {0}")] + Kafka(String), + + #[error("Elasticsearch error: {0}")] + Elasticsearch(String), + + #[error("JSON parsing error: {0}")] + Json(#[from] serde_json::Error), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), +} + +impl From<rdkafka::error::KafkaError> for IndexerError { + fn from(err: rdkafka::error::KafkaError) -> Self { + IndexerError::Kafka(err.to_string()) + } +} + +impl From<elasticsearch::Error> for IndexerError { + fn from(err: elasticsearch::Error) -> Self { + IndexerError::Elasticsearch(err.to_string()) + } +} diff --git a/rust/crates/kafka-es-indexer/src/main.rs b/rust/crates/kafka-es-indexer/src/main.rs new file mode 100644 index 000000000..103fc73fb --- /dev/null +++ b/rust/crates/kafka-es-indexer/src/main.rs @@ -0,0 +1,107 @@ +// Copyright Contributors to the OpenCue Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under +// the License. + +//! OpenCue Kafka to Elasticsearch Event Indexer +//! +//! This service consumes monitoring events from Kafka topics and indexes them +//! into Elasticsearch for historical analysis and querying. 
+ +mod config; +mod consumer; +mod elasticsearch; +mod error; + +use clap::Parser; +use tracing::{info, Level}; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; + +use crate::config::Config; +use crate::consumer::EventConsumer; +use crate::elasticsearch::ElasticsearchClient; + +#[derive(Parser, Debug)] +#[command(name = "kafka-es-indexer")] +#[command(about = "OpenCue Kafka to Elasticsearch event indexer")] +#[command(version)] +struct Args { + /// Path to configuration file + #[arg(short, long, env = "INDEXER_CONFIG")] + config: Option<String>, + + /// Kafka bootstrap servers + #[arg(long, env = "KAFKA_BOOTSTRAP_SERVERS", default_value = "localhost:9092")] + kafka_servers: String, + + /// Kafka consumer group ID + #[arg(long, env = "KAFKA_GROUP_ID", default_value = "opencue-elasticsearch-indexer")] + kafka_group_id: String, + + /// Elasticsearch URL + #[arg(long, env = "ELASTICSEARCH_URL", default_value = "http://localhost:9200")] + elasticsearch_url: String, + + /// Elasticsearch index prefix + #[arg(long, env = "ELASTICSEARCH_INDEX_PREFIX", default_value = "opencue")] + index_prefix: String, + + /// Log level + #[arg(long, env = "LOG_LEVEL", default_value = "info")] + log_level: String, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let args = Args::parse(); + + // Initialize logging + let filter = EnvFilter::try_from_default_env() + .unwrap_or_else(|_| EnvFilter::new(&args.log_level)); + + tracing_subscriber::registry() + .with(fmt::layer()) + .with(filter) + .init(); + + info!("Starting OpenCue Kafka-Elasticsearch Indexer"); + + // Load configuration + let config = if let Some(config_path) = &args.config { + Config::from_file(config_path)? 
+ } else { + Config::from_args(&args) + }; + + info!( + kafka_servers = %config.kafka.bootstrap_servers, + group_id = %config.kafka.group_id, + elasticsearch_url = %config.elasticsearch.url, + "Configuration loaded" + ); + + // Initialize Elasticsearch client + let es_client = ElasticsearchClient::new(&config.elasticsearch).await?; + info!("Elasticsearch client initialized"); + + // Create index templates + es_client.create_index_templates().await?; + info!("Index templates created/verified"); + + // Start the consumer + let consumer = EventConsumer::new(&config.kafka, es_client)?; + info!("Kafka consumer initialized, starting event processing"); + + // Run the consumer (blocks until shutdown) + consumer.run().await?; + + info!("Indexer shutting down"); + Ok(()) +} diff --git a/sandbox/README.md b/sandbox/README.md index 3a8973fab..778e8d7c0 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -157,8 +157,27 @@ cuesubmit & ## Monitoring -To get started with monitoring there is also an additional Docker compose file which sets up -monitoring for key services. +### Event Streaming Monitoring Stack (Recommended) -To learn how to run the sandbox environment with monitoring, -see https://www.opencue.io/docs/other-guides/monitoring-with-prometheus-loki-and-grafana/. 
\ No newline at end of file +The full monitoring stack provides real-time event streaming and historical analysis: + +```bash +docker compose -f sandbox/docker-compose.monitoring-full.yml up -d +``` + +This starts: +- **Kafka** + **Zookeeper** - Event streaming (localhost:9092) +- **kafka-es-indexer** - Rust service that indexes events to Elasticsearch +- **Elasticsearch** - Historical event storage (http://localhost:9200) +- **Kibana** - Elasticsearch visualization (http://localhost:5601) +- **Prometheus** - Metrics collection (http://localhost:9090) +- **Grafana** - Dashboards and visualization (http://localhost:3000) +- **Kafka UI** - Event stream browser (http://localhost:8090) + +For sample Kibana queries, see [kibana-queries.md](kibana-queries.md). + +For more information, see https://www.opencue.io/docs/quick-starts/quick-start-monitoring/. + +### Legacy Prometheus/Loki Monitoring + +For the legacy Prometheus/Loki monitoring setup, see https://www.opencue.io/docs/other-guides/monitoring-with-prometheus-loki-and-grafana/. 
\ No newline at end of file diff --git a/sandbox/config/grafana/dashboards/opencue-monitoring.json b/sandbox/config/grafana/dashboards/opencue-monitoring.json new file mode 100644 index 000000000..c31631a17 --- /dev/null +++ b/sandbox/config/grafana/dashboards/opencue-monitoring.json @@ -0,0 +1,1215 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(increase(cue_frames_completed_total[5m])) by (state)", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "DEAD" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "SUCCEEDED" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "WAITING" + 
}, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + } + ] + }, + "title": "Frames Completed (5m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(increase(cue_jobs_completed_total[5m])) by (show)", + "legendFormat": "{{show}}", + "refId": "A" + } + ], + "title": "Jobs Completed by Show (5m)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(cue_layer_max_runtime_seconds_bucket[5m])) by (le, show))", + "legendFormat": "p95 {{show}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(cue_layer_max_runtime_seconds_bucket[5m])) by (le, show))", + "legendFormat": "p50 {{show}}", + "refId": "B" + } + ], + "title": "Layer Max Runtime Distribution", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(cue_layer_max_memory_bytes_bucket[5m])) by (le, show))", + "legendFormat": "p95 {{show}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(cue_layer_max_memory_bytes_bucket[5m])) by (le, show))", + "legendFormat": "p50 {{show}}", + "refId": "B" + } + ], + "title": "Layer Max Memory Distribution", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": 
{ + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.95, sum(rate(cue_job_core_seconds_bucket[5m])) by (le, show))", + "legendFormat": "p95 {{show}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "histogram_quantile(0.50, sum(rate(cue_job_core_seconds_bucket[5m])) by (le, show))", + "legendFormat": "p50 {{show}}", + "refId": "B" + } + ], + "title": "Job Core Seconds Distribution", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": 
"sum(increase(cue_host_reports_received_total[5m])) by (facility)", + "legendFormat": "{{facility}}", + "refId": "A" + } + ], + "title": "Host Reports Received (5m)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 7, + "panels": [], + "title": "Pickup Time Metrics", + "type": "row" + }, + { + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 25 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "alias": "Frames Started", + "bucketAggs": [ + { + "field": "header.timestamp", + "id": "2", + "settings": { + "interval": "5m" + }, + "type": "date_histogram" + } + ], + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "header.event_type:FRAME_STARTED", + "refId": "A", + "timeField": "header.timestamp" + } + ], + "title": "Frames Started (5m)", + "description": "WAITING -> RUNNING transitions. 
Marks when frames begin execution on a host.", + "type": "stat" + }, + { + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 25 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["sum"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "alias": "Frames Dispatched", + "bucketAggs": [ + { + "field": "header.timestamp", + "id": "2", + "settings": { + "interval": "5m" + }, + "type": "date_histogram" + } + ], + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "header.event_type:FRAME_DISPATCHED", + "refId": "A", + "timeField": "header.timestamp" + } + ], + "title": "Frames Dispatchable (5m)", + "description": "DEPEND -> WAITING transitions. 
Marks when frames become ready for dispatch after dependencies are satisfied.", + "type": "stat" + }, + { + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Events", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "FRAME_STARTED" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "FRAME_DISPATCHED" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 25 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["sum"], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "alias": "FRAME_STARTED", + "bucketAggs": [ + { + "field": "header.timestamp", + "id": "2", + "settings": { + "interval": "1m" + }, + "type": "date_histogram" + } + ], + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": 
"header.event_type:FRAME_STARTED", + "refId": "A", + "timeField": "header.timestamp" + }, + { + "alias": "FRAME_DISPATCHED", + "bucketAggs": [ + { + "field": "header.timestamp", + "id": "2", + "settings": { + "interval": "1m" + }, + "type": "date_histogram" + } + ], + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "type": "count" + } + ], + "query": "header.event_type:FRAME_DISPATCHED", + "refId": "B", + "timeField": "header.timestamp" + } + ], + "title": "Pickup Time Events Over Time", + "description": "Comparison of FRAME_DISPATCHED (ready) vs FRAME_STARTED (running) events. The gap between these shows frames waiting in queue.", + "type": "timeseries" + }, + { + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "header.event_type" + }, + "properties": [ + { + "id": "custom.width", + "value": 140 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "header.timestamp" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 11, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "header.timestamp" + } + ] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "bucketAggs": [], + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "settings": { + "size": "50" + }, + "type": "raw_data" + } + ], + "query": 
"header.event_type:FRAME_STARTED", + "refId": "A", + "timeField": "header.timestamp" + } + ], + "title": "Recent FRAME_STARTED Events", + "description": "Recent frames that started running (WAITING -> RUNNING). Use start_time to calculate pickup time.", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "_id": true, + "_index": true, + "_type": true, + "header.correlation_id": true, + "header.event_id": true, + "header.source_cuebot": true + }, + "indexByName": {}, + "renameByName": { + "header.timestamp": "Timestamp", + "frame_id": "Frame ID", + "frame_name": "Frame", + "header.event_type": "Event Type", + "host_name": "Host", + "job_name": "Job", + "layer_name": "Layer", + "num_cores": "Cores", + "previous_state": "From State", + "show": "Show", + "start_time": "Start Time", + "state": "To State" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "elasticsearch", + "uid": "elasticsearch" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "header.event_type" + }, + "properties": [ + { + "id": "custom.width", + "value": 160 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "header.timestamp" + }, + "properties": [ + { + "id": "custom.width", + "value": 180 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "id": 12, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": ["sum"], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "header.timestamp" + } + ] + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "bucketAggs": [], + "datasource": { + "type": 
"elasticsearch", + "uid": "elasticsearch" + }, + "metrics": [ + { + "id": "1", + "settings": { + "size": "50" + }, + "type": "raw_data" + } + ], + "query": "header.event_type:FRAME_DISPATCHED", + "refId": "A", + "timeField": "header.timestamp" + } + ], + "title": "Recent FRAME_DISPATCHED Events", + "description": "Recent frames that became dispatchable (DEPEND -> WAITING). Dependencies were satisfied.", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "_id": true, + "_index": true, + "_type": true, + "header.correlation_id": true, + "header.event_id": true, + "header.source_cuebot": true + }, + "indexByName": {}, + "renameByName": { + "header.timestamp": "Timestamp", + "dispatch_order": "Dispatch Order", + "frame_id": "Frame ID", + "frame_name": "Frame", + "frame_number": "Frame #", + "header.event_type": "Event Type", + "job_id": "Job ID", + "layer_id": "Layer ID", + "previous_state": "From State", + "retry_count": "Retries", + "state": "To State" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["opencue", "monitoring"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "OpenCue Monitoring Dashboard", + "uid": "opencue-monitoring", + "version": 1, + "weekStart": "" +} diff --git a/sandbox/config/grafana/provisioning/datasources/datasource.yml b/sandbox/config/grafana/provisioning/datasources/datasource.yml index 593b8276f..747fbf79f 100644 --- a/sandbox/config/grafana/provisioning/datasources/datasource.yml +++ b/sandbox/config/grafana/provisioning/datasources/datasource.yml @@ -22,4 +22,21 @@ datasources: basicAuth: false isDefault: false version: 1 - editable: false \ No newline at end of file + editable: false + - name: elasticsearch + type: elasticsearch + uid: elasticsearch + access: proxy + orgId: 1 + url: http://elasticsearch:9200 + basicAuth: false + isDefault: false + 
version: 1 + editable: false + jsonData: + esVersion: "7.10.0" + timeField: "header.timestamp" + logMessageField: "message" + logLevelField: "level" + maxConcurrentShardRequests: 5 + index: "opencue-frame-events-*" \ No newline at end of file diff --git a/sandbox/config/prometheus-monitoring.yml b/sandbox/config/prometheus-monitoring.yml new file mode 100644 index 000000000..e5bc6e68d --- /dev/null +++ b/sandbox/config/prometheus-monitoring.yml @@ -0,0 +1,27 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +scrape_configs: + # Cuebot metrics endpoint + - job_name: 'cuebot' + static_configs: + - targets: ['cuebot:8080'] + metrics_path: /metrics + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kafka metrics (via JMX exporter if configured) + - job_name: 'kafka' + static_configs: + - targets: ['kafka:9092'] + metrics_path: /metrics + + # Elasticsearch metrics + - job_name: 'elasticsearch' + static_configs: + - targets: ['elasticsearch:9200'] + metrics_path: /_prometheus/metrics diff --git a/sandbox/docker-compose.monitoring-full.yml b/sandbox/docker-compose.monitoring-full.yml new file mode 100644 index 000000000..136d4f0d8 --- /dev/null +++ b/sandbox/docker-compose.monitoring-full.yml @@ -0,0 +1,233 @@ +services: + # Zookeeper for Kafka + zookeeper: + image: confluentinc/cp-zookeeper:7.4.0 + hostname: zookeeper + container_name: opencue-zookeeper + ports: + - "2181:2181" + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + ZOOKEEPER_TICK_TIME: 2000 + healthcheck: + test: ["CMD", "nc", "-z", "localhost", "2181"] + interval: 10s + timeout: 5s + retries: 5 + + # Kafka broker + kafka: + image: confluentinc/cp-kafka:7.4.0 + hostname: kafka + container_name: opencue-kafka + depends_on: + zookeeper: + condition: service_healthy + ports: + - "9092:9092" + - "29092:29092" + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 
PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 + KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true" + healthcheck: + test: ["CMD", "kafka-topics", "--bootstrap-server", "kafka:29092", "--list"] + interval: 10s + timeout: 10s + retries: 5 + + # Kafka UI for debugging + kafka-ui: + image: provectuslabs/kafka-ui:latest + container_name: opencue-kafka-ui + depends_on: + kafka: + condition: service_healthy + ports: + - "8090:8080" + environment: + KAFKA_CLUSTERS_0_NAME: opencue + KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka:29092 + KAFKA_CLUSTERS_0_ZOOKEEPER: zookeeper:2181 + + # Elasticsearch for historical data storage + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.8.0 + container_name: opencue-elasticsearch + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - "ES_JAVA_OPTS=-Xms512m -Xmx512m" + ports: + - "9200:9200" + - "9300:9300" + volumes: + - elasticsearch-data:/usr/share/elasticsearch/data + healthcheck: + test: ["CMD-SHELL", "curl -s http://localhost:9200/_cluster/health | grep -q '\"status\":\"green\"\\|\"status\":\"yellow\"'"] + interval: 10s + timeout: 10s + retries: 10 + + # Kibana for Elasticsearch visualization + kibana: + image: docker.elastic.co/kibana/kibana:8.8.0 + container_name: opencue-kibana + depends_on: + elasticsearch: + condition: service_healthy + ports: + - "5601:5601" + environment: + ELASTICSEARCH_HOSTS: http://elasticsearch:9200 + + # Prometheus for metrics collection + prometheus: + image: prom/prometheus:v2.45.0 + container_name: opencue-prometheus + ports: + - "9090:9090" + volumes: + - ./config/prometheus-monitoring.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + command: + - 
'--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.enable-lifecycle' + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"] + interval: 10s + timeout: 5s + retries: 3 + + # Grafana for dashboards + grafana: + image: grafana/grafana:10.0.0 + container_name: opencue-grafana + depends_on: + - prometheus + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ./config/grafana/provisioning:/etc/grafana/provisioning:ro + - ./config/grafana/dashboards:/etc/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + + # PostgreSQL database + db: + image: postgres:15 + container_name: opencue-db + environment: + - POSTGRES_USER=cuebot + - POSTGRES_PASSWORD=cuebot_password + - POSTGRES_DB=cuebot + ports: + - "5432:5432" + volumes: + - ./db-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U cuebot"] + interval: 5s + timeout: 5s + retries: 5 + + # Flyway for database migrations + flyway: + build: + context: ../ + dockerfile: ./sandbox/flyway.Dockerfile + container_name: opencue-flyway + depends_on: + db: + condition: service_healthy + environment: + - PGUSER=cuebot + - PGPASSWORD=cuebot_password + - PGDATABASE=cuebot + - PGHOST=db + - PGPORT=5432 + command: /opt/scripts/migrate.sh + + # Cuebot with monitoring enabled + cuebot: + build: + context: ../ + dockerfile: ./cuebot/Dockerfile + container_name: opencue-cuebot + depends_on: + db: + condition: service_healthy + flyway: + condition: service_completed_successfully + kafka: + condition: service_healthy + ports: + - "8443:8443" + - "8080:8080" + environment: + - CUEBOT_DB_HOST=db + - CUEBOT_DB_NAME=cuebot + - CUEBOT_DB_USER=cuebot + - CUEBOT_DB_PASS=cuebot_password + command: + - java + - -jar + - /opt/opencue/cuebot-latest.jar + - --datasource.cue-data-source.jdbc-url=jdbc:postgresql://db/cuebot + - 
--datasource.cue-data-source.username=cuebot + - --datasource.cue-data-source.password=cuebot_password + - --monitoring.kafka.enabled=true + - --monitoring.kafka.bootstrap.servers=kafka:29092 + - --metrics.prometheus.collector=true + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/health"] + interval: 10s + timeout: 5s + retries: 10 + + # Kafka to Elasticsearch indexer (Rust service) + kafka-es-indexer: + build: + context: ../rust + dockerfile: ./crates/kafka-es-indexer/Dockerfile + container_name: opencue-kafka-es-indexer + depends_on: + kafka: + condition: service_healthy + elasticsearch: + condition: service_healthy + environment: + - KAFKA_BOOTSTRAP_SERVERS=kafka:29092 + - KAFKA_GROUP_ID=opencue-elasticsearch-indexer + - ELASTICSEARCH_URL=http://elasticsearch:9200 + - ELASTICSEARCH_INDEX_PREFIX=opencue + - LOG_LEVEL=info + restart: unless-stopped + + # RQD (render queue daemon) + rqd: + image: opencue/rqd + container_name: opencue-rqd + depends_on: + cuebot: + condition: service_healthy + environment: + - CUEBOT_HOSTNAME=cuebot + volumes: + - ./rqd/logs:/tmp/rqd_logs + - ./rqd/shots:/shots + +volumes: + elasticsearch-data: + prometheus-data: + grafana-data: diff --git a/sandbox/kibana-queries.md b/sandbox/kibana-queries.md new file mode 100644 index 000000000..4cd2a11cd --- /dev/null +++ b/sandbox/kibana-queries.md @@ -0,0 +1,630 @@ +# OpenCue Monitoring - Kibana Query Reference + +This document provides sample Kibana Dev Tools queries for exploring OpenCue monitoring data stored in Elasticsearch. 
+ +**Access Kibana Dev Tools:** http://localhost:5601/app/dev_tools#/console + +## Index Overview + +```json +# List all OpenCue indices with stats +GET /_cat/indices/opencue-*?v&s=index + +# Get total document counts +GET /opencue-frame-events-*/_count +GET /opencue-job-events-*/_count +GET /opencue-layer-events-*/_count +GET /opencue-proc-events-*/_count +GET /opencue-host-events-*/_count +``` + +## Pickup Time Tracking + +Pickup time measures how long frames wait between becoming ready (DEPEND->WAITING) and starting execution (WAITING->RUNNING). + +### FRAME_STARTED Events (WAITING -> RUNNING) + +These events are published when a frame is dispatched to a host and begins execution. + +```json +# Count FRAME_STARTED events +GET /opencue-frame-events-*/_count +{ + "query": { + "match": { + "header.event_type": "FRAME_STARTED" + } + } +} + +# Get recent FRAME_STARTED events +GET /opencue-frame-events-*/_search +{ + "query": { + "match": { + "header.event_type": "FRAME_STARTED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20, + "_source": [ + "header.timestamp", + "frame_name", + "job_name", + "host_name", + "previous_state", + "state", + "num_cores", + "reserved_memory" + ] +} + +# FRAME_STARTED events in last hour +GET /opencue-frame-events-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "header.event_type": "FRAME_STARTED" } }, + { "range": { "header.timestamp": { "gte": "now-1h" } } } + ] + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 50 +} +``` + +### FRAME_DISPATCHED Events (DEPEND -> WAITING) + +These events are published when a frame's dependencies are satisfied and it becomes ready for dispatch. 
+ +```json +# Count FRAME_DISPATCHED events +GET /opencue-frame-events-*/_count +{ + "query": { + "match": { + "header.event_type": "FRAME_DISPATCHED" + } + } +} + +# Get recent FRAME_DISPATCHED events +GET /opencue-frame-events-*/_search +{ + "query": { + "match": { + "header.event_type": "FRAME_DISPATCHED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20, + "_source": [ + "header.timestamp", + "frame_name", + "frame_number", + "job_id", + "layer_id", + "previous_state", + "state", + "dispatch_order" + ] +} +``` + +### Pickup Time Analysis + +```json +# Both pickup time event types in last hour +GET /opencue-frame-events-*/_search +{ + "query": { + "bool": { + "must": [ + { + "terms": { + "header.event_type": ["FRAME_STARTED", "FRAME_DISPATCHED"] + } + }, + { + "range": { + "header.timestamp": { "gte": "now-1h" } + } + } + ] + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 100 +} + +# Pickup events histogram over time +GET /opencue-frame-events-*/_search +{ + "size": 0, + "query": { + "bool": { + "must": [ + { + "terms": { + "header.event_type": ["FRAME_STARTED", "FRAME_DISPATCHED"] + } + }, + { + "range": { + "header.timestamp": { "gte": "now-6h" } + } + } + ] + } + }, + "aggs": { + "events_over_time": { + "date_histogram": { + "field": "header.timestamp", + "fixed_interval": "5m" + }, + "aggs": { + "by_type": { + "terms": { + "field": "header.event_type" + } + } + } + } + } +} +``` + +## Frame Events + +### Event Type Summary + +```json +# Aggregate all frame events by type +GET /opencue-frame-events-*/_search +{ + "size": 0, + "aggs": { + "by_event_type": { + "terms": { + "field": "header.event_type" + } + } + } +} +``` + +### FRAME_COMPLETED Events + +```json +# Recent completed frames +GET /opencue-frame-events-*/_search +{ + "query": { + "match": { + "header.event_type": "FRAME_COMPLETED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20, + "_source": [ + 
"header.timestamp", + "frame_name", + "job_name", + "host_name", + "exit_status", + "run_time", + "max_rss", + "reserved_memory" + ] +} + +# Completed frames runtime statistics +GET /opencue-frame-events-*/_search +{ + "size": 0, + "query": { + "match": { + "header.event_type": "FRAME_COMPLETED" + } + }, + "aggs": { + "runtime_stats": { + "stats": { + "field": "run_time" + } + }, + "memory_stats": { + "stats": { + "field": "max_rss" + } + } + } +} +``` + +### FRAME_FAILED Events + +```json +# Recent failed frames +GET /opencue-frame-events-*/_search +{ + "query": { + "match": { + "header.event_type": "FRAME_FAILED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20, + "_source": [ + "header.timestamp", + "frame_name", + "job_name", + "host_name", + "exit_status", + "exit_signal", + "retry_count", + "reason" + ] +} + +# Failed frames by host (find problematic hosts) +GET /opencue-frame-events-*/_search +{ + "size": 0, + "query": { + "match": { + "header.event_type": "FRAME_FAILED" + } + }, + "aggs": { + "by_host": { + "terms": { + "field": "host_name", + "size": 20 + } + } + } +} +``` + +### State Transitions + +```json +# Frame state transitions summary +GET /opencue-frame-events-*/_search +{ + "size": 0, + "aggs": { + "transitions": { + "composite": { + "sources": [ + { "from": { "terms": { "field": "previous_state" } } }, + { "to": { "terms": { "field": "state" } } } + ] + } + } + } +} +``` + +## Job Events + +```json +# Recent job events +GET /opencue-job-events-*/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20 +} + +# Job events by type +GET /opencue-job-events-*/_search +{ + "size": 0, + "aggs": { + "by_event_type": { + "terms": { + "field": "header.event_type" + } + } + } +} + +# Search for a specific job +GET /opencue-job-events-*/_search +{ + "query": { + "match": { + "job_name": "testing-testshot-rfigueiredo_load_test_job_0093" + } + }, + "sort": [ 
+ { "header.timestamp": { "order": "asc" } } + ] +} + +# Jobs by show +GET /opencue-job-events-*/_search +{ + "size": 0, + "aggs": { + "by_show": { + "terms": { + "field": "show" + } + } + } +} + +# Recently completed jobs +GET /opencue-job-events-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "header.event_type": "JOB_COMPLETED" } }, + { "range": { "header.timestamp": { "gte": "now-1h" } } } + ] + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20 +} +``` + +## Proc Events + +```json +# Proc events by type +GET /opencue-proc-events-*/_search +{ + "size": 0, + "aggs": { + "by_event_type": { + "terms": { + "field": "header.event_type" + } + } + } +} + +# Recent proc bookings +GET /opencue-proc-events-*/_search +{ + "query": { + "match": { + "header.event_type": "PROC_BOOKED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20, + "_source": [ + "header.timestamp", + "host_name", + "job_id", + "frame_id", + "reserved_cores", + "reserved_memory" + ] +} + +# Proc unbookings (frames finished or killed) +GET /opencue-proc-events-*/_search +{ + "query": { + "match": { + "header.event_type": "PROC_UNBOOKED" + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 20 +} +``` + +## Layer Events + +```json +# Layer events summary +GET /opencue-layer-events-*/_search +{ + "size": 0, + "aggs": { + "by_event_type": { + "terms": { + "field": "header.event_type" + } + } + } +} + +# Layers by type (Render, Util, etc.) 
+GET /opencue-layer-events-*/_search +{ + "size": 0, + "aggs": { + "by_layer_type": { + "terms": { + "field": "type" + } + } + } +} +``` + +## Host Events + +```json +# Recent host events +GET /opencue-host-events-*/_search +{ + "query": { + "match_all": {} + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + "size": 10 +} +``` + +## Time-Based Analytics + +```json +# Frame events histogram (per minute, last hour) +GET /opencue-frame-events-*/_search +{ + "size": 0, + "query": { + "range": { + "header.timestamp": { "gte": "now-1h" } + } + }, + "aggs": { + "events_over_time": { + "date_histogram": { + "field": "header.timestamp", + "fixed_interval": "1m" + }, + "aggs": { + "by_type": { + "terms": { + "field": "header.event_type" + } + } + } + } + } +} + +# Events by show over time +GET /opencue-frame-events-*/_search +{ + "size": 0, + "query": { + "range": { + "header.timestamp": { "gte": "now-24h" } + } + }, + "aggs": { + "by_show": { + "terms": { + "field": "show" + }, + "aggs": { + "over_time": { + "date_histogram": { + "field": "header.timestamp", + "fixed_interval": "1h" + } + } + } + } + } +} +``` + +## Correlation Queries + +```json +# Track a job's complete frame lifecycle using correlation_id (job_id) +# Replace YOUR-JOB-ID with an actual job ID +GET /opencue-frame-events-*/_search +{ + "query": { + "term": { + "header.correlation_id": "YOUR-JOB-ID-HERE" + } + }, + "sort": [ + { "header.timestamp": { "order": "asc" } } + ], + "size": 100 +} + +# All events for a specific frame +GET /opencue-frame-events-*/_search +{ + "query": { + "term": { + "frame_id": "YOUR-FRAME-ID-HERE" + } + }, + "sort": [ + { "header.timestamp": { "order": "asc" } } + ] +} + +# Frames dispatched to a specific host +GET /opencue-frame-events-*/_search +{ + "query": { + "bool": { + "must": [ + { "match": { "header.event_type": "FRAME_STARTED" } }, + { "match": { "host_name": "172.19.0.11" } } + ] + } + }, + "sort": [ + { "header.timestamp": { "order": "desc" } } + ], + 
"size": 50 +} +``` + +## Index Mapping Reference + +```json +# View frame events mapping +GET /opencue-frame-events-*/_mapping + +# View the index template +GET /_index_template/opencue-frame-events +``` + +## Notes + +- **Timestamp Field**: All events use `header.timestamp` as the time field (epoch_millis format) +- **Event Types**: Use `header.event_type` to filter by event type +- **Correlation**: Use `header.correlation_id` (typically the job_id) to track related events +- **Keyword Fields**: For exact matching and aggregations, use the field name directly (ES auto-detects keyword sub-fields) + +## Grafana Dashboard + +Access the pre-built monitoring dashboard at: http://localhost:3000/d/opencue-monitoring + +The dashboard includes: +- Frames Completed/Failed stats +- Jobs Completed by Show +- Layer Runtime/Memory distributions +- **Pickup Time Metrics** (NEW): + - Frames Started (WAITING -> RUNNING) + - Frames Dispatchable (DEPEND -> WAITING) + - Pickup Time Events Over Time + - Recent FRAME_STARTED/FRAME_DISPATCHED tables diff --git a/sandbox/load_test_jobs.py b/sandbox/load_test_jobs.py new file mode 100644 index 000000000..334c9b9ff --- /dev/null +++ b/sandbox/load_test_jobs.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# Copyright Contributors to the OpenCue Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Load test script to submit jobs to OpenCue for monitoring testing. 
+ +Usage: + python load_test_jobs.py # Uses defaults: 1000 jobs, batch size 50 + python load_test_jobs.py -n 100 # Submit 100 jobs + python load_test_jobs.py -n 500 -b 25 # Submit 500 jobs in batches of 25 + python load_test_jobs.py --num-jobs 100 --batch-size 10 +""" + +import argparse +import time + +import outline +from outline.modules.shell import Shell + +DEFAULT_NUM_JOBS = 1000 +DEFAULT_BATCH_SIZE = 50 + + +def submit_jobs(num_jobs, batch_size): + print(f"Submitting {num_jobs} jobs to OpenCue (batch size: {batch_size})...") + print("-" * 60) + + submitted = 0 + failed = 0 + + for i in range(num_jobs): + job_name = f'load_test_job_{i:04d}' + try: + ol = outline.Outline(job_name, shot='testshot', show='testing') + # Create a simple layer with 1-3 frames + num_frames = (i % 3) + 1 + layer = Shell('test_layer', + command=['/bin/sleep', str((i % 5) + 1)], # Sleep 1-5 seconds + range=f'1-{num_frames}') + ol.add_layer(layer) + outline.cuerun.launch(ol, use_pycuerun=False) + submitted += 1 + + # Progress indicator + if (i + 1) % 10 == 0: + print(f"Submitted {i + 1}/{num_jobs} jobs ({submitted} successful, {failed} failed)") + + # Small delay between batches to avoid overwhelming the system + if (i + 1) % batch_size == 0: + print(f" Batch complete, pausing briefly...") + time.sleep(1) + + except Exception as e: + failed += 1 + print(f" Failed to submit job {job_name}: {e}") + + print("-" * 60) + print(f"Load test complete!") + print(f" Submitted: {submitted}") + print(f" Failed: {failed}") + print(f" Total frames: ~{submitted * 2}") # Average 2 frames per job + + return submitted, failed + + +def main(): + parser = argparse.ArgumentParser( + description='Load test script to submit jobs to OpenCue for monitoring testing.' 
+ ) + parser.add_argument( + '-n', '--num-jobs', + type=int, + default=DEFAULT_NUM_JOBS, + help=f'Number of jobs to submit (default: {DEFAULT_NUM_JOBS})' + ) + parser.add_argument( + '-b', '--batch-size', + type=int, + default=DEFAULT_BATCH_SIZE, + help=f'Batch size for submission pauses (default: {DEFAULT_BATCH_SIZE})' + ) + + args = parser.parse_args() + submit_jobs(args.num_jobs, args.batch_size) + + +if __name__ == '__main__': + main() diff --git a/sandbox/monitor_events.py b/sandbox/monitor_events.py new file mode 100644 index 000000000..242dbaa2e --- /dev/null +++ b/sandbox/monitor_events.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright Contributors to the OpenCue Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Simple Kafka consumer for OpenCue monitoring events. 
+ +Requirements: + pip install kafka-python lz4 + +Usage: + python monitor_events.py +""" + +from kafka import KafkaConsumer +import json +from datetime import datetime + +# Connect to Kafka +# Note: The cuebot producer uses lz4 compression, so the lz4 library must be installed +consumer = KafkaConsumer( + 'opencue.frame.events', + 'opencue.job.events', + bootstrap_servers=['localhost:9092'], + value_deserializer=lambda m: json.loads(m.decode('utf-8')), + auto_offset_reset='earliest', + group_id='tutorial-consumer' +) + +print("Listening for OpenCue events...") +print("-" * 60) + +for message in consumer: + event = message.value + + # Events have a 'header' field containing event metadata + header = event.get('header', {}) + event_type = header.get('event_type', 'UNKNOWN') + timestamp = header.get('timestamp', '') + + # Convert timestamp from milliseconds to readable format + if timestamp: + try: + dt = datetime.fromtimestamp(int(timestamp) / 1000) + timestamp = dt.strftime('%Y-%m-%d %H:%M:%S') + except (ValueError, OSError): + pass + + # Format output based on event type + if event_type.startswith('FRAME_'): + job_name = event.get('job_name', 'N/A') + frame_name = event.get('frame_name', 'N/A') + state = event.get('state', 'N/A') + print(f"[{timestamp}] {event_type}") + print(f" Job: {job_name}") + print(f" Frame: {frame_name}") + print(f" State: {state}") + if event_type == 'FRAME_COMPLETED': + runtime = event.get('run_time', 0) + print(f" Runtime: {runtime}s") + elif event_type == 'FRAME_FAILED': + exit_status = event.get('exit_status', -1) + print(f" Exit Status: {exit_status}") + print() + + elif event_type.startswith('JOB_'): + job_name = event.get('job_name', 'N/A') + show_name = event.get('show', 'N/A') + print(f"[{timestamp}] {event_type}") + print(f" Job: {job_name}") + print(f" Show: {show_name}") + print()