Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.13
1.14
22 changes: 22 additions & 0 deletions proto/src/host.proto
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,26 @@ enum ThreadMode {

// -------- Primary Message Types --------

// Static inventory record for one physical (or integrated) GPU on a host.
// One entry per device; paired with GpuUsage (same id) for live telemetry.
message GpuDevice {
  string id = 1;                       // Host-local device index as a string (e.g. "0", "1"); GpuUsage.device_id refers to this.
  string vendor = 2;                   // Vendor name, e.g. "NVIDIA", "AMD", "Apple", "Intel".
  string model = 3;                    // Marketing model name, e.g. "Tesla V100", "Apple M3 Max".
  uint64 memory_bytes = 4;             // Total on-device memory, in bytes.
  string pci_bus = 5;                  // PCI bus ID (e.g. "0000:01:00.0"), or the literal "integrated" for SoC GPUs.
  string driver_version = 6;           // Installed driver version string.
  string cuda_version = 7;             // For NVIDIA: CUDA compute capability (e.g. "7.0"); for Apple: Metal version. Despite the name, not the CUDA toolkit version.
  map<string, string> attributes = 8;  // Extensible vendor/site-specific metadata; iteration order is undefined.
}

// Point-in-time utilization telemetry for one GPU device.
// Joined to the host's inventory via device_id == GpuDevice.id.
message GpuUsage {
  string device_id = 1;                // Matches GpuDevice.id on the same host.
  uint32 utilization_pct = 2;          // Compute utilization, 0-100.
  uint64 memory_used_bytes = 3;        // Device memory currently in use, in bytes.
  // Temperature in degrees Celsius. Declared `optional` for explicit presence:
  // without it, proto3 cannot distinguish "sensor reported 0 C" from
  // "no sensor reading". Adding `optional` is wire-compatible in proto3.
  optional uint32 temperature_c = 4;
}

message Deed {
string id = 1;
string host = 2;
Expand Down Expand Up @@ -274,6 +294,7 @@ message Host {
ThreadMode thread_mode = 27;
float gpus = 28;
float idle_gpus = 29;
repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
}

message HostSearchCriteria {
Expand Down Expand Up @@ -321,6 +342,7 @@ message NestedHost {
NestedProcSeq procs = 28;
float gpus = 29;
float idle_gpus = 30;
repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
}

message NestedHostSeq {
Expand Down
8 changes: 8 additions & 0 deletions proto/src/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";

import "comment.proto";
import "depend.proto";
import "host.proto";
import "limit.proto";
import "renderPartition.proto";

// Note: GpuUsage is defined in host.proto

// Job related messages and services
// This includes Job, Layer, Frame, and Group objects

Expand Down Expand Up @@ -520,6 +523,7 @@ message Frame {
int64 max_gpu_memory = 21;
int64 used_gpu_memory = 22;
FrameStateDisplayOverride frame_state_display_override = 23;
repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
}

// Object for frame searching
Expand Down Expand Up @@ -566,6 +570,7 @@ message UpdatedFrame {
int64 max_gpu_memory = 11;
int64 used_gpu_memory = 12;
FrameStateDisplayOverride frame_state_display_override = 13;
repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
}

message UpdatedFrameSeq {
Expand Down Expand Up @@ -714,6 +719,9 @@ message Layer {
float min_gpus = 20;
float max_gpus = 21;
string command = 22;
string gpu_vendor = 23; // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
uint64 min_gpu_memory_bytes = 25; // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
}

message LayerSeq {
Expand Down
6 changes: 5 additions & 1 deletion proto/src/report.proto
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";

import "host.proto";

// Note: GpuDevice and GpuUsage are defined in host.proto

// Interface to handle RQD pings.


Expand Down Expand Up @@ -82,9 +84,10 @@ message RenderHost {
repeated string tags = 15; // an array of default tags that are added to the host record
host.HardwareState state = 16; // hardware state for the host
map<string, string> attributes = 17; // additional data can be provided about the host
int32 num_gpus = 18; // the number of physical GPU's
int32 num_gpus = 18; // the number of physical GPU's (legacy, use gpu_devices for details)
int64 free_gpu_mem = 19; // the current amount of free gpu memory in kB
int64 total_gpu_mem = 20; // the total size of gpu memory in kB
repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
};

message RunningFrameInfo {
Expand All @@ -107,6 +110,7 @@ message RunningFrameInfo {
int64 used_gpu_memory = 17; // kB
ChildrenProcStats children = 18; //additional data about the running frame's child processes
int64 used_swap_memory = 19; // kB
repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
};

message ChildrenProcStats {
Expand Down
3 changes: 2 additions & 1 deletion rqd/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ dependencies = [
"opencue_proto",
"psutil==5.9.8",
"pynput==1.7.6",
"future==1.0.0"
"future==1.0.0",
"pynvml>=11.5.0"
]
requires-python = ">3.7"
description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."
Expand Down
6 changes: 5 additions & 1 deletion rqd/rqd/rqcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,11 @@ def __createEnvVariables(self):

# Add GPU's to use all assigned GPU cores
if 'GPU_LIST' in self.runFrame.attributes:
self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
gpu_list = self.runFrame.attributes['GPU_LIST']
self.frameEnv['CUE_GPU_CORES'] = gpu_list
# Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list

# pylint: disable=inconsistent-return-statements
def _createCommandFile(self, command):
Expand Down
Loading
Loading