diff --git a/VERSION.in b/VERSION.in
index d3456a90f..63738cc28 100644
--- a/VERSION.in
+++ b/VERSION.in
@@ -1 +1 @@
-1.13
+1.14
diff --git a/proto/src/host.proto b/proto/src/host.proto
index b321fdec1..feea069eb 100644
--- a/proto/src/host.proto
+++ b/proto/src/host.proto
@@ -225,6 +225,26 @@ enum ThreadMode {
 
 // -------- Primary Message Types --------]
 
+// GPU device information for detailed GPU inventory
+message GpuDevice {
+    string id = 1;                      // Device ID (e.g., "0", "1")
+    string vendor = 2;                  // "NVIDIA", "AMD", "Apple", "Intel"
+    string model = 3;                   // "Tesla V100", "Apple M3 Max", etc.
+    uint64 memory_bytes = 4;            // Total memory in bytes
+    string pci_bus = 5;                 // PCI bus ID (e.g., "0000:01:00.0") or "integrated"
+    string driver_version = 6;          // Driver version
+    string cuda_version = 7;            // CUDA driver version (e.g., "12.4"); "N/A" for non-CUDA devices
+    map<string, string> attributes = 8; // Extensible metadata
+}
+
+// Per-GPU utilization telemetry
+message GpuUsage {
+    string device_id = 1;               // Matches GpuDevice.id
+    uint32 utilization_pct = 2;         // 0-100
+    uint64 memory_used_bytes = 3;       // Current memory usage in bytes
+    uint32 temperature_c = 4;           // Temperature in Celsius (optional)
+}
+
 message Deed {
     string id = 1;
     string host = 2;
@@ -274,6 +294,7 @@ message Host {
     ThreadMode thread_mode = 27;
     float gpus = 28;
     float idle_gpus = 29;
+    repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
 }
 
 message HostSearchCriteria {
@@ -321,6 +342,7 @@ message NestedHost {
     NestedProcSeq procs = 28;
     float gpus = 29;
    float idle_gpus = 30;
+    repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
 }
 
 message NestedHostSeq {
diff --git a/proto/src/job.proto b/proto/src/job.proto
index 4c76308fa..a5babcac0 100644
--- a/proto/src/job.proto
+++ b/proto/src/job.proto
@@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";
 
 import "comment.proto";
 import "depend.proto";
+import "host.proto";
 import "limit.proto";
 import "renderPartition.proto";
 
+// Note: GpuUsage is defined in host.proto
+
 // Job related messages and services
 // This includes Job, Layer, Frame, and Group objects
 
@@ -520,6 +523,7 @@ message Frame {
     int64 max_gpu_memory = 21;
     int64 used_gpu_memory = 22;
     FrameStateDisplayOverride frame_state_display_override = 23;
+    repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
 }
 
 // Object for frame searching
@@ -566,6 +570,7 @@ message UpdatedFrame {
     int64 max_gpu_memory = 11;
     int64 used_gpu_memory = 12;
     FrameStateDisplayOverride frame_state_display_override = 13;
+    repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
 }
 
 message UpdatedFrameSeq {
@@ -714,6 +719,9 @@ message Layer {
     float min_gpus = 20;
     float max_gpus = 21;
     string command = 22;
+    string gpu_vendor = 23;                  // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
+    repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
+    uint64 min_gpu_memory_bytes = 25;        // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
 }
 
 message LayerSeq {
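Illustrative sketch (not part of the patch): once the protos above are regenerated, the new messages compose as shown below. The module path opencue_proto.host_pb2 matches the import style used by RQD later in this diff; all field values are made up for illustration.

    import opencue_proto.host_pb2 as host_pb2

    # A detailed per-device inventory entry, as a host would report it.
    device = host_pb2.GpuDevice(
        id="0",
        vendor="NVIDIA",
        model="Tesla V100",
        memory_bytes=32 * 1024 ** 3,
        pci_bus="0000:01:00.0",
        driver_version="550.54.14",   # illustrative value
        cuda_version="12.4",          # illustrative value
    )
    device.attributes["nvlink"] = "true"  # free-form, extensible metadata

    # A telemetry sample keyed back to the inventory entry by device_id.
    usage = host_pb2.GpuUsage(
        device_id=device.id,
        utilization_pct=87,
        memory_used_bytes=21 * 1024 ** 3,
        temperature_c=71,
    )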
diff --git a/proto/src/report.proto b/proto/src/report.proto
index 6ace5708e..205bfafdf 100644
--- a/proto/src/report.proto
+++ b/proto/src/report.proto
@@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";
 
 import "host.proto";
 
+// Note: GpuDevice and GpuUsage are defined in host.proto
+
 // Interface to handle RQD pings.
 
@@ -82,9 +84,10 @@ message RenderHost {
    repeated string tags = 15;           // an array of default tags that are added to the host record
    host.HardwareState state = 16;       // hardware state for the host
    map<string, string> attributes = 17; // additional data can be provided about the host
-    int32 num_gpus = 18;                // the number of physical GPU's
+    int32 num_gpus = 18;                // the number of physical GPUs (legacy, use gpu_devices for details)
    int64 free_gpu_mem = 19;             // the current amount of free gpu memory in kB
    int64 total_gpu_mem = 20;            // the total size of gpu memory in kB
+    repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
 };
 
 message RunningFrameInfo {
@@ -107,6 +110,7 @@ message RunningFrameInfo {
    int64 used_gpu_memory = 17;          // kB
    ChildrenProcStats children = 18;     //additional data about the running frame's child processes
    int64 used_swap_memory = 19;         // kB
+    repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
 };
 
 message ChildrenProcStats {
diff --git a/rqd/pyproject.toml b/rqd/pyproject.toml
index 48bab66a2..a27766265 100644
--- a/rqd/pyproject.toml
+++ b/rqd/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
     "opencue_proto",
     "psutil==5.9.8",
     "pynput==1.7.6",
-    "future==1.0.0"
+    "future==1.0.0",
+    "pynvml>=11.5.0"
 ]
 requires-python = ">3.7"
 description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."
diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py
index e60d49365..4e4b03b09 100644
--- a/rqd/rqd/rqcore.py
+++ b/rqd/rqd/rqcore.py
@@ -820,7 +820,11 @@ def __createEnvVariables(self):
 
         # Add GPU's to use all assigned GPU cores
         if 'GPU_LIST' in self.runFrame.attributes:
-            self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
+            gpu_list = self.runFrame.attributes['GPU_LIST']
+            self.frameEnv['CUE_GPU_CORES'] = gpu_list
+            # Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
+            self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
+            self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list
 
     # pylint: disable=inconsistent-return-statements
     def _createCommandFile(self, command):
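Illustrative sketch (not part of the patch): with the rqcore.py change above, a frame assigned GPU_LIST="0,2" now starts with CUE_GPU_CORES, CUDA_VISIBLE_DEVICES, and NVIDIA_VISIBLE_DEVICES all set to "0,2", so CUDA applications and NVIDIA container runtimes only enumerate the assigned devices. What the frame process would see (values shown for illustration only):

    import os

    # Inside a frame launched with GPU_LIST="0,2"
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")   # "0,2"
    assigned = [d for d in visible.split(",") if d]        # ["0", "2"]
    # CUDA renumbers the visible devices: logical device 0 inside the frame is
    # physical GPU "0", and logical device 1 is physical GPU "2".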
diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py
index 610df78ee..83d4235b6 100644
--- a/rqd/rqd/rqmachine.py
+++ b/rqd/rqd/rqmachine.py
@@ -27,6 +27,7 @@
 import codecs
 import ctypes
 import errno
+import json
 import logging
 import math
 import os
@@ -64,6 +65,172 @@
 KILOBYTE = 1024
 
 
+# ===== GPU Discovery Abstraction =====
+
+class GpuDiscovery(object):
+    """Abstract GPU discovery interface."""
+
+    def detect_devices(self):
+        """Returns list of GpuDevice proto messages."""
+        raise NotImplementedError
+
+    def get_utilization(self, device_id):
+        """Returns GpuUsage proto message."""
+        raise NotImplementedError
+
+
+class NvidiaGpuDiscovery(GpuDiscovery):
+    """NVIDIA GPU discovery using NVML (preferred) or nvidia-smi fallback."""
+
+    def __init__(self):
+        self.use_nvml = False
+        try:
+            import pynvml  # pylint: disable=import-outside-toplevel
+            pynvml.nvmlInit()
+            self.pynvml = pynvml
+            self.use_nvml = True
+            log.info("Using NVML for NVIDIA GPU discovery")
+        except Exception as e:  # pylint: disable=broad-except
+            log.warning("NVML unavailable, falling back to nvidia-smi: %s", e)
+
+    def detect_devices(self):
+        """Detect NVIDIA GPUs via NVML or nvidia-smi."""
+        if self.use_nvml:
+            return self._detect_via_nvml()
+        return self._detect_via_smi()
+
+    def _detect_via_nvml(self):
+        """Use pynvml for detailed GPU metadata."""
+        devices = []
+        device_count = self.pynvml.nvmlDeviceGetCount()
+        for i in range(device_count):
+            handle = self.pynvml.nvmlDeviceGetHandleByIndex(i)
+            name = self.pynvml.nvmlDeviceGetName(handle)
+            if isinstance(name, bytes):
+                name = name.decode('utf-8')
+            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
+            pci_info = self.pynvml.nvmlDeviceGetPciInfo(handle)
+            driver_version = self.pynvml.nvmlSystemGetDriverVersion()
+            if isinstance(driver_version, bytes):
+                driver_version = driver_version.decode('utf-8')
+            cuda_version = self.pynvml.nvmlSystemGetCudaDriverVersion()
+            pci_bus = pci_info.busId
+            if isinstance(pci_bus, bytes):
+                pci_bus = pci_bus.decode('utf-8')
+
+            # Build GpuDevice proto
+            gpu_device = opencue_proto.host_pb2.GpuDevice(
+                id=str(i),
+                vendor="NVIDIA",
+                model=name,
+                memory_bytes=mem_info.total,
+                pci_bus=pci_bus,
+                driver_version=driver_version,
+                cuda_version="{}.{}".format(cuda_version // 1000, (cuda_version % 1000) // 10),
+            )
+            devices.append(gpu_device)
+        return devices
+
+    def _detect_via_smi(self):
+        """Fallback to nvidia-smi."""
+        devices = []
+        try:
+            output = subprocess.check_output(
+                ['nvidia-smi', '--query-gpu=index,name,memory.total,pci.bus_id,driver_version',
+                 '--format=csv,noheader,nounits'],
+                encoding='utf-8'
+            )
+            for line in output.strip().splitlines():
+                parts = [p.strip() for p in line.split(',')]
+                idx, name, mem_mb, pci, driver = parts
+                gpu_device = opencue_proto.host_pb2.GpuDevice(
+                    id=idx,
+                    vendor="NVIDIA",
+                    model=name,
+                    memory_bytes=int(float(mem_mb) * 1048576),  # MB -> bytes
+                    pci_bus=pci,
+                    driver_version=driver,
+                )
+                devices.append(gpu_device)
+        except Exception as e:
+            log.error("nvidia-smi GPU detection failed: %s", e)
+        return devices
+
+    def get_utilization(self, device_id):
+        """Get current utilization for a device."""
+        if not self.use_nvml:
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+        try:
+            handle = self.pynvml.nvmlDeviceGetHandleByIndex(int(device_id))
+            util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
+            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id,
+                utilization_pct=util.gpu,
+                memory_used_bytes=mem_info.used,
+            )
+        except Exception as e:
+            log.warning("Failed to get GPU utilization for device %s: %s", device_id, e)
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+
+class AppleMetalGpuDiscovery(GpuDiscovery):
+    """macOS Apple Silicon GPU discovery via system_profiler."""
+
+    def detect_devices(self):
+        """Detect Apple GPUs via system_profiler."""
+        devices = []
+        try:
+            output = subprocess.check_output(
+                ['system_profiler', 'SPDisplaysDataType', '-json'],
+                encoding='utf-8'
+            )
+            data = json.loads(output)
+
+            # Parse SPDisplaysDataType for GPU info
+            displays = data.get('SPDisplaysDataType', [])
+            gpu_idx = 0
+            for display in displays:
+                chipset_model = display.get('sppci_model', 'Unknown')
+                vram = display.get('spdisplays_vram', '0 MB')
+                vram_bytes = self._parse_vram(vram)
+
+                # Apple GPUs are integrated, so treat each entry as a single device
+                gpu_device = opencue_proto.host_pb2.GpuDevice(
+                    id=str(gpu_idx),
+                    vendor="Apple",
+                    model=chipset_model,
+                    memory_bytes=vram_bytes,
+                    pci_bus="integrated",
+                    driver_version="Metal",
+                    cuda_version="N/A",
+                )
+                gpu_device.attributes['metal_supported'] = 'true'
+                devices.append(gpu_device)
+                gpu_idx += 1
+        except Exception as e:
+            log.error("Apple GPU detection failed: %s", e)
+        return devices
+
+    def _parse_vram(self, vram_str):
+        """Parse '16 GB' or '16384 MB' to bytes."""
+        match = re.match(r'(\d+)\s*(GB|MB)', vram_str)
+        if match:
+            val, unit = match.groups()
+            if unit == 'GB':
+                return int(val) * 1024 * 1024 * 1024
+            if unit == 'MB':
+                return int(val) * 1024 * 1024
+        return 0
+
+    def get_utilization(self, device_id):
+        """Apple Metal does not expose per-process GPU utilization; return empty."""
+        return opencue_proto.host_pb2.GpuUsage(
+            device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+
 class Machine(object):
     """Gathers information about the machine and resources"""
 
     def __init__(self, rqCore, coreInfo):
@@ -76,6 +243,7 @@ def __init__(self, rqCore, coreInfo):
         self.__rqCore = rqCore
         self.__coreInfo = coreInfo
         self.__gpusets = set()
+        self.__gpu_discovery = None
 
         # A dictionary built from /proc/cpuinfo containing
         # { <physical id> : { <core_id> : set([<processor>, <processor>, ...]), ... }, ... }
@@ -183,7 +351,14 @@ def isUserLoggedIn(self):
 
     def __updateGpuAndLlu(self, frame):
         if 'GPU_LIST' in frame.runFrame.attributes:
             usedGpuMemory = 0
+            # Clear previous GPU usage and collect fresh data
+            frame.gpuUsage = []
             for unitId in frame.runFrame.attributes.get('GPU_LIST').split(','):
+                # Collect per-device GPU usage
+                gpu_usage = self.getGpuUtilization(unitId)
+                frame.gpuUsage.append(gpu_usage)
+
+                # Legacy memory tracking (backward compatibility)
                 usedGpuMemory += self.getGpuMemoryUsed(unitId)
             frame.usedGpuMemory = usedGpuMemory
@@ -866,6 +1041,12 @@ def updateMachineStats(self):
             self.__renderHost.total_gpu_mem = self.getGpuMemoryTotal()
             self.__renderHost.free_gpu_mem = self.getGpuMemoryFree()
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
             self.__renderHost.attributes['swapout'] = self.__getSwapout()
 
         elif platform.system() == 'Darwin':
@@ -874,6 +1055,12 @@ def updateMachineStats(self):
             mcpStat = os.statvfs(self.getTempPath())
             self.__renderHost.free_mcp = (mcpStat.f_bavail * mcpStat.f_bsize) // KILOBYTE
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
         elif platform.system() == 'Windows':
             TEMP_DEFAULT = 1048576
             stats = self.getWindowsMemory()
@@ -884,6 +1071,12 @@ def updateMachineStats(self):
             self.__renderHost.total_gpu_mem = self.getGpuMemoryTotal()
             self.__renderHost.free_gpu_mem = self.getGpuMemoryFree()
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
         # Updates dynamic information
         self.__renderHost.load = self.getLoadAvg()
         self.__renderHost.nimby_enabled = self.__rqCore.nimby.is_ready
@@ -934,8 +1127,33 @@ def setupTaskset(self):
 
     def setupGpu(self):
         """ Setup rqd for Gpus """
+        if rqd.rqconstants.ALLOW_GPU:
+            self.__gpu_discovery = self.__init_gpu_discovery()
         self.__gpusets = set(range(self.getGpuCount()))
 
+    def __init_gpu_discovery(self):
+        """Initialize platform-specific GPU discovery."""
+        if platform.system() == 'Linux':
+            return NvidiaGpuDiscovery()
+        if platform.system() == 'Darwin':
+            return AppleMetalGpuDiscovery()
+        if platform.system() == 'Windows':
+            return NvidiaGpuDiscovery()  # Assume NVIDIA on Windows for now
+        return None
+
+    def getGpuDevices(self):
+        """Return list of GpuDevice protos."""
+        if not self.__gpu_discovery:
+            return []
+        return self.__gpu_discovery.detect_devices()
+
+    def getGpuUtilization(self, device_id):
+        """Return GpuUsage proto for a device."""
+        if not self.__gpu_discovery:
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+        return self.__gpu_discovery.get_utilization(device_id)
+
     def reserveHT(self, frameCores):
         """ Reserve cores for use by taskset
         taskset -c 0,1,8,9 COMMAND
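Illustrative sketch (not part of the patch): the discovery classes above are plain module-level classes in rqd.rqmachine, so the parsing helpers can be exercised in isolation, e.g. in a unit test. Importing rqd.rqmachine pulls in RQD's other dependencies; the assertions below only illustrate the intended behavior of _parse_vram as written above.

    from rqd.rqmachine import AppleMetalGpuDiscovery

    disco = AppleMetalGpuDiscovery()
    assert disco._parse_vram('16 GB') == 16 * 1024 ** 3
    assert disco._parse_vram('16384 MB') == 16384 * 1024 ** 2
    assert disco._parse_vram('unknown') == 0   # unparseable strings fall back to 0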
diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py
index 4490c53fa..bf49d5df5 100644
--- a/rqd/rqd/rqnetwork.py
+++ b/rqd/rqd/rqnetwork.py
@@ -76,6 +76,7 @@ def __init__(self, rqCore, runFrame):
 
         self.usedGpuMemory = 0
         self.maxUsedGpuMemory = 0
+        self.gpuUsage = []  # List of GpuUsage protos
 
         self.usedSwapMemory = 0
 
@@ -110,6 +111,8 @@ def runningFrameInfo(self):
             children=self._serializeChildrenProcs(),
             used_swap_memory=self.usedSwapMemory,
         )
+        # Add per-device GPU usage
+        runningFrameInfo.gpu_usage.extend(self.gpuUsage)
         return runningFrameInfo
 
     def _serializeChildrenProcs(self):
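Illustrative sketch (not part of the patch): a consumer of the enriched RunningFrameInfo report could summarize the new repeated gpu_usage field as below. The field names come from report.proto and host.proto above; the aggregation itself is hypothetical, not part of this change.

    def summarize_gpu_usage(running_frame_info):
        """Collapse per-device GpuUsage entries into a simple dict."""
        usage = list(running_frame_info.gpu_usage)
        return {
            'devices': len(usage),
            'memory_used_bytes': sum(u.memory_used_bytes for u in usage),
            'peak_utilization_pct': max((u.utilization_pct for u in usage), default=0),
        }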
diff --git a/rust/crates/rqd/Cargo.toml b/rust/crates/rqd/Cargo.toml
index 2cfef543a..a2ebea745 100644
--- a/rust/crates/rqd/Cargo.toml
+++ b/rust/crates/rqd/Cargo.toml
@@ -12,6 +12,8 @@ nimby = ["dep:device_query"]
 # Containerized Frames is a feature that allows rqd to run in a containerized environment.
 # This feature is highly experimental and may not be stable.
 containerized_frames = ["bollard"]
+# NVML support for NVIDIA GPU discovery and monitoring
+nvml = ["dep:nvml-wrapper"]
 
 [[bin]]
 path = "src/main.rs"
@@ -66,6 +68,7 @@ device_query = { version = "3.0", optional = true }
 pnet = "0.35.0"
 log = "0.4.27"
 ureq = { version = "3.1.0", features = ["json"] }
+nvml-wrapper = { version = "0.10", optional = true }
 
 [dev-dependencies]
 tempfile = "3.14.0"
diff --git a/rust/crates/rqd/src/system/gpu.rs b/rust/crates/rqd/src/system/gpu.rs
new file mode 100644
index 000000000..6b4a836a6
--- /dev/null
+++ b/rust/crates/rqd/src/system/gpu.rs
@@ -0,0 +1,283 @@
+use miette::Result;
+use opencue_proto::host::{GpuDevice, GpuUsage};
+use std::collections::HashMap;
+use tracing::{info, warn};
+
+/// Abstract GPU discovery interface
+pub trait GpuDiscovery {
+    /// Detect GPU devices on this machine
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>>;
+
+    /// Get current utilization for a specific GPU device
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage>;
+}
+
+/// NVIDIA GPU discovery using NVML library
+pub struct NvidiaGpuDiscovery {
+    nvml_available: bool,
+}
+
+impl NvidiaGpuDiscovery {
+    pub fn new() -> Self {
+        let nvml_available = Self::check_nvml_available();
+        if nvml_available {
+            info!("Using NVML for NVIDIA GPU discovery");
+        } else {
+            warn!("NVML unavailable, GPU features will be limited");
+        }
+        Self { nvml_available }
+    }
+
+    fn check_nvml_available() -> bool {
+        #[cfg(feature = "nvml")]
+        {
+            match nvml_wrapper::Nvml::init() {
+                Ok(_) => true,
+                Err(e) => {
+                    warn!("NVML initialization failed: {}", e);
+                    false
+                }
+            }
+        }
+        #[cfg(not(feature = "nvml"))]
+        {
+            false
+        }
+    }
+
+    #[cfg(feature = "nvml")]
+    fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
+        use nvml_wrapper::Nvml;
+
+        let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
+        let device_count = nvml
+            .device_count()
+            .map_err(|e| miette::miette!("Failed to get device count: {}", e))?;
+
+        let mut devices = Vec::new();
+        for i in 0..device_count {
+            match nvml.device_by_index(i) {
+                Ok(device) => {
+                    let name = device.name().unwrap_or_else(|_| "Unknown".to_string());
+                    let memory_info = device.memory_info().ok();
+                    let pci_info = device.pci_info().ok();
+                    let driver_version =
+                        nvml.sys_driver_version().unwrap_or_else(|_| "Unknown".to_string());
+                    let cuda_version = nvml.sys_cuda_driver_version().ok();
+
+                    let gpu_device = GpuDevice {
+                        id: i.to_string(),
+                        vendor: "NVIDIA".to_string(),
+                        model: name,
+                        memory_bytes: memory_info.map(|m| m.total).unwrap_or(0),
+                        pci_bus: pci_info.map(|p| p.bus_id).unwrap_or_else(|| "Unknown".to_string()),
+                        driver_version,
+                        cuda_version: cuda_version
+                            .map(|v| format!("{}.{}", v / 1000, (v % 1000) / 10))
+                            .unwrap_or_else(|| "Unknown".to_string()),
+                        attributes: HashMap::new(),
+                    };
+                    devices.push(gpu_device);
+                }
+                Err(e) => {
+                    warn!("Failed to get device {}: {}", i, e);
+                }
+            }
+        }
+        Ok(devices)
+    }
+
+    #[cfg(not(feature = "nvml"))]
+    fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
+        Ok(Vec::new())
+    }
+
+    fn detect_via_smi(&self) -> Result<Vec<GpuDevice>> {
+        use std::process::Command;
+
+        let output = Command::new("nvidia-smi")
+            .args(&[
+                "--query-gpu=index,name,memory.total,pci.bus_id,driver_version",
+                "--format=csv,noheader,nounits",
+            ])
+            .output()
+            .map_err(|e| miette::miette!("Failed to run nvidia-smi: {}", e))?;
+
+        if !output.status.success() {
+            return Err(miette::miette!("nvidia-smi command failed"));
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let mut devices = Vec::new();
+
+        for line in stdout.lines() {
+            let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
+            if parts.len() >= 5 {
+                let memory_mb: f64 = parts[2].parse().unwrap_or(0.0);
+                let memory_bytes = (memory_mb * 1_048_576.0) as u64; // MB to bytes
+
+                let gpu_device = GpuDevice {
+                    id: parts[0].to_string(),
+                    vendor: "NVIDIA".to_string(),
+                    model: parts[1].to_string(),
+                    memory_bytes,
+                    pci_bus: parts[3].to_string(),
+                    driver_version: parts[4].to_string(),
+                    cuda_version: "Unknown".to_string(),
+                    attributes: HashMap::new(),
+                };
+                devices.push(gpu_device);
+            }
+        }
+
+        Ok(devices)
+    }
+}
+
+impl GpuDiscovery for NvidiaGpuDiscovery {
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
+        if self.nvml_available {
+            self.detect_via_nvml()
+        } else {
+            self.detect_via_smi()
+        }
+    }
+
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
+        #[cfg(feature = "nvml")]
+        {
+            if self.nvml_available {
+                use nvml_wrapper::Nvml;
+
+                let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
+                let index: u32 = device_id.parse().map_err(|e| miette::miette!("Invalid device ID: {}", e))?;
+                let device = nvml.device_by_index(index).map_err(|e| miette::miette!("Device not found: {}", e))?;
+
+                let utilization = device.utilization_rates().ok();
+                let memory_info = device.memory_info().ok();
+                let temperature = device
+                    .temperature(nvml_wrapper::enum_wrappers::device::TemperatureSensor::Gpu)
+                    .ok();
+
+                return Ok(GpuUsage {
+                    device_id: device_id.to_string(),
+                    utilization_pct: utilization.map(|u| u.gpu).unwrap_or(0),
+                    memory_used_bytes: memory_info.map(|m| m.used).unwrap_or(0),
+                    temperature_c: temperature.unwrap_or(0),
+                });
+            }
+        }
+
+        // Fallback: return empty usage
+        Ok(GpuUsage {
+            device_id: device_id.to_string(),
+            utilization_pct: 0,
+            memory_used_bytes: 0,
+            temperature_c: 0,
+        })
+    }
+}
+
+/// Apple Metal GPU discovery for macOS
+pub struct AppleMetalGpuDiscovery;
+
+impl AppleMetalGpuDiscovery {
+    pub fn new() -> Self {
+        Self
+    }
+
+    fn parse_vram(vram_str: &str) -> u64 {
+        // Parse strings like "16 GB" or "16384 MB" to bytes
+        let parts: Vec<&str> = vram_str.split_whitespace().collect();
+        if parts.len() >= 2 {
+            if let Ok(value) = parts[0].parse::<u64>() {
+                match parts[1] {
+                    "GB" => return value * 1024 * 1024 * 1024,
+                    "MB" => return value * 1024 * 1024,
+                    _ => {}
+                }
+            }
+        }
+        0
+    }
+}
+
+impl GpuDiscovery for AppleMetalGpuDiscovery {
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
+        use std::process::Command;
+
+        let output = Command::new("system_profiler")
+            .args(&["SPDisplaysDataType", "-json"])
+            .output()
+            .map_err(|e| miette::miette!("Failed to run system_profiler: {}", e))?;
+
+        if !output.status.success() {
+            return Err(miette::miette!("system_profiler command failed"));
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let json_data: serde_json::Value = serde_json::from_str(&stdout)
+            .map_err(|e| miette::miette!("Failed to parse JSON: {}", e))?;
+
+        let mut devices = Vec::new();
+        let mut gpu_idx = 0;
+
+        if let Some(displays) = json_data["SPDisplaysDataType"].as_array() {
+            for display in displays {
+                let chipset_model = display["sppci_model"]
+                    .as_str()
+                    .unwrap_or("Unknown")
+                    .to_string();
+                let vram = display["spdisplays_vram"]
+                    .as_str()
+                    .unwrap_or("0 MB")
+                    .to_string();
+                let vram_bytes = Self::parse_vram(&vram);
+
+                let mut attributes = HashMap::new();
+                attributes.insert("metal_supported".to_string(), "true".to_string());
+
+                let gpu_device = GpuDevice {
+                    id: gpu_idx.to_string(),
+                    vendor: "Apple".to_string(),
+                    model: chipset_model,
+                    memory_bytes: vram_bytes,
+                    pci_bus: "integrated".to_string(),
+                    driver_version: "Metal".to_string(),
+                    cuda_version: "N/A".to_string(),
+                    attributes,
+                };
+                devices.push(gpu_device);
+                gpu_idx += 1;
+            }
+        }
+
+        Ok(devices)
+    }
+
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
+        // Apple Metal does not expose per-process GPU utilization
+        Ok(GpuUsage {
+            device_id: device_id.to_string(),
+            utilization_pct: 0,
+            memory_used_bytes: 0,
+            temperature_c: 0,
+        })
+    }
+}
+
+/// Factory function to create the appropriate GPU discovery backend for this platform
+pub fn create_gpu_discovery() -> Option<Box<dyn GpuDiscovery>> {
+    #[cfg(target_os = "linux")]
+    {
+        Some(Box::new(NvidiaGpuDiscovery::new()))
+    }
+
+    #[cfg(target_os = "macos")]
+    {
+        Some(Box::new(AppleMetalGpuDiscovery::new()))
+    }
+
+    #[cfg(target_os = "windows")]
+    {
+        Some(Box::new(NvidiaGpuDiscovery::new()))
+    }
+
+    #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
+    {
+        None
+    }
+}
diff --git a/rust/crates/rqd/src/system/linux.rs b/rust/crates/rqd/src/system/linux.rs
index 28cca98c5..1c600554d 100644
--- a/rust/crates/rqd/src/system/linux.rs
+++ b/rust/crates/rqd/src/system/linux.rs
@@ -771,12 +771,48 @@ impl SystemManager for LinuxSystem {
     }
 
     fn collect_gpu_stats(&self) -> MachineGpuStats {
-        // TODO: missing implementation, returning dummy val
-        MachineGpuStats {
-            count: 0,
-            total_memory: 0,
-            free_memory: 0,
-            _used_memory_by_unit: HashMap::default(),
+        use crate::system::gpu;
+
+        // Create GPU discovery backend
+        let gpu_discovery = gpu::create_gpu_discovery();
+
+        if let Some(discovery) = gpu_discovery {
+            // Detect GPU devices
+            match discovery.detect_devices() {
+                Ok(devices) => {
+                    let count = devices.len() as u32;
+                    let total_memory: u64 = devices.iter().map(|d| d.memory_bytes).sum();
+
+                    // Note: free_memory calculation would require querying each device
+                    // For now, we'll set it to 0 and let the detailed gpu_devices provide the info
+                    MachineGpuStats {
+                        count,
+                        total_memory,
+                        free_memory: 0, // Legacy field, use gpu_devices for detailed info
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: devices,
+                    }
+                }
+                Err(e) => {
+                    warn!("Failed to detect GPU devices: {}", e);
+                    MachineGpuStats {
+                        count: 0,
+                        total_memory: 0,
+                        free_memory: 0,
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: Vec::new(),
+                    }
+                }
+            }
+        } else {
+            // No GPU discovery available for this platform
+            MachineGpuStats {
+                count: 0,
+                total_memory: 0,
+                free_memory: 0,
+                _used_memory_by_unit: HashMap::default(),
+                gpu_devices: Vec::new(),
+            }
         }
     }
 
     fn create_user_if_unexisting(&self, username: &str, uid: u32, gid: u32) -> Result {
diff --git a/rust/crates/rqd/src/system/machine.rs b/rust/crates/rqd/src/system/machine.rs
index 8b646625f..55a812e61 100644
--- a/rust/crates/rqd/src/system/machine.rs
+++ b/rust/crates/rqd/src/system/machine.rs
@@ -464,6 +464,7 @@ impl MachineMonitor {
             num_gpus: gpu_stats.count as i32,
             free_gpu_mem: gpu_stats.free_memory as i64,
             total_gpu_mem: gpu_stats.total_memory as i64,
+            gpu_devices: gpu_stats.gpu_devices,
         })
     }
 }
diff --git a/rust/crates/rqd/src/system/macos.rs b/rust/crates/rqd/src/system/macos.rs
index 6db2afd1d..8f3cd9ce4 100644
--- a/rust/crates/rqd/src/system/macos.rs
+++ b/rust/crates/rqd/src/system/macos.rs
@@ -664,12 +664,48 @@ impl SystemManager for MacOsSystem {
     }
 
     fn collect_gpu_stats(&self) -> MachineGpuStats {
-        // TODO: missing implementation, returning dummy val
-        MachineGpuStats {
-            count: 0,
-            total_memory: 0,
-            free_memory: 0,
-            _used_memory_by_unit: HashMap::default(),
+        use crate::system::gpu;
+
+        // Create GPU discovery backend (Apple Metal on macOS)
+        let gpu_discovery = gpu::create_gpu_discovery();
+
+        if let Some(discovery) = gpu_discovery {
+            // Detect GPU devices
+            match discovery.detect_devices() {
+                Ok(devices) => {
+                    let count = devices.len() as u32;
+                    let total_memory: u64 = devices.iter().map(|d| d.memory_bytes).sum();
+
+                    // Note: free_memory calculation would require querying each device
+                    // For now, we'll set it to 0 and let the detailed gpu_devices provide the info
+                    MachineGpuStats {
+                        count,
+                        total_memory,
+                        free_memory: 0, // Legacy field, use gpu_devices for detailed info
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: devices,
+                    }
+                }
+                Err(e) => {
+                    warn!("Failed to detect GPU devices: {}", e);
+                    MachineGpuStats {
+                        count: 0,
+                        total_memory: 0,
+                        free_memory: 0,
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: Vec::new(),
+                    }
+                }
+            }
+        } else {
+            // No GPU discovery available for this platform
+            MachineGpuStats {
+                count: 0,
+                total_memory: 0,
+                free_memory: 0,
+                _used_memory_by_unit: HashMap::default(),
+                gpu_devices: Vec::new(),
+            }
         }
     }
diff --git a/rust/crates/rqd/src/system/manager.rs b/rust/crates/rqd/src/system/manager.rs
index 23087dbc6..69765ebfa 100644
--- a/rust/crates/rqd/src/system/manager.rs
+++ b/rust/crates/rqd/src/system/manager.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use miette::{Diagnostic, Result};
-use opencue_proto::{host::HardwareState, report::ChildrenProcStats};
+use opencue_proto::{host::{GpuDevice, GpuUsage, HardwareState}, report::ChildrenProcStats};
 use thiserror::Error;
 use tracing::error;
 use uuid::Uuid;
@@ -99,6 +99,8 @@ pub struct MachineGpuStats {
     pub free_memory: u64,
     /// Used memory by unit of each GPU, where the key in the HashMap is the unit ID, and the value is the used memory
     pub _used_memory_by_unit: HashMap<u32, u64>,
+    /// Detailed GPU device inventory
+    pub gpu_devices: Vec<GpuDevice>,
 }
 
 /// Tracks memory and runtime statistics for a rendering process and its children.
@@ -118,6 +120,8 @@ pub struct ProcessStats {
     pub max_used_gpu_memory: u64,
     /// Current GPU memory usage (KB).
     pub used_gpu_memory: u64,
+    /// Per-device GPU usage statistics
+    pub gpu_usage: Vec<GpuUsage>,
     /// Additional data about the running frame's child processes.
     pub children: Option<ChildrenProcStats>,
     /// Unix timestamp denoting the start time of the frame process.
@@ -136,6 +140,7 @@ impl Default for ProcessStats {
             llu_time: 0,
             max_used_gpu_memory: 0,
             used_gpu_memory: 0,
+            gpu_usage: Vec::new(),
             children: None,
             epoch_start_time: std::time::SystemTime::now()
                 .duration_since(std::time::UNIX_EPOCH)
@@ -157,6 +162,7 @@ impl ProcessStats {
             vsize: new.vsize,
             llu_time: new.llu_time,
             used_gpu_memory: new.used_gpu_memory,
+            gpu_usage: new.gpu_usage,
             children: new.children,
             epoch_start_time: new.epoch_start_time,
         };
diff --git a/rust/crates/rqd/src/system/mod.rs b/rust/crates/rqd/src/system/mod.rs
index add1f2792..8de52bec6 100644
--- a/rust/crates/rqd/src/system/mod.rs
+++ b/rust/crates/rqd/src/system/mod.rs
@@ -1,5 +1,6 @@
 use uuid::Uuid;
 
+pub mod gpu;
 pub mod linux;
 pub mod machine;
 #[cfg(feature = "nimby")]
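Illustrative sketch (not part of the patch): the new Layer fields (gpu_vendor, gpu_models_allowed, min_gpu_memory_bytes) and Host.gpu_devices are only proto plumbing in this diff; scheduler-side matching is not included here. A hypothetical check that dispatch logic could perform against the regenerated protos might look like this (a sketch, not Cuebot's actual algorithm):

    def host_satisfies_layer_gpu(host, layer):
        """Return True if any GPU on the host matches the layer's GPU filters.

        `host` and `layer` are host_pb2.Host / job_pb2.Layer messages; the
        matching policy below is illustrative only.
        """
        if not (layer.gpu_vendor or layer.gpu_models_allowed or layer.min_gpu_memory_bytes):
            return True  # no GPU constraints on this layer
        for dev in host.gpu_devices:
            if layer.gpu_vendor and dev.vendor != layer.gpu_vendor:
                continue
            if layer.gpu_models_allowed and dev.model not in layer.gpu_models_allowed:
                continue
            if layer.min_gpu_memory_bytes and dev.memory_bytes < layer.min_gpu_memory_bytes:
                continue
            return True
        return False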