diff --git a/VERSION.in b/VERSION.in
index d3456a90f..63738cc28 100644
--- a/VERSION.in
+++ b/VERSION.in
@@ -1 +1 @@
-1.13
+1.14
diff --git a/proto/src/host.proto b/proto/src/host.proto
index b321fdec1..feea069eb 100644
--- a/proto/src/host.proto
+++ b/proto/src/host.proto
@@ -225,6 +225,26 @@ enum ThreadMode {
 
 // -------- Primary Message Types --------]
 
+// GPU device information for detailed GPU inventory
+message GpuDevice {
+    string id = 1;                      // Device ID (e.g., "0", "1")
+    string vendor = 2;                  // "NVIDIA", "AMD", "Apple", "Intel"
+    string model = 3;                   // "Tesla V100", "Apple M3 Max", etc.
+    uint64 memory_bytes = 4;            // Total memory in bytes
+    string pci_bus = 5;                 // PCI bus ID (e.g., "0000:01:00.0") or "integrated"
+    string driver_version = 6;          // Driver version
+    string cuda_version = 7;            // CUDA driver version (e.g., "12.4"); "N/A" for non-CUDA devices
+    map<string, string> attributes = 8; // Extensible metadata
+}
+
+// Per-GPU utilization telemetry
+message GpuUsage {
+    string device_id = 1;               // Matches GpuDevice.id
+    uint32 utilization_pct = 2;         // 0-100
+    uint64 memory_used_bytes = 3;       // Current memory usage in bytes
+    uint32 temperature_c = 4;           // Temperature in Celsius (optional)
+}
+
 message Deed {
     string id = 1;
     string host = 2;
@@ -274,6 +294,7 @@ message Host {
     ThreadMode thread_mode = 27;
     float gpus = 28;
     float idle_gpus = 29;
+    repeated GpuDevice gpu_devices = 30; // Detailed GPU inventory (backward compatible)
 }
 
 message HostSearchCriteria {
@@ -321,6 +342,7 @@ message NestedHost {
     NestedProcSeq procs = 28;
     float gpus = 29;
    float idle_gpus = 30;
+    repeated GpuDevice gpu_devices = 31; // Detailed GPU inventory (backward compatible)
 }
 
 message NestedHostSeq {
diff --git a/proto/src/job.proto b/proto/src/job.proto
index 4c76308fa..a5babcac0 100644
--- a/proto/src/job.proto
+++ b/proto/src/job.proto
@@ -9,9 +9,12 @@ option go_package = "opencue_gateway/gen/go";
 
 import "comment.proto";
 import "depend.proto";
+import "host.proto";
 import "limit.proto";
 import "renderPartition.proto";
 
+// Note: GpuUsage is defined in host.proto
+
 // Job related messages and services
 // This includes Job, Layer, Frame, and Group objects
 
@@ -520,6 +523,7 @@ message Frame {
     int64 max_gpu_memory = 21;
     int64 used_gpu_memory = 22;
     FrameStateDisplayOverride frame_state_display_override = 23;
+    repeated host.GpuUsage gpu_usage = 24; // Per-device GPU usage snapshot
 }
 
 // Object for frame searching
@@ -566,6 +570,7 @@ message UpdatedFrame {
     int64 max_gpu_memory = 11;
     int64 used_gpu_memory = 12;
     FrameStateDisplayOverride frame_state_display_override = 13;
+    repeated host.GpuUsage gpu_usage = 14; // Per-device GPU usage snapshot
 }
 
 message UpdatedFrameSeq {
@@ -714,6 +719,9 @@ message Layer {
     float min_gpus = 20;
     float max_gpus = 21;
     string command = 22;
+    string gpu_vendor = 23;                  // GPU vendor filter: "NVIDIA", "AMD", "Apple", "" (any)
+    repeated string gpu_models_allowed = 24; // GPU model whitelist: ["Tesla V100", "A100"], empty = any
+    uint64 min_gpu_memory_bytes = 25;        // Minimum GPU memory per device in bytes (more precise than min_gpu_memory)
 }
 
 message LayerSeq {
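Illustrative sketch (not part of the patch): once the protos above are regenerated, the new messages compose as shown below. The module path opencue_proto.host_pb2 matches the import style used by RQD later in this diff; all field values are made up for illustration.

    import opencue_proto.host_pb2 as host_pb2

    # A detailed per-device inventory entry, as a host would report it.
    device = host_pb2.GpuDevice(
        id="0",
        vendor="NVIDIA",
        model="Tesla V100",
        memory_bytes=32 * 1024 ** 3,
        pci_bus="0000:01:00.0",
        driver_version="550.54.14",   # illustrative value
        cuda_version="12.4",          # illustrative value
    )
    device.attributes["nvlink"] = "true"  # free-form, extensible metadata

    # A telemetry sample keyed back to the inventory entry by device_id.
    usage = host_pb2.GpuUsage(
        device_id=device.id,
        utilization_pct=87,
        memory_used_bytes=21 * 1024 ** 3,
        temperature_c=71,
    )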
diff --git a/proto/src/report.proto b/proto/src/report.proto
index 6ace5708e..205bfafdf 100644
--- a/proto/src/report.proto
+++ b/proto/src/report.proto
@@ -9,6 +9,8 @@ option go_package = "opencue_gateway/gen/go";
 
 import "host.proto";
 
+// Note: GpuDevice and GpuUsage are defined in host.proto
+
 // Interface to handle RQD pings.
 
@@ -82,9 +84,10 @@ message RenderHost {
    repeated string tags = 15;           // an array of default tags that are added to the host record
    host.HardwareState state = 16;       // hardware state for the host
    map<string, string> attributes = 17; // additional data can be provided about the host
-    int32 num_gpus = 18;                // the number of physical GPU's
+    int32 num_gpus = 18;                // the number of physical GPUs (legacy, use gpu_devices for details)
    int64 free_gpu_mem = 19;             // the current amount of free gpu memory in kB
    int64 total_gpu_mem = 20;            // the total size of gpu memory in kB
+    repeated host.GpuDevice gpu_devices = 21; // Detailed GPU inventory
 };
 
 message RunningFrameInfo {
@@ -107,6 +110,7 @@ message RunningFrameInfo {
    int64 used_gpu_memory = 17;          // kB
    ChildrenProcStats children = 18;     //additional data about the running frame's child processes
    int64 used_swap_memory = 19;         // kB
+    repeated host.GpuUsage gpu_usage = 20; // Per-device GPU usage
 };
 
 message ChildrenProcStats {
diff --git a/rqd/pyproject.toml b/rqd/pyproject.toml
index 48bab66a2..a27766265 100644
--- a/rqd/pyproject.toml
+++ b/rqd/pyproject.toml
@@ -11,7 +11,8 @@ dependencies = [
     "opencue_proto",
     "psutil==5.9.8",
     "pynput==1.7.6",
-    "future==1.0.0"
+    "future==1.0.0",
+    "pynvml>=11.5.0"
 ]
 requires-python = ">3.7"
 description = "RQD is a software client that runs on all hosts doing work for an OpenCue deployment."
diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py
index e60d49365..4e4b03b09 100644
--- a/rqd/rqd/rqcore.py
+++ b/rqd/rqd/rqcore.py
@@ -820,7 +820,11 @@ def __createEnvVariables(self):
 
         # Add GPU's to use all assigned GPU cores
         if 'GPU_LIST' in self.runFrame.attributes:
-            self.frameEnv['CUE_GPU_CORES'] = self.runFrame.attributes['GPU_LIST']
+            gpu_list = self.runFrame.attributes['GPU_LIST']
+            self.frameEnv['CUE_GPU_CORES'] = gpu_list
+            # Set CUDA_VISIBLE_DEVICES and NVIDIA_VISIBLE_DEVICES for GPU isolation
+            self.frameEnv['CUDA_VISIBLE_DEVICES'] = gpu_list
+            self.frameEnv['NVIDIA_VISIBLE_DEVICES'] = gpu_list
 
     # pylint: disable=inconsistent-return-statements
     def _createCommandFile(self, command):
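Illustrative sketch (not part of the patch): with the rqcore.py change above, a frame assigned GPU_LIST="0,2" now starts with CUE_GPU_CORES, CUDA_VISIBLE_DEVICES, and NVIDIA_VISIBLE_DEVICES all set to "0,2", so CUDA applications and NVIDIA container runtimes only enumerate the assigned devices. What the frame process would see (values shown for illustration only):

    import os

    # Inside a frame launched with GPU_LIST="0,2"
    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "")   # "0,2"
    assigned = [d for d in visible.split(",") if d]        # ["0", "2"]
    # CUDA renumbers the visible devices: logical device 0 inside the frame is
    # physical GPU "0", and logical device 1 is physical GPU "2".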
diff --git a/rqd/rqd/rqmachine.py b/rqd/rqd/rqmachine.py
index 610df78ee..83d4235b6 100644
--- a/rqd/rqd/rqmachine.py
+++ b/rqd/rqd/rqmachine.py
@@ -27,6 +27,7 @@
 import codecs
 import ctypes
 import errno
+import json
 import logging
 import math
 import os
@@ -64,6 +65,172 @@
 KILOBYTE = 1024
 
 
+# ===== GPU Discovery Abstraction =====
+
+class GpuDiscovery(object):
+    """Abstract GPU discovery interface."""
+
+    def detect_devices(self):
+        """Returns list of GpuDevice proto messages."""
+        raise NotImplementedError
+
+    def get_utilization(self, device_id):
+        """Returns GpuUsage proto message."""
+        raise NotImplementedError
+
+
+class NvidiaGpuDiscovery(GpuDiscovery):
+    """NVIDIA GPU discovery using NVML (preferred) or nvidia-smi fallback."""
+
+    def __init__(self):
+        self.use_nvml = False
+        try:
+            import pynvml  # pylint: disable=import-outside-toplevel
+            pynvml.nvmlInit()
+            self.pynvml = pynvml
+            self.use_nvml = True
+            log.info("Using NVML for NVIDIA GPU discovery")
+        except Exception as e:  # pylint: disable=broad-except
+            log.warning("NVML unavailable, falling back to nvidia-smi: %s", e)
+
+    def detect_devices(self):
+        """Detect NVIDIA GPUs via NVML or nvidia-smi."""
+        if self.use_nvml:
+            return self._detect_via_nvml()
+        return self._detect_via_smi()
+
+    def _detect_via_nvml(self):
+        """Use pynvml for detailed GPU metadata."""
+        devices = []
+        device_count = self.pynvml.nvmlDeviceGetCount()
+        for i in range(device_count):
+            handle = self.pynvml.nvmlDeviceGetHandleByIndex(i)
+            name = self.pynvml.nvmlDeviceGetName(handle)
+            if isinstance(name, bytes):
+                name = name.decode('utf-8')
+            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
+            pci_info = self.pynvml.nvmlDeviceGetPciInfo(handle)
+            driver_version = self.pynvml.nvmlSystemGetDriverVersion()
+            if isinstance(driver_version, bytes):
+                driver_version = driver_version.decode('utf-8')
+            cuda_version = self.pynvml.nvmlSystemGetCudaDriverVersion()
+            pci_bus = pci_info.busId
+            if isinstance(pci_bus, bytes):
+                pci_bus = pci_bus.decode('utf-8')
+
+            # Build GpuDevice proto
+            gpu_device = opencue_proto.host_pb2.GpuDevice(
+                id=str(i),
+                vendor="NVIDIA",
+                model=name,
+                memory_bytes=mem_info.total,
+                pci_bus=pci_bus,
+                driver_version=driver_version,
+                cuda_version="{}.{}".format(cuda_version // 1000, (cuda_version % 1000) // 10),
+            )
+            devices.append(gpu_device)
+        return devices
+
+    def _detect_via_smi(self):
+        """Fallback to nvidia-smi."""
+        devices = []
+        try:
+            output = subprocess.check_output(
+                ['nvidia-smi', '--query-gpu=index,name,memory.total,pci.bus_id,driver_version',
+                 '--format=csv,noheader,nounits'],
+                encoding='utf-8'
+            )
+            for line in output.strip().splitlines():
+                parts = [p.strip() for p in line.split(',')]
+                idx, name, mem_mb, pci, driver = parts
+                gpu_device = opencue_proto.host_pb2.GpuDevice(
+                    id=idx,
+                    vendor="NVIDIA",
+                    model=name,
+                    memory_bytes=int(float(mem_mb) * 1048576),  # MB -> bytes
+                    pci_bus=pci,
+                    driver_version=driver,
+                )
+                devices.append(gpu_device)
+        except Exception as e:
+            log.error("nvidia-smi GPU detection failed: %s", e)
+        return devices
+
+    def get_utilization(self, device_id):
+        """Get current utilization for a device."""
+        if not self.use_nvml:
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+        try:
+            handle = self.pynvml.nvmlDeviceGetHandleByIndex(int(device_id))
+            util = self.pynvml.nvmlDeviceGetUtilizationRates(handle)
+            mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(handle)
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id,
+                utilization_pct=util.gpu,
+                memory_used_bytes=mem_info.used,
+            )
+        except Exception as e:
+            log.warning("Failed to get GPU utilization for device %s: %s", device_id, e)
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+
+class AppleMetalGpuDiscovery(GpuDiscovery):
+    """macOS Apple Silicon GPU discovery via system_profiler."""
+
+    def detect_devices(self):
+        """Detect Apple GPUs via system_profiler."""
+        devices = []
+        try:
+            output = subprocess.check_output(
+                ['system_profiler', 'SPDisplaysDataType', '-json'],
+                encoding='utf-8'
+            )
+            data = json.loads(output)
+
+            # Parse SPDisplaysDataType for GPU info
+            displays = data.get('SPDisplaysDataType', [])
+            gpu_idx = 0
+            for display in displays:
+                chipset_model = display.get('sppci_model', 'Unknown')
+                vram = display.get('spdisplays_vram', '0 MB')
+                vram_bytes = self._parse_vram(vram)
+
+                # Apple GPUs are integrated, so treat each entry as a single device
+                gpu_device = opencue_proto.host_pb2.GpuDevice(
+                    id=str(gpu_idx),
+                    vendor="Apple",
+                    model=chipset_model,
+                    memory_bytes=vram_bytes,
+                    pci_bus="integrated",
+                    driver_version="Metal",
+                    cuda_version="N/A",
+                )
+                gpu_device.attributes['metal_supported'] = 'true'
+                devices.append(gpu_device)
+                gpu_idx += 1
+        except Exception as e:
+            log.error("Apple GPU detection failed: %s", e)
+        return devices
+
+    def _parse_vram(self, vram_str):
+        """Parse '16 GB' or '16384 MB' to bytes."""
+        match = re.match(r'(\d+)\s*(GB|MB)', vram_str)
+        if match:
+            val, unit = match.groups()
+            if unit == 'GB':
+                return int(val) * 1024 * 1024 * 1024
+            if unit == 'MB':
+                return int(val) * 1024 * 1024
+        return 0
+
+    def get_utilization(self, device_id):
+        """Apple Metal does not expose per-process GPU utilization; return empty."""
+        return opencue_proto.host_pb2.GpuUsage(
+            device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+
+
 class Machine(object):
     """Gathers information about the machine and resources"""
 
     def __init__(self, rqCore, coreInfo):
@@ -76,6 +243,7 @@ def __init__(self, rqCore, coreInfo):
         self.__rqCore = rqCore
         self.__coreInfo = coreInfo
         self.__gpusets = set()
+        self.__gpu_discovery = None
 
         # A dictionary built from /proc/cpuinfo containing
         # { <physical id> : { <core_id> : set([<processor>, <processor>, ...]), ... }, ... }
@@ -183,7 +351,14 @@ def isUserLoggedIn(self):
 
     def __updateGpuAndLlu(self, frame):
         if 'GPU_LIST' in frame.runFrame.attributes:
             usedGpuMemory = 0
+            # Clear previous GPU usage and collect fresh data
+            frame.gpuUsage = []
             for unitId in frame.runFrame.attributes.get('GPU_LIST').split(','):
+                # Collect per-device GPU usage
+                gpu_usage = self.getGpuUtilization(unitId)
+                frame.gpuUsage.append(gpu_usage)
+
+                # Legacy memory tracking (backward compatibility)
                 usedGpuMemory += self.getGpuMemoryUsed(unitId)
             frame.usedGpuMemory = usedGpuMemory
@@ -866,6 +1041,12 @@ def updateMachineStats(self):
             self.__renderHost.total_gpu_mem = self.getGpuMemoryTotal()
             self.__renderHost.free_gpu_mem = self.getGpuMemoryFree()
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
             self.__renderHost.attributes['swapout'] = self.__getSwapout()
 
         elif platform.system() == 'Darwin':
@@ -874,6 +1055,12 @@ def updateMachineStats(self):
             mcpStat = os.statvfs(self.getTempPath())
             self.__renderHost.free_mcp = (mcpStat.f_bavail * mcpStat.f_bsize) // KILOBYTE
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
         elif platform.system() == 'Windows':
             TEMP_DEFAULT = 1048576
             stats = self.getWindowsMemory()
@@ -884,6 +1071,12 @@ def updateMachineStats(self):
             self.__renderHost.total_gpu_mem = self.getGpuMemoryTotal()
             self.__renderHost.free_gpu_mem = self.getGpuMemoryFree()
 
+            # Populate gpu_devices with new detailed GPU inventory
+            if rqd.rqconstants.ALLOW_GPU:
+                gpu_devices = self.getGpuDevices()
+                self.__renderHost.ClearField('gpu_devices')
+                self.__renderHost.gpu_devices.extend(gpu_devices)
+
         # Updates dynamic information
         self.__renderHost.load = self.getLoadAvg()
         self.__renderHost.nimby_enabled = self.__rqCore.nimby.is_ready
@@ -934,8 +1127,33 @@ def setupTaskset(self):
 
     def setupGpu(self):
         """ Setup rqd for Gpus """
+        if rqd.rqconstants.ALLOW_GPU:
+            self.__gpu_discovery = self.__init_gpu_discovery()
         self.__gpusets = set(range(self.getGpuCount()))
 
+    def __init_gpu_discovery(self):
+        """Initialize platform-specific GPU discovery."""
+        if platform.system() == 'Linux':
+            return NvidiaGpuDiscovery()
+        if platform.system() == 'Darwin':
+            return AppleMetalGpuDiscovery()
+        if platform.system() == 'Windows':
+            return NvidiaGpuDiscovery()  # Assume NVIDIA on Windows for now
+        return None
+
+    def getGpuDevices(self):
+        """Return list of GpuDevice protos."""
+        if not self.__gpu_discovery:
+            return []
+        return self.__gpu_discovery.detect_devices()
+
+    def getGpuUtilization(self, device_id):
+        """Return GpuUsage proto for a device."""
+        if not self.__gpu_discovery:
+            return opencue_proto.host_pb2.GpuUsage(
+                device_id=device_id, utilization_pct=0, memory_used_bytes=0)
+        return self.__gpu_discovery.get_utilization(device_id)
+
     def reserveHT(self, frameCores):
         """ Reserve cores for use by taskset
         taskset -c 0,1,8,9 COMMAND
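Illustrative sketch (not part of the patch): the discovery classes above are plain module-level classes in rqd.rqmachine, so the parsing helpers can be exercised in isolation, e.g. in a unit test. Importing rqd.rqmachine pulls in RQD's other dependencies; the assertions below only illustrate the intended behavior of _parse_vram as written above.

    from rqd.rqmachine import AppleMetalGpuDiscovery

    disco = AppleMetalGpuDiscovery()
    assert disco._parse_vram('16 GB') == 16 * 1024 ** 3
    assert disco._parse_vram('16384 MB') == 16384 * 1024 ** 2
    assert disco._parse_vram('unknown') == 0   # unparseable strings fall back to 0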
diff --git a/rqd/rqd/rqnetwork.py b/rqd/rqd/rqnetwork.py
index 4490c53fa..bf49d5df5 100644
--- a/rqd/rqd/rqnetwork.py
+++ b/rqd/rqd/rqnetwork.py
@@ -76,6 +76,7 @@ def __init__(self, rqCore, runFrame):
 
         self.usedGpuMemory = 0
         self.maxUsedGpuMemory = 0
+        self.gpuUsage = []  # List of GpuUsage protos
 
         self.usedSwapMemory = 0
 
@@ -110,6 +111,8 @@ def runningFrameInfo(self):
             children=self._serializeChildrenProcs(),
             used_swap_memory=self.usedSwapMemory,
         )
+        # Add per-device GPU usage
+        runningFrameInfo.gpu_usage.extend(self.gpuUsage)
         return runningFrameInfo
 
     def _serializeChildrenProcs(self):
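Illustrative sketch (not part of the patch): a consumer of the enriched RunningFrameInfo report could summarize the new repeated gpu_usage field as below. The field names come from report.proto and host.proto above; the aggregation itself is hypothetical, not part of this change.

    def summarize_gpu_usage(running_frame_info):
        """Collapse per-device GpuUsage entries into a simple dict."""
        usage = list(running_frame_info.gpu_usage)
        return {
            'devices': len(usage),
            'memory_used_bytes': sum(u.memory_used_bytes for u in usage),
            'peak_utilization_pct': max((u.utilization_pct for u in usage), default=0),
        }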
diff --git a/rust/crates/rqd/Cargo.toml b/rust/crates/rqd/Cargo.toml
index 2cfef543a..a2ebea745 100644
--- a/rust/crates/rqd/Cargo.toml
+++ b/rust/crates/rqd/Cargo.toml
@@ -12,6 +12,8 @@ nimby = ["dep:device_query"]
 # Containerized Frames is a feature that allows rqd to run in a containerized environment.
 # This feature is highly experimental and may not be stable.
 containerized_frames = ["bollard"]
+# NVML support for NVIDIA GPU discovery and monitoring
+nvml = ["dep:nvml-wrapper"]
 
 [[bin]]
 path = "src/main.rs"
@@ -66,6 +68,7 @@ device_query = { version = "3.0", optional = true }
 pnet = "0.35.0"
 log = "0.4.27"
 ureq = { version = "3.1.0", features = ["json"] }
+nvml-wrapper = { version = "0.10", optional = true }
 
 [dev-dependencies]
 tempfile = "3.14.0"
diff --git a/rust/crates/rqd/src/system/gpu.rs b/rust/crates/rqd/src/system/gpu.rs
new file mode 100644
index 000000000..6b4a836a6
--- /dev/null
+++ b/rust/crates/rqd/src/system/gpu.rs
@@ -0,0 +1,283 @@
+use miette::Result;
+use opencue_proto::host::{GpuDevice, GpuUsage};
+use std::collections::HashMap;
+use tracing::{info, warn};
+
+/// Abstract GPU discovery interface
+pub trait GpuDiscovery {
+    /// Detect GPU devices on this machine
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>>;
+
+    /// Get current utilization for a specific GPU device
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage>;
+}
+
+/// NVIDIA GPU discovery using NVML library
+pub struct NvidiaGpuDiscovery {
+    nvml_available: bool,
+}
+
+impl NvidiaGpuDiscovery {
+    pub fn new() -> Self {
+        let nvml_available = Self::check_nvml_available();
+        if nvml_available {
+            info!("Using NVML for NVIDIA GPU discovery");
+        } else {
+            warn!("NVML unavailable, GPU features will be limited");
+        }
+        Self { nvml_available }
+    }
+
+    fn check_nvml_available() -> bool {
+        #[cfg(feature = "nvml")]
+        {
+            match nvml_wrapper::Nvml::init() {
+                Ok(_) => true,
+                Err(e) => {
+                    warn!("NVML initialization failed: {}", e);
+                    false
+                }
+            }
+        }
+        #[cfg(not(feature = "nvml"))]
+        {
+            false
+        }
+    }
+
+    #[cfg(feature = "nvml")]
+    fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
+        use nvml_wrapper::Nvml;
+
+        let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
+        let device_count = nvml
+            .device_count()
+            .map_err(|e| miette::miette!("Failed to get device count: {}", e))?;
+
+        let mut devices = Vec::new();
+        for i in 0..device_count {
+            match nvml.device_by_index(i) {
+                Ok(device) => {
+                    let name = device.name().unwrap_or_else(|_| "Unknown".to_string());
+                    let memory_info = device.memory_info().ok();
+                    let pci_info = device.pci_info().ok();
+                    let driver_version =
+                        nvml.sys_driver_version().unwrap_or_else(|_| "Unknown".to_string());
+                    let cuda_version = nvml.sys_cuda_driver_version().ok();
+
+                    let gpu_device = GpuDevice {
+                        id: i.to_string(),
+                        vendor: "NVIDIA".to_string(),
+                        model: name,
+                        memory_bytes: memory_info.map(|m| m.total).unwrap_or(0),
+                        pci_bus: pci_info.map(|p| p.bus_id).unwrap_or_else(|| "Unknown".to_string()),
+                        driver_version,
+                        cuda_version: cuda_version
+                            .map(|v| format!("{}.{}", v / 1000, (v % 1000) / 10))
+                            .unwrap_or_else(|| "Unknown".to_string()),
+                        attributes: HashMap::new(),
+                    };
+                    devices.push(gpu_device);
+                }
+                Err(e) => {
+                    warn!("Failed to get device {}: {}", i, e);
+                }
+            }
+        }
+        Ok(devices)
+    }
+
+    #[cfg(not(feature = "nvml"))]
+    fn detect_via_nvml(&self) -> Result<Vec<GpuDevice>> {
+        Ok(Vec::new())
+    }
+
+    fn detect_via_smi(&self) -> Result<Vec<GpuDevice>> {
+        use std::process::Command;
+
+        let output = Command::new("nvidia-smi")
+            .args(&[
+                "--query-gpu=index,name,memory.total,pci.bus_id,driver_version",
+                "--format=csv,noheader,nounits",
+            ])
+            .output()
+            .map_err(|e| miette::miette!("Failed to run nvidia-smi: {}", e))?;
+
+        if !output.status.success() {
+            return Err(miette::miette!("nvidia-smi command failed"));
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let mut devices = Vec::new();
+
+        for line in stdout.lines() {
+            let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
+            if parts.len() >= 5 {
+                let memory_mb: f64 = parts[2].parse().unwrap_or(0.0);
+                let memory_bytes = (memory_mb * 1_048_576.0) as u64; // MB to bytes
+
+                let gpu_device = GpuDevice {
+                    id: parts[0].to_string(),
+                    vendor: "NVIDIA".to_string(),
+                    model: parts[1].to_string(),
+                    memory_bytes,
+                    pci_bus: parts[3].to_string(),
+                    driver_version: parts[4].to_string(),
+                    cuda_version: "Unknown".to_string(),
+                    attributes: HashMap::new(),
+                };
+                devices.push(gpu_device);
+            }
+        }
+
+        Ok(devices)
+    }
+}
+
+impl GpuDiscovery for NvidiaGpuDiscovery {
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
+        if self.nvml_available {
+            self.detect_via_nvml()
+        } else {
+            self.detect_via_smi()
+        }
+    }
+
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
+        #[cfg(feature = "nvml")]
+        {
+            if self.nvml_available {
+                use nvml_wrapper::Nvml;
+
+                let nvml = Nvml::init().map_err(|e| miette::miette!("NVML init failed: {}", e))?;
+                let index: u32 = device_id.parse().map_err(|e| miette::miette!("Invalid device ID: {}", e))?;
+                let device = nvml.device_by_index(index).map_err(|e| miette::miette!("Device not found: {}", e))?;
+
+                let utilization = device.utilization_rates().ok();
+                let memory_info = device.memory_info().ok();
+                let temperature = device
+                    .temperature(nvml_wrapper::enum_wrappers::device::TemperatureSensor::Gpu)
+                    .ok();
+
+                return Ok(GpuUsage {
+                    device_id: device_id.to_string(),
+                    utilization_pct: utilization.map(|u| u.gpu).unwrap_or(0),
+                    memory_used_bytes: memory_info.map(|m| m.used).unwrap_or(0),
+                    temperature_c: temperature.unwrap_or(0),
+                });
+            }
+        }
+
+        // Fallback: return empty usage
+        Ok(GpuUsage {
+            device_id: device_id.to_string(),
+            utilization_pct: 0,
+            memory_used_bytes: 0,
+            temperature_c: 0,
+        })
+    }
+}
+
+/// Apple Metal GPU discovery for macOS
+pub struct AppleMetalGpuDiscovery;
+
+impl AppleMetalGpuDiscovery {
+    pub fn new() -> Self {
+        Self
+    }
+
+    fn parse_vram(vram_str: &str) -> u64 {
+        // Parse strings like "16 GB" or "16384 MB" to bytes
+        let parts: Vec<&str> = vram_str.split_whitespace().collect();
+        if parts.len() >= 2 {
+            if let Ok(value) = parts[0].parse::<u64>() {
+                match parts[1] {
+                    "GB" => return value * 1024 * 1024 * 1024,
+                    "MB" => return value * 1024 * 1024,
+                    _ => {}
+                }
+            }
+        }
+        0
+    }
+}
+
+impl GpuDiscovery for AppleMetalGpuDiscovery {
+    fn detect_devices(&self) -> Result<Vec<GpuDevice>> {
+        use std::process::Command;
+
+        let output = Command::new("system_profiler")
+            .args(&["SPDisplaysDataType", "-json"])
+            .output()
+            .map_err(|e| miette::miette!("Failed to run system_profiler: {}", e))?;
+
+        if !output.status.success() {
+            return Err(miette::miette!("system_profiler command failed"));
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let json_data: serde_json::Value = serde_json::from_str(&stdout)
+            .map_err(|e| miette::miette!("Failed to parse JSON: {}", e))?;
+
+        let mut devices = Vec::new();
+        let mut gpu_idx = 0;
+
+        if let Some(displays) = json_data["SPDisplaysDataType"].as_array() {
+            for display in displays {
+                let chipset_model = display["sppci_model"]
+                    .as_str()
+                    .unwrap_or("Unknown")
+                    .to_string();
+                let vram = display["spdisplays_vram"]
+                    .as_str()
+                    .unwrap_or("0 MB")
+                    .to_string();
+                let vram_bytes = Self::parse_vram(&vram);
+
+                let mut attributes = HashMap::new();
+                attributes.insert("metal_supported".to_string(), "true".to_string());
+
+                let gpu_device = GpuDevice {
+                    id: gpu_idx.to_string(),
+                    vendor: "Apple".to_string(),
+                    model: chipset_model,
+                    memory_bytes: vram_bytes,
+                    pci_bus: "integrated".to_string(),
+                    driver_version: "Metal".to_string(),
+                    cuda_version: "N/A".to_string(),
+                    attributes,
+                };
+                devices.push(gpu_device);
+                gpu_idx += 1;
+            }
+        }
+
+        Ok(devices)
+    }
+
+    fn get_utilization(&self, device_id: &str) -> Result<GpuUsage> {
+        // Apple Metal does not expose per-process GPU utilization
+        Ok(GpuUsage {
+            device_id: device_id.to_string(),
+            utilization_pct: 0,
+            memory_used_bytes: 0,
+            temperature_c: 0,
+        })
+    }
+}
+
+/// Factory function to create the appropriate GPU discovery backend for this platform
+pub fn create_gpu_discovery() -> Option<Box<dyn GpuDiscovery>> {
+    #[cfg(target_os = "linux")]
+    {
+        Some(Box::new(NvidiaGpuDiscovery::new()))
+    }
+
+    #[cfg(target_os = "macos")]
+    {
+        Some(Box::new(AppleMetalGpuDiscovery::new()))
+    }
+
+    #[cfg(target_os = "windows")]
+    {
+        Some(Box::new(NvidiaGpuDiscovery::new()))
+    }
+
+    #[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
+    {
+        None
+    }
+}
diff --git a/rust/crates/rqd/src/system/linux.rs b/rust/crates/rqd/src/system/linux.rs
index 28cca98c5..1c600554d 100644
--- a/rust/crates/rqd/src/system/linux.rs
+++ b/rust/crates/rqd/src/system/linux.rs
@@ -771,12 +771,48 @@ impl SystemManager for LinuxSystem {
     }
 
     fn collect_gpu_stats(&self) -> MachineGpuStats {
-        // TODO: missing implementation, returning dummy val
-        MachineGpuStats {
-            count: 0,
-            total_memory: 0,
-            free_memory: 0,
-            _used_memory_by_unit: HashMap::default(),
+        use crate::system::gpu;
+
+        // Create GPU discovery backend
+        let gpu_discovery = gpu::create_gpu_discovery();
+
+        if let Some(discovery) = gpu_discovery {
+            // Detect GPU devices
+            match discovery.detect_devices() {
+                Ok(devices) => {
+                    let count = devices.len() as u32;
+                    let total_memory: u64 = devices.iter().map(|d| d.memory_bytes).sum();
+
+                    // Note: free_memory calculation would require querying each device
+                    // For now, we'll set it to 0 and let the detailed gpu_devices provide the info
+                    MachineGpuStats {
+                        count,
+                        total_memory,
+                        free_memory: 0, // Legacy field, use gpu_devices for detailed info
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: devices,
+                    }
+                }
+                Err(e) => {
+                    warn!("Failed to detect GPU devices: {}", e);
+                    MachineGpuStats {
+                        count: 0,
+                        total_memory: 0,
+                        free_memory: 0,
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: Vec::new(),
+                    }
+                }
+            }
+        } else {
+            // No GPU discovery available for this platform
+            MachineGpuStats {
+                count: 0,
+                total_memory: 0,
+                free_memory: 0,
+                _used_memory_by_unit: HashMap::default(),
+                gpu_devices: Vec::new(),
+            }
         }
     }
 
     fn create_user_if_unexisting(&self, username: &str, uid: u32, gid: u32) -> Result {
diff --git a/rust/crates/rqd/src/system/machine.rs b/rust/crates/rqd/src/system/machine.rs
index 8b646625f..55a812e61 100644
--- a/rust/crates/rqd/src/system/machine.rs
+++ b/rust/crates/rqd/src/system/machine.rs
@@ -464,6 +464,7 @@ impl MachineMonitor {
             num_gpus: gpu_stats.count as i32,
             free_gpu_mem: gpu_stats.free_memory as i64,
             total_gpu_mem: gpu_stats.total_memory as i64,
+            gpu_devices: gpu_stats.gpu_devices,
         })
     }
 }
diff --git a/rust/crates/rqd/src/system/macos.rs b/rust/crates/rqd/src/system/macos.rs
index 6db2afd1d..8f3cd9ce4 100644
--- a/rust/crates/rqd/src/system/macos.rs
+++ b/rust/crates/rqd/src/system/macos.rs
@@ -664,12 +664,48 @@ impl SystemManager for MacOsSystem {
     }
 
     fn collect_gpu_stats(&self) -> MachineGpuStats {
-        // TODO: missing implementation, returning dummy val
-        MachineGpuStats {
-            count: 0,
-            total_memory: 0,
-            free_memory: 0,
-            _used_memory_by_unit: HashMap::default(),
+        use crate::system::gpu;
+
+        // Create GPU discovery backend (Apple Metal on macOS)
+        let gpu_discovery = gpu::create_gpu_discovery();
+
+        if let Some(discovery) = gpu_discovery {
+            // Detect GPU devices
+            match discovery.detect_devices() {
+                Ok(devices) => {
+                    let count = devices.len() as u32;
+                    let total_memory: u64 = devices.iter().map(|d| d.memory_bytes).sum();
+
+                    // Note: free_memory calculation would require querying each device
+                    // For now, we'll set it to 0 and let the detailed gpu_devices provide the info
+                    MachineGpuStats {
+                        count,
+                        total_memory,
+                        free_memory: 0, // Legacy field, use gpu_devices for detailed info
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: devices,
+                    }
+                }
+                Err(e) => {
+                    warn!("Failed to detect GPU devices: {}", e);
+                    MachineGpuStats {
+                        count: 0,
+                        total_memory: 0,
+                        free_memory: 0,
+                        _used_memory_by_unit: HashMap::default(),
+                        gpu_devices: Vec::new(),
+                    }
+                }
+            }
+        } else {
+            // No GPU discovery available for this platform
+            MachineGpuStats {
+                count: 0,
+                total_memory: 0,
+                free_memory: 0,
+                _used_memory_by_unit: HashMap::default(),
+                gpu_devices: Vec::new(),
+            }
         }
     }
diff --git a/rust/crates/rqd/src/system/manager.rs b/rust/crates/rqd/src/system/manager.rs
index 23087dbc6..69765ebfa 100644
--- a/rust/crates/rqd/src/system/manager.rs
+++ b/rust/crates/rqd/src/system/manager.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 
 use miette::{Diagnostic, Result};
-use opencue_proto::{host::HardwareState, report::ChildrenProcStats};
+use opencue_proto::{host::{GpuDevice, GpuUsage, HardwareState}, report::ChildrenProcStats};
 use thiserror::Error;
 use tracing::error;
 use uuid::Uuid;
@@ -99,6 +99,8 @@ pub struct MachineGpuStats {
     pub free_memory: u64,
     /// Used memory by unit of each GPU, where the key in the HashMap is the unit ID, and the value is the used memory
     pub _used_memory_by_unit: HashMap<u32, u64>,
+    /// Detailed GPU device inventory
+    pub gpu_devices: Vec<GpuDevice>,
 }
 
 /// Tracks memory and runtime statistics for a rendering process and its children.
@@ -118,6 +120,8 @@ pub struct ProcessStats {
     pub max_used_gpu_memory: u64,
     /// Current GPU memory usage (KB).
     pub used_gpu_memory: u64,
+    /// Per-device GPU usage statistics
+    pub gpu_usage: Vec<GpuUsage>,
     /// Additional data about the running frame's child processes.
     pub children: Option<ChildrenProcStats>,
     /// Unix timestamp denoting the start time of the frame process.
@@ -136,6 +140,7 @@ impl Default for ProcessStats {
             llu_time: 0,
             max_used_gpu_memory: 0,
             used_gpu_memory: 0,
+            gpu_usage: Vec::new(),
             children: None,
             epoch_start_time: std::time::SystemTime::now()
                 .duration_since(std::time::UNIX_EPOCH)
@@ -157,6 +162,7 @@ impl ProcessStats {
             vsize: new.vsize,
             llu_time: new.llu_time,
             used_gpu_memory: new.used_gpu_memory,
+            gpu_usage: new.gpu_usage,
             children: new.children,
             epoch_start_time: new.epoch_start_time,
         };
diff --git a/rust/crates/rqd/src/system/mod.rs b/rust/crates/rqd/src/system/mod.rs
index add1f2792..8de52bec6 100644
--- a/rust/crates/rqd/src/system/mod.rs
+++ b/rust/crates/rqd/src/system/mod.rs
@@ -1,5 +1,6 @@
 use uuid::Uuid;
 
+pub mod gpu;
 pub mod linux;
 pub mod machine;
 #[cfg(feature = "nimby")]
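Illustrative sketch (not part of the patch): the new Layer fields (gpu_vendor, gpu_models_allowed, min_gpu_memory_bytes) and Host.gpu_devices are only proto plumbing in this diff; scheduler-side matching is not included here. A hypothetical check that dispatch logic could perform against the regenerated protos might look like this (a sketch, not Cuebot's actual algorithm):

    def host_satisfies_layer_gpu(host, layer):
        """Return True if any GPU on the host matches the layer's GPU filters.

        `host` and `layer` are host_pb2.Host / job_pb2.Layer messages; the
        matching policy below is illustrative only.
        """
        if not (layer.gpu_vendor or layer.gpu_models_allowed or layer.min_gpu_memory_bytes):
            return True  # no GPU constraints on this layer
        for dev in host.gpu_devices:
            if layer.gpu_vendor and dev.vendor != layer.gpu_vendor:
                continue
            if layer.gpu_models_allowed and dev.model not in layer.gpu_models_allowed:
                continue
            if layer.min_gpu_memory_bytes and dev.memory_bytes < layer.min_gpu_memory_bytes:
                continue
            return True
        return False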