@@ -73,7 +73,12 @@ func NewNvidiaProducer() (*NvidiaProducer, error) {
7373 }
7474 powerLimit , ret := nvml .DeviceGetPowerManagementLimit (device )
7575 if ! errors .Is (ret , nvml .SUCCESS ) {
76- return nil , fmt .Errorf ("failed to get power limit for Nvidia device %d: %s" , i , nvml .ErrorString (ret ))
76+ // The power limit query returns ERROR_NOT_SUPPORTED on DGX; warn and keep going instead of failing.
77+ if errors .Is (ret , nvml .ERROR_NOT_SUPPORTED ) {
78+ slog .Warn ("power limit not supported" , "device" , i , "err" , nvml .ErrorString (ret ))
79+ } else {
80+ return nil , fmt .Errorf ("failed to get power limit for Nvidia device %d: %s" , i , nvml .ErrorString (ret ))
81+ }
7782 }
7883
7984 devices [i ] = & perDeviceState {
@@ -369,7 +374,11 @@ func (ds *perDeviceState) collectUtilization() error {
369374
370375 sampleType , samples , ret := ds .d .GetSamples (nvml .GPU_UTILIZATION_SAMPLES , maxTimestamp )
371376 if ! errors .Is (ret , nvml .SUCCESS ) {
372- return ret
377+ if errors .Is (ret , nvml .ERROR_NOT_FOUND ) {
378+ slog .Warn ("get GPU_UTILIZATION_SAMPLES returned not found" , "err" , ret )
379+ return nil
380+ }
381+ return fmt .Errorf ("failed to get GPU_UTILIZATION_SAMPLES: %w" , ret )
373382 }
374383 getValue , err := valueGetter (sampleType )
375384 if err != nil {
@@ -412,7 +421,11 @@ func (ds *perDeviceState) collectMemoryUtilization() error {
412421
413422 sampleType , samples , ret := ds .d .GetSamples (nvml .MEMORY_UTILIZATION_SAMPLES , maxTimestamp )
414423 if ! errors .Is (ret , nvml .SUCCESS ) {
415- return ret
424+ if errors .Is (ret , nvml .ERROR_NOT_FOUND ) {
425+ slog .Warn ("get MEMORY_UTILIZATION_SAMPLES not found" , "err" , ret )
426+ return nil
427+ }
428+ return fmt .Errorf ("get MEMORY_UTILIZATION_SAMPLES failed %w" , ret )
416429 }
417430 getValue , err := valueGetter (sampleType )
418431 if err != nil {
@@ -481,7 +494,7 @@ func (ds *perDeviceState) collectProcessUtilization() error {
481494 utilization , ret := ds .d .GetProcessUtilization (uint64 (process .Pid ))
482495 if ! errors .Is (ret , nvml .SUCCESS ) {
483496 // If the process is not found (likely terminated), skip it and continue
484- if errors .Is (ret , nvml .ERROR_NOT_FOUND ) {
497+ if errors .Is (ret , nvml .ERROR_NOT_FOUND ) || errors . Is ( ret , nvml . ERROR_NO_DATA ) {
485498 slog .Debug ("process not found, skipping" , "pid" , process .Pid , "error" , nvml .ErrorString (ret ))
486499 continue
487500 }
@@ -491,7 +504,7 @@ func (ds *perDeviceState) collectProcessUtilization() error {
491504 processName , ret := nvml .SystemGetProcessName (int (process .Pid )) // could easily be cached
492505 if ! errors .Is (ret , nvml .SUCCESS ) {
493506 // If the process is not found (likely terminated), skip it and continue
494- if errors .Is (ret , nvml .ERROR_NOT_FOUND ) {
507+ if errors .Is (ret , nvml .ERROR_NOT_FOUND ) || errors . Is ( ret , nvml . ERROR_NO_DATA ) {
495508 slog .Debug ("process not found, skipping" , "pid" , process .Pid , "error" , nvml .ErrorString (ret ))
496509 continue
497510 }
@@ -540,7 +553,12 @@ func (ds *perDeviceState) collectClock() error {
540553 ts := time .Now ()
541554 clock , ret := nvml .DeviceGetClockInfo (ds .d , clockType )
542555 if ! errors .Is (ret , nvml .SUCCESS ) {
543- return fmt .Errorf ("failed to get clock for %d %s: %s" , ds .index , clockName , nvml .ErrorString (ret ))
556+ // DGX may return ERROR_NOT_SUPPORTED for this clock; warn and continue instead of failing.
557+ if errors .Is (ret , nvml .ERROR_NOT_SUPPORTED ) {
558+ slog .Warn ("clock not found" , "device" , ds .index , "clock" , clockName , "err" , ret )
559+ } else {
560+ return fmt .Errorf ("failed to get clock for %d %s: %s" , ds .index , clockName , nvml .ErrorString (ret ))
561+ }
544562 }
545563 clock *= 1e6 // MHz to Hertz
546564
@@ -565,7 +583,11 @@ func (ds *perDeviceState) collectPowerConsumption() error {
565583
566584 sampleType , samples , ret := ds .d .GetSamples (nvml .TOTAL_POWER_SAMPLES , maxTimestamp )
567585 if ! errors .Is (ret , nvml .SUCCESS ) {
568- return ret
586+ if errors .Is (ret , nvml .ERROR_NOT_FOUND ) {
587+ slog .Warn ("get TOTAL_POWER_SAMPLES returned not found" , "err" , ret )
588+ return nil
589+ }
590+ return fmt .Errorf ("GetSamples failed %v" , ret )
569591 }
570592 getValue , err := valueGetter (sampleType )
571593 if err != nil {
@@ -608,7 +630,7 @@ func (ds *perDeviceState) collectTemperature() error {
608630 ts := time .Now ()
609631 temp , ret := ds .d .GetTemperature (nvml .TEMPERATURE_GPU )
610632 if ! errors .Is (ret , nvml .SUCCESS ) {
611- return fmt .Errorf ("failed to get temperaturefor %d: %s" , ds .index , nvml .ErrorString (ret ))
633+ return fmt .Errorf ("failed to get temperature for %d: %s" , ds .index , nvml .ErrorString (ret ))
612634 }
613635
614636 g := pmetric .NewGauge ()
@@ -635,7 +657,11 @@ func (ds *perDeviceState) collectPCIThroughput() error {
635657
636658 tp , ret := ds .d .GetPcieThroughput (counter )
637659 if ! errors .Is (ret , nvml .SUCCESS ) {
638- return fmt .Errorf ("failed to get PCIe throughput for %d %d: %s" , ds .index , counter , nvml .ErrorString (ret ))
660+ if errors .Is (ret , nvml .ERROR_NOT_SUPPORTED ) {
661+ slog .Warn ("failed to get PCIe throughput" , "device" , ds .index , "counter" , counter , "err" , ret )
662+ } else {
663+ return fmt .Errorf ("failed to get PCIe throughput for %d %d: %s" , ds .index , counter , nvml .ErrorString (ret ))
664+ }
639665 }
640666
641667 var metricName string
0 commit comments