Skip to content

Commit ccf4b58

Browse files
committed
Turn some frequent DGX errors into warnings
1 parent 5a67814 commit ccf4b58

File tree

1 file changed

+35
-9
lines changed

1 file changed

+35
-9
lines changed

nvidia.go

Lines changed: 35 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,12 @@ func NewNvidiaProducer() (*NvidiaProducer, error) {
7373
}
7474
powerLimit, ret := nvml.DeviceGetPowerManagementLimit(device)
7575
if !errors.Is(ret, nvml.SUCCESS) {
76-
return nil, fmt.Errorf("failed to get power limit for Nvidia device %d: %s", i, nvml.ErrorString(ret))
76+
// Not supported on DGX
77+
if errors.Is(ret, nvml.ERROR_NOT_SUPPORTED) {
78+
slog.Warn("power limit not supported", "device", i, "err", nvml.ErrorString(ret))
79+
} else {
80+
return nil, fmt.Errorf("failed to get power limit for Nvidia device %d: %s", i, nvml.ErrorString(ret))
81+
}
7782
}
7883

7984
devices[i] = &perDeviceState{
@@ -369,7 +374,11 @@ func (ds *perDeviceState) collectUtilization() error {
369374

370375
sampleType, samples, ret := ds.d.GetSamples(nvml.GPU_UTILIZATION_SAMPLES, maxTimestamp)
371376
if !errors.Is(ret, nvml.SUCCESS) {
372-
return ret
377+
if errors.Is(ret, nvml.ERROR_NOT_FOUND) {
378+
slog.Warn("get GPU_UTILIZATION_SAMPLES returned not found", "err", ret)
379+
return nil
380+
}
381+
return fmt.Errorf("failed to get GPU_UTILIZATION_SAMPLES: %w", ret)
373382
}
374383
getValue, err := valueGetter(sampleType)
375384
if err != nil {
@@ -412,7 +421,11 @@ func (ds *perDeviceState) collectMemoryUtilization() error {
412421

413422
sampleType, samples, ret := ds.d.GetSamples(nvml.MEMORY_UTILIZATION_SAMPLES, maxTimestamp)
414423
if !errors.Is(ret, nvml.SUCCESS) {
415-
return ret
424+
if errors.Is(ret, nvml.ERROR_NOT_FOUND) {
425+
slog.Warn("get MEMORY_UTILIZATION_SAMPLES not found", "err", ret)
426+
return nil
427+
}
428+
return fmt.Errorf("get MEMORY_UTILIZATION_SAMPLES failed %w", ret)
416429
}
417430
getValue, err := valueGetter(sampleType)
418431
if err != nil {
@@ -481,7 +494,7 @@ func (ds *perDeviceState) collectProcessUtilization() error {
481494
utilization, ret := ds.d.GetProcessUtilization(uint64(process.Pid))
482495
if !errors.Is(ret, nvml.SUCCESS) {
483496
// If the process is not found or NVML has no data for it (likely terminated), skip it and continue
484-
if errors.Is(ret, nvml.ERROR_NOT_FOUND) {
497+
if errors.Is(ret, nvml.ERROR_NOT_FOUND) || errors.Is(ret, nvml.ERROR_NO_DATA) {
485498
slog.Debug("process not found, skipping", "pid", process.Pid, "error", nvml.ErrorString(ret))
486499
continue
487500
}
@@ -491,7 +504,7 @@ func (ds *perDeviceState) collectProcessUtilization() error {
491504
processName, ret := nvml.SystemGetProcessName(int(process.Pid)) // could easily be cached
492505
if !errors.Is(ret, nvml.SUCCESS) {
493506
// If the process is not found or NVML has no data for it (likely terminated), skip it and continue
494-
if errors.Is(ret, nvml.ERROR_NOT_FOUND) {
507+
if errors.Is(ret, nvml.ERROR_NOT_FOUND) || errors.Is(ret, nvml.ERROR_NO_DATA) {
495508
slog.Debug("process not found, skipping", "pid", process.Pid, "error", nvml.ErrorString(ret))
496509
continue
497510
}
@@ -540,7 +553,12 @@ func (ds *perDeviceState) collectClock() error {
540553
ts := time.Now()
541554
clock, ret := nvml.DeviceGetClockInfo(ds.d, clockType)
542555
if !errors.Is(ret, nvml.SUCCESS) {
543-
return fmt.Errorf("failed to get clock for %d %s: %s", ds.index, clockName, nvml.ErrorString(ret))
556+
// Allow NOT_SUPPORTED for DGX
557+
if errors.Is(ret, nvml.ERROR_NOT_SUPPORTED) {
558+
slog.Warn("clock not found", "device", ds.index, "clock", clockName, "err", ret)
559+
} else {
560+
return fmt.Errorf("failed to get clock for %d %s: %s", ds.index, clockName, nvml.ErrorString(ret))
561+
}
544562
}
545563
clock *= 1e6 // MHz to Hertz
546564

@@ -565,7 +583,11 @@ func (ds *perDeviceState) collectPowerConsumption() error {
565583

566584
sampleType, samples, ret := ds.d.GetSamples(nvml.TOTAL_POWER_SAMPLES, maxTimestamp)
567585
if !errors.Is(ret, nvml.SUCCESS) {
568-
return ret
586+
if errors.Is(ret, nvml.ERROR_NOT_FOUND) {
587+
slog.Warn("get TOTAL_POWER_SAMPLES returned not found", "err", ret)
588+
return nil
589+
}
590+
return fmt.Errorf("GetSamples failed %v", ret)
569591
}
570592
getValue, err := valueGetter(sampleType)
571593
if err != nil {
@@ -608,7 +630,7 @@ func (ds *perDeviceState) collectTemperature() error {
608630
ts := time.Now()
609631
temp, ret := ds.d.GetTemperature(nvml.TEMPERATURE_GPU)
610632
if !errors.Is(ret, nvml.SUCCESS) {
611-
return fmt.Errorf("failed to get temperaturefor %d: %s", ds.index, nvml.ErrorString(ret))
633+
return fmt.Errorf("failed to get temperature for %d: %s", ds.index, nvml.ErrorString(ret))
612634
}
613635

614636
g := pmetric.NewGauge()
@@ -635,7 +657,11 @@ func (ds *perDeviceState) collectPCIThroughput() error {
635657

636658
tp, ret := ds.d.GetPcieThroughput(counter)
637659
if !errors.Is(ret, nvml.SUCCESS) {
638-
return fmt.Errorf("failed to get PCIe throughput for %d %d: %s", ds.index, counter, nvml.ErrorString(ret))
660+
if errors.Is(ret, nvml.ERROR_NOT_SUPPORTED) {
661+
slog.Warn("failed to get PCIe throughput", "device", ds.index, "counter", counter, "err", ret)
662+
} else {
663+
return fmt.Errorf("failed to get PCIe throughput for %d %d: %s", ds.index, counter, nvml.ErrorString(ret))
664+
}
639665
}
640666

641667
var metricName string

0 commit comments

Comments
 (0)