Skip to content

Commit c5e473c

Browse files
committed
Ignore errors getting device memory using NVML
On certain systems, the NVML nvmlDeviceGetMemoryInformation API is not supported and returns an error. In these cases we ignore these errors and log a warning instead. This means that:
* For the GPU Device Plugin, memory limits will be enforced for MPS partitioning.
* For GFD, no nvidia.com/gpu.memory label will be generated.
Signed-off-by: Evan Lezar <[email protected]>
1 parent 82d5d24 commit c5e473c

File tree

2 files changed

+5
-2
lines changed

2 files changed

+5
-2
lines changed

internal/lm/resource.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import (
2121
"regexp"
2222
"strings"
2323

24+
"k8s.io/klog/v2"
25+
2426
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
2527
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
2628
)
@@ -46,7 +48,7 @@ func NewGPUResourceLabeler(config *spec.Config, device resource.Device, count in
4648

4749
totalMemoryMiB, err := device.GetTotalMemoryMiB()
4850
if err != nil {
49-
return nil, fmt.Errorf("failed to get memory info for device: %v", err)
51+
klog.Warningf("Ignoring error getting memory info for device: %v", err)
5052
}
5153

5254
resourceLabeler := newResourceLabeler(fullGPUResourceName, config)

internal/rm/devices.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"strconv"
2222
"strings"
2323

24+
"k8s.io/klog/v2"
2425
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
2526
)
2627

@@ -73,7 +74,7 @@ func BuildDevice(index string, d deviceInfo) (*Device, error) {
7374

7475
totalMemory, err := d.GetTotalMemory()
7576
if err != nil {
76-
return nil, fmt.Errorf("error getting device memory: %w", err)
77+
klog.Warningf("Ignoring error getting device memory: %v", err)
7778
}
7879

7980
computeCapability, err := d.GetComputeCapability()

0 commit comments

Comments
 (0)