Skip to content

Commit b4f0ad9

Browse files
BUGFIX: modifier: respect GPU volume-mount device requests
The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported. This patch takes visibleDevicesFromEnvVar private, making VisibleDevices the only exported method to query valid devices. And edits the gated modifiers to use this func, ensuring device requests via mounts are also taken into acount. Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 6fc6f2c commit b4f0ad9

File tree

7 files changed

+313
-35
lines changed

7 files changed

+313
-35
lines changed

internal/config/image/cuda_image.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) {
212212
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
213213
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
214214
var hasCDIdevice bool
215-
for _, device := range i.VisibleDevicesFromEnvVar() {
215+
for _, device := range i.visibleDevicesFromEnvVar() {
216216
if !parser.IsQualifiedName(device) {
217217
return false
218218
}
@@ -258,7 +258,7 @@ func (i CUDA) VisibleDevices() []string {
258258
}
259259

260260
// Get the Fallback to reading from the environment variable if privileges are correct
261-
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
261+
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
262262
if len(envVarDeviceRequests) == 0 {
263263
return nil
264264
}
@@ -278,11 +278,11 @@ func (i CUDA) VisibleDevices() []string {
278278
return nil
279279
}
280280

281-
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
281+
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
282282
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
283283
// are used to determine the visible devices. If this is not the case, the
284284
// NVIDIA_VISIBLE_DEVICES environment variable is used.
285-
func (i CUDA) VisibleDevicesFromEnvVar() []string {
285+
func (i CUDA) visibleDevicesFromEnvVar() []string {
286286
envVars := i.visibleEnvVars()
287287
return i.DevicesFromEnvvars(envVars...).List()
288288
}

internal/config/image/cuda_image_test.go

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
429429
)
430430

431431
require.NoError(t, err)
432-
devices := image.VisibleDevicesFromEnvVar()
432+
devices := image.visibleDevicesFromEnvVar()
433433
require.EqualValues(t, tc.expectedDevices, devices)
434434
})
435435
}
@@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
508508

509509
func TestVisibleDevices(t *testing.T) {
510510
var tests = []struct {
511-
description string
512-
mountDevices []specs.Mount
513-
envvarDevices string
514-
privileged bool
515-
acceptUnprivileged bool
516-
acceptMounts bool
517-
expectedDevices []string
511+
description string
512+
mountDevices []specs.Mount
513+
envvarDevices string
514+
privileged bool
515+
acceptUnprivileged bool
516+
acceptMounts bool
517+
preferredVisibleDeviceEnvVars []string
518+
env map[string]string
519+
expectedDevices []string
518520
}{
519521
{
520522
description: "Mount devices, unprivileged, no accept unprivileged",
@@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
597599
acceptMounts: false,
598600
expectedDevices: nil,
599601
},
602+
// New test cases for visibleEnvVars functionality
603+
{
604+
description: "preferred env var set and present in env, privileged",
605+
mountDevices: nil,
606+
envvarDevices: "",
607+
privileged: true,
608+
acceptUnprivileged: false,
609+
acceptMounts: true,
610+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
611+
env: map[string]string{
612+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
613+
},
614+
expectedDevices: []string{"GPU-12345"},
615+
},
616+
{
617+
description: "preferred env var set and present in env, unprivileged but accepted",
618+
mountDevices: nil,
619+
envvarDevices: "",
620+
privileged: false,
621+
acceptUnprivileged: true,
622+
acceptMounts: true,
623+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
624+
env: map[string]string{
625+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
626+
},
627+
expectedDevices: []string{"GPU-12345"},
628+
},
629+
{
630+
description: "preferred env var set and present in env, unprivileged and not accepted",
631+
mountDevices: nil,
632+
envvarDevices: "",
633+
privileged: false,
634+
acceptUnprivileged: false,
635+
acceptMounts: true,
636+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
637+
env: map[string]string{
638+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
639+
},
640+
expectedDevices: nil,
641+
},
642+
{
643+
description: "multiple preferred env vars, both present, privileged",
644+
mountDevices: nil,
645+
envvarDevices: "",
646+
privileged: true,
647+
acceptUnprivileged: false,
648+
acceptMounts: true,
649+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
650+
env: map[string]string{
651+
"DOCKER_RESOURCE_GPUS": "GPU-12345",
652+
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
653+
},
654+
expectedDevices: []string{"GPU-12345", "GPU-67890"},
655+
},
656+
{
657+
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
658+
mountDevices: nil,
659+
envvarDevices: "GPU-12345",
660+
privileged: true,
661+
acceptUnprivileged: false,
662+
acceptMounts: true,
663+
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
664+
env: map[string]string{
665+
EnvVarNvidiaVisibleDevices: "GPU-12345",
666+
},
667+
expectedDevices: []string{"GPU-12345"},
668+
},
600669
}
601670
for _, tc := range tests {
602671
t.Run(tc.description, func(t *testing.T) {
603-
// Wrap the call to getDevices() in a closure.
672+
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
673+
env := make(map[string]string)
674+
if tc.envvarDevices != "" {
675+
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
676+
}
677+
for k, v := range tc.env {
678+
env[k] = v
679+
}
680+
604681
image, err := New(
605-
WithEnvMap(
606-
map[string]string{
607-
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
608-
},
609-
),
682+
WithEnvMap(env),
610683
WithMounts(tc.mountDevices),
611684
WithPrivileged(tc.privileged),
612685
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
613686
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
687+
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
614688
)
615689
require.NoError(t, err)
616690
require.Equal(t, tc.expectedDevices, image.VisibleDevices())

internal/modifier/cdi.go

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
8686
if err != nil {
8787
return nil, err
8888
}
89-
if cfg.AcceptDeviceListAsVolumeMounts {
90-
mountDevices := container.CDIDevicesFromMounts()
91-
if len(mountDevices) > 0 {
92-
return mountDevices, nil
93-
}
94-
}
9589

9690
var devices []string
9791
seen := make(map[string]bool)
98-
for _, name := range container.VisibleDevicesFromEnvVar() {
92+
for _, name := range container.VisibleDevices() {
93+
// Skip empty device names
94+
if strings.TrimSpace(name) == "" {
95+
continue
96+
}
97+
9998
if !parser.IsQualifiedName(name) {
10099
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
101100
}
101+
102+
// Skip devices that result in empty device IDs after qualification
103+
if strings.HasSuffix(name, "=") {
104+
logger.Debugf("Ignoring device with empty ID: %q", name)
105+
continue
106+
}
107+
102108
if seen[name] {
103109
logger.Debugf("Ignoring duplicate device %q", name)
104110
continue
105111
}
106112
devices = append(devices, name)
113+
seen[name] = true
107114
}
108115

109116
if len(devices) == 0 {
110117
return nil, nil
111118
}
112119

113-
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
114-
return devices, nil
115-
}
116-
117-
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
118-
119-
return nil, nil
120+
return devices, nil
120121
}
121122

122123
// getAnnotationDevices returns a list of devices specified in the annotations.

0 commit comments

Comments
 (0)