Skip to content

Commit 2a92d6a

Browse files
committed
Fix bug where docker swarm device selection is overriden by NVIDIA_VISIBLE_DEVICES
This change fixes a bug where the value of NVIDIA_VISIBLE_DEVICES would be used to select devices even if the `swarm-resource` config option is specified. Note that this does not change the value of NVIDIA_VISIBLE_DEVICES in the container. Signed-off-by: Evan Lezar <[email protected]>
1 parent 602eaf0 commit 2a92d6a

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed

pkg/container_config.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
216216
for _, envVar := range envVars {
217217
if devs, ok := env[envVar]; ok {
218218
devices = &devs
219+
break
219220
}
220221
}
221222

pkg/container_test.go

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,3 +632,193 @@ func TestDeviceListSourcePriority(t *testing.T) {
632632
})
633633
}
634634
}
635+
636+
func TestGetDevicesFromEnvvar(t *testing.T) {
637+
all := "all"
638+
empty := ""
639+
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
640+
gpuID := "GPU-12345"
641+
anotherGPUID := "GPU-67890"
642+
643+
var tests = []struct {
644+
description string
645+
envSwarmGPU *string
646+
env map[string]string
647+
legacyImage bool
648+
expectedDevices *string
649+
}{
650+
{
651+
description: "empty env returns nil for non-legacy image",
652+
},
653+
{
654+
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
655+
env: map[string]string{
656+
envNVVisibleDevices: "",
657+
},
658+
},
659+
{
660+
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
661+
env: map[string]string{
662+
envNVVisibleDevices: "void",
663+
},
664+
},
665+
{
666+
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
667+
env: map[string]string{
668+
envNVVisibleDevices: "none",
669+
},
670+
expectedDevices: &empty,
671+
},
672+
{
673+
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
674+
env: map[string]string{
675+
envNVVisibleDevices: gpuID,
676+
},
677+
expectedDevices: &gpuID,
678+
},
679+
{
680+
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
681+
env: map[string]string{
682+
envNVVisibleDevices: gpuID,
683+
},
684+
legacyImage: true,
685+
expectedDevices: &gpuID,
686+
},
687+
{
688+
description: "empty env returns all for legacy image",
689+
legacyImage: true,
690+
expectedDevices: &all,
691+
},
692+
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
693+
// not enabled
694+
{
695+
description: "missing NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
696+
env: map[string]string{
697+
envDockerResourceGPUs: anotherGPUID,
698+
},
699+
},
700+
{
701+
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
702+
env: map[string]string{
703+
envNVVisibleDevices: "",
704+
envDockerResourceGPUs: anotherGPUID,
705+
},
706+
},
707+
{
708+
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
709+
env: map[string]string{
710+
envNVVisibleDevices: "void",
711+
envDockerResourceGPUs: anotherGPUID,
712+
},
713+
},
714+
{
715+
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
716+
env: map[string]string{
717+
envNVVisibleDevices: "none",
718+
envDockerResourceGPUs: anotherGPUID,
719+
},
720+
expectedDevices: &empty,
721+
},
722+
{
723+
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
724+
env: map[string]string{
725+
envNVVisibleDevices: gpuID,
726+
envDockerResourceGPUs: anotherGPUID,
727+
},
728+
expectedDevices: &gpuID,
729+
},
730+
{
731+
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
732+
env: map[string]string{
733+
envNVVisibleDevices: gpuID,
734+
envDockerResourceGPUs: anotherGPUID,
735+
},
736+
legacyImage: true,
737+
expectedDevices: &gpuID,
738+
},
739+
{
740+
description: "empty env returns all for legacy image",
741+
env: map[string]string{
742+
envDockerResourceGPUs: anotherGPUID,
743+
},
744+
legacyImage: true,
745+
expectedDevices: &all,
746+
},
747+
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
748+
// enabled
749+
{
750+
description: "empty env returns nil for non-legacy image",
751+
envSwarmGPU: &envDockerResourceGPUs,
752+
},
753+
{
754+
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
755+
envSwarmGPU: &envDockerResourceGPUs,
756+
env: map[string]string{
757+
envDockerResourceGPUs: "",
758+
},
759+
},
760+
{
761+
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
762+
envSwarmGPU: &envDockerResourceGPUs,
763+
env: map[string]string{
764+
envDockerResourceGPUs: "void",
765+
},
766+
},
767+
{
768+
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
769+
envSwarmGPU: &envDockerResourceGPUs,
770+
env: map[string]string{
771+
envDockerResourceGPUs: "none",
772+
},
773+
expectedDevices: &empty,
774+
},
775+
{
776+
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
777+
envSwarmGPU: &envDockerResourceGPUs,
778+
env: map[string]string{
779+
envDockerResourceGPUs: gpuID,
780+
},
781+
expectedDevices: &gpuID,
782+
},
783+
{
784+
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
785+
envSwarmGPU: &envDockerResourceGPUs,
786+
env: map[string]string{
787+
envDockerResourceGPUs: gpuID,
788+
},
789+
legacyImage: true,
790+
expectedDevices: &gpuID,
791+
},
792+
{
793+
description: "DOCKER_RESOURCE_GPUS is selected if present",
794+
envSwarmGPU: &envDockerResourceGPUs,
795+
env: map[string]string{
796+
envDockerResourceGPUs: anotherGPUID,
797+
},
798+
expectedDevices: &anotherGPUID,
799+
},
800+
{
801+
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
802+
envSwarmGPU: &envDockerResourceGPUs,
803+
env: map[string]string{
804+
envNVVisibleDevices: gpuID,
805+
envDockerResourceGPUs: anotherGPUID,
806+
},
807+
expectedDevices: &anotherGPUID,
808+
},
809+
}
810+
811+
for i, tc := range tests {
812+
t.Run(tc.description, func(t *testing.T) {
813+
envSwarmGPU = tc.envSwarmGPU
814+
devices := getDevicesFromEnvvar(tc.env, tc.legacyImage)
815+
if tc.expectedDevices == nil {
816+
require.Nil(t, devices, "%d: %v", i, tc)
817+
return
818+
}
819+
820+
require.NotNil(t, devices, "%d: %v", i, tc)
821+
require.Equal(t, *tc.expectedDevices, *devices, "%d: %v", i, tc)
822+
})
823+
}
824+
}

0 commit comments

Comments
 (0)