@@ -16,10 +16,11 @@ import (
1616)
1717
1818const (
19- visibleDevicesBC = "RUNAI-VISIBLE-DEVICES" // Deprecated, this value was replaced with NVIDIA_VISIBLE_DEVICES
19+ visibleDevicesBC = "RUNAI-VISIBLE-DEVICES" // Deprecated, this value was replaced with NVIDIA_VISIBLE_DEVICES
20+ NumOfGpusEnvVarBC = "RUNAI_NUM_OF_GPUS" // Deprecated, please use GPU_PORTION env var instead
2021)
2122
22- func AddVisibleDevicesEnvVars (container * v1.Container , sharedGpuConfigMapName string ) {
23+ func AddGPUSharingEnvVars (container * v1.Container , sharedGpuConfigMapName string ) {
2324 AddEnvVarToContainer (container , v1.EnvVar {
2425 Name : NvidiaVisibleDevices ,
2526 ValueFrom : & v1.EnvVarSource {
@@ -33,10 +34,22 @@ func AddVisibleDevicesEnvVars(container *v1.Container, sharedGpuConfigMapName st
3334 })
3435
3536 AddEnvVarToContainer (container , v1.EnvVar {
36- Name : NumOfGpusEnvVar ,
37+ Name : NumOfGpusEnvVarBC ,
3738 ValueFrom : & v1.EnvVarSource {
3839 ConfigMapKeyRef : & v1.ConfigMapKeySelector {
39- Key : NumOfGpusEnvVar ,
40+ Key : NumOfGpusEnvVarBC ,
41+ LocalObjectReference : v1.LocalObjectReference {
42+ Name : sharedGpuConfigMapName ,
43+ },
44+ },
45+ },
46+ })
47+
48+ AddEnvVarToContainer (container , v1.EnvVar {
49+ Name : GPUPortion ,
50+ ValueFrom : & v1.EnvVarSource {
51+ ConfigMapKeyRef : & v1.ConfigMapKeySelector {
52+ Key : GPUPortion ,
4053 LocalObjectReference : v1.LocalObjectReference {
4154 Name : sharedGpuConfigMapName ,
4255 },
@@ -69,6 +82,8 @@ func SetNvidiaVisibleDevices(
6982 return err
7083 }
7184 updateFunc = func (data map [string ]string ) error {
85+ // Backward compatibility (BC): some pods were created with the NVIDIA_VISIBLE_DEVICES env var
86+ // taking its value from the RUNAI-VISIBLE-DEVICES entry in the GPU sharing configmap, so update that entry when it is present
7287 if _ , found := data [visibleDevicesBC ]; found {
7388 data [visibleDevicesBC ] = visibleDevicesValue
7489 }
@@ -83,12 +98,13 @@ func SetNvidiaVisibleDevices(
8398 return nil
8499}
85100
86- func SetNumOfGPUDevices (
101+ func SetGPUPortion (
87102 ctx context.Context , kubeClient client.Client , pod * v1.Pod , containerRef * gpusharingconfigmap.PodContainerRef ,
88- numOfGPUs string ,
103+ gpuPortionStr string ,
89104) error {
90105 updateFunc := func (data map [string ]string ) error {
91- data [NumOfGpusEnvVar ] = numOfGPUs
106+ data [NumOfGpusEnvVarBC ] = gpuPortionStr
107+ data [GPUPortion ] = gpuPortionStr
92108 return nil
93109 }
94110 capabilitiesMapName , err := gpusharingconfigmap .ExtractCapabilitiesConfigMapName (pod , containerRef )
@@ -98,7 +114,7 @@ func SetNumOfGPUDevices(
98114
99115 err = UpdateConfigMapEnvironmentVariable (ctx , kubeClient , pod , capabilitiesMapName , updateFunc )
100116 if err != nil {
101- return fmt .Errorf ("failed to update gpu sharing configmap for pod <%s/%s>: %v" ,
117+ return fmt .Errorf ("failed to update GPU_PORTION value in gpu sharing configmap for pod <%s/%s>: %v" ,
102118 pod .Namespace , pod .Name , err )
103119 }
104120 return nil
0 commit comments