Skip to content

Commit c23fd82

Browse files
authored
Added GPU_PORTION env var for GPU sharing pods (#312)
Added GPU_PORTION env var for GPU sharing pods
1 parent 77358e0 commit c23fd82

File tree

9 files changed

+41
-19
lines changed

9 files changed

+41
-19
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
66

77
## [Unreleased]
88

9+
### Added
10+
- Added GPU_PORTION env var for GPU sharing pods
11+
912
### Changed
1013
- Changed RUNAI-VISIBLE-DEVICES key in GPU sharing configmap to NVIDIA_VISIBLE_DEVICES
1114

deployments/crds/internal/scheduling.run.ai_bindrequests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ spec:
5656
type: integer
5757
portion:
5858
description: |-
59-
This is the portion size that the pod will receive from each connected gpu device
59+
This is the portion size that the pod will receive from each connected GPU device
6060
This is a serialized float that should be written as a decimal point number.
6161
type: string
6262
type: object

hack/update-client.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ kube::codegen::gen_client \
2626

2727
rm -f generate-dep.go && rm -r vendor && go mod tidy
2828

29-
changed_files=$(git diff --name-only | grep v1alpha2)
29+
changed_files=$(git diff --name-only | grep pkg/apis/client | grep v1alpha2)
3030
${SDK_HACK_DIR}/replace_headers.sh \
3131
${SDK_HACK_DIR}/boilerplate.go.txt \
3232
${changed_files}

pkg/apis/scheduling/v1alpha2/bindrequest_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ type ReceivedGPU struct {
3737
// Count is the number of GPU devices that were received
3838
Count int `json:"count,omitempty"`
3939

40-
// This is the portion size that the pod will receive from each connected gpu device
40+
// This is the portion size that the pod will receive from each connected GPU device
4141
// This is a serialized float that should be written as a decimal point number.
4242
Portion string `json:"portion,omitempty"`
4343
}

pkg/binder/binding/default_binder_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,10 @@ func TestBindApplyResourceReceivedType(t *testing.T) {
118118
},
119119
},
120120
{
121-
Name: common.NumOfGpusEnvVar,
121+
Name: common.GPUPortion,
122122
ValueFrom: &v1.EnvVarSource{
123123
ConfigMapKeyRef: &v1.ConfigMapKeySelector{
124-
Key: common.NumOfGpusEnvVar,
124+
Key: common.GPUPortion,
125125
LocalObjectReference: v1.LocalObjectReference{
126126
Name: "my-config-0",
127127
},
@@ -159,7 +159,10 @@ func TestBindApplyResourceReceivedType(t *testing.T) {
159159
SelectedNode: "my-node",
160160
ReceivedResourceType: common.ReceivedTypeFraction,
161161
SelectedGPUGroups: []string{"group1"},
162-
ReceivedGPU: &v1alpha2.ReceivedGPU{Count: 1, Portion: "1"},
162+
ReceivedGPU: &v1alpha2.ReceivedGPU{
163+
Count: 1,
164+
Portion: "1",
165+
},
163166
},
164167
}
165168

pkg/binder/binding/fraction_binder_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,8 @@ var _ = Describe("FractionBinder", func() {
217217
Fail(fmt.Sprintf("Failed to read configmap: %v", err))
218218
} else {
219219
Expect(configMap.Data[common.NvidiaVisibleDevices]).To(Equal(testData.gpuIndexByGroupIndex))
220-
Expect(configMap.Data[common.NumOfGpusEnvVar]).To(Equal("0.5"))
220+
Expect(configMap.Data[common.NumOfGpusEnvVarBC]).To(Equal("0.5"))
221+
Expect(configMap.Data[common.GPUPortion]).To(Equal("0.5"))
221222
}
222223
})
223224
}

pkg/binder/common/constants.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ package common
55

66
const (
77
NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
8-
NumOfGpusEnvVar = "RUNAI_NUM_OF_GPUS"
8+
GPUPortion = "GPU_PORTION"
99
ReceivedTypeFraction = "Fraction"
1010
ReceivedTypeRegular = "Regular"
1111
)

pkg/binder/common/gpu_access.go

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@ import (
1616
)
1717

1818
const (
19-
visibleDevicesBC = "RUNAI-VISIBLE-DEVICES" // Deprecated, this value was replaced with NVIDIA_VISIBLE_DEVICES
19+
visibleDevicesBC = "RUNAI-VISIBLE-DEVICES" // Deprecated, this value was replaced with NVIDIA_VISIBLE_DEVICES
20+
NumOfGpusEnvVarBC = "RUNAI_NUM_OF_GPUS" // Deprecated, please use GPU_PORTION env var instead
2021
)
2122

22-
func AddVisibleDevicesEnvVars(container *v1.Container, sharedGpuConfigMapName string) {
23+
func AddGPUSharingEnvVars(container *v1.Container, sharedGpuConfigMapName string) {
2324
AddEnvVarToContainer(container, v1.EnvVar{
2425
Name: NvidiaVisibleDevices,
2526
ValueFrom: &v1.EnvVarSource{
@@ -33,10 +34,22 @@ func AddVisibleDevicesEnvVars(container *v1.Container, sharedGpuConfigMapName st
3334
})
3435

3536
AddEnvVarToContainer(container, v1.EnvVar{
36-
Name: NumOfGpusEnvVar,
37+
Name: NumOfGpusEnvVarBC,
3738
ValueFrom: &v1.EnvVarSource{
3839
ConfigMapKeyRef: &v1.ConfigMapKeySelector{
39-
Key: NumOfGpusEnvVar,
40+
Key: NumOfGpusEnvVarBC,
41+
LocalObjectReference: v1.LocalObjectReference{
42+
Name: sharedGpuConfigMapName,
43+
},
44+
},
45+
},
46+
})
47+
48+
AddEnvVarToContainer(container, v1.EnvVar{
49+
Name: GPUPortion,
50+
ValueFrom: &v1.EnvVarSource{
51+
ConfigMapKeyRef: &v1.ConfigMapKeySelector{
52+
Key: GPUPortion,
4053
LocalObjectReference: v1.LocalObjectReference{
4154
Name: sharedGpuConfigMapName,
4255
},
@@ -69,6 +82,8 @@ func SetNvidiaVisibleDevices(
6982
return err
7083
}
7184
updateFunc = func(data map[string]string) error {
85+
// Backward compatibility for pods whose NVIDIA_VISIBLE_DEVICES env var takes its
86+
// value from the RUNAI-VISIBLE-DEVICES entry in the GPU sharing configmap
7287
if _, found := data[visibleDevicesBC]; found {
7388
data[visibleDevicesBC] = visibleDevicesValue
7489
}
@@ -83,12 +98,13 @@ func SetNvidiaVisibleDevices(
8398
return nil
8499
}
85100

86-
func SetNumOfGPUDevices(
101+
func SetGPUPortion(
87102
ctx context.Context, kubeClient client.Client, pod *v1.Pod, containerRef *gpusharingconfigmap.PodContainerRef,
88-
numOfGPUs string,
103+
gpuPortionStr string,
89104
) error {
90105
updateFunc := func(data map[string]string) error {
91-
data[NumOfGpusEnvVar] = numOfGPUs
106+
data[NumOfGpusEnvVarBC] = gpuPortionStr
107+
data[GPUPortion] = gpuPortionStr
92108
return nil
93109
}
94110
capabilitiesMapName, err := gpusharingconfigmap.ExtractCapabilitiesConfigMapName(pod, containerRef)
@@ -98,7 +114,7 @@ func SetNumOfGPUDevices(
98114

99115
err = UpdateConfigMapEnvironmentVariable(ctx, kubeClient, pod, capabilitiesMapName, updateFunc)
100116
if err != nil {
101-
return fmt.Errorf("failed to update gpu sharing configmap for pod <%s/%s>: %v",
117+
return fmt.Errorf("failed to update GPU_PORTION value in gpu sharing configmap for pod <%s/%s>: %v",
102118
pod.Namespace, pod.Name, err)
103119
}
104120
return nil

pkg/binder/plugins/gpusharing/gpu_sharing.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ func (p *GPUSharing) Mutate(pod *v1.Pod) error {
7474
return err
7575
}
7676

77-
common.AddVisibleDevicesEnvVars(containerRef.Container, capabilitiesConfigMapName)
77+
common.AddGPUSharingEnvVars(containerRef.Container, capabilitiesConfigMapName)
7878
common.SetConfigMapVolume(pod, capabilitiesConfigMapName)
7979
common.AddDirectEnvVarsConfigMapSource(containerRef.Container, directEnvVarsMapName)
8080

@@ -116,8 +116,7 @@ func (p *GPUSharing) PreBind(
116116
return err
117117
}
118118

119-
numOfGPUDevices := fmt.Sprintf("%v", bindRequest.Spec.ReceivedGPU.Portion)
120-
return common.SetNumOfGPUDevices(ctx, p.kubeClient, pod, containerRef, numOfGPUDevices)
119+
return common.SetGPUPortion(ctx, p.kubeClient, pod, containerRef, bindRequest.Spec.ReceivedGPU.Portion)
121120
}
122121

123122
func (p *GPUSharing) createCapabilitiesConfigMapIfMissing(ctx context.Context, pod *v1.Pod,

0 commit comments

Comments
 (0)