Skip to content

Commit 02344bf

Browse files
committed
feat(resource reservation): add JSON configuration for CPU and memory resources in GPU reservation pods
1 parent 6114d85 commit 02344bf

File tree

6 files changed

+108
-74
lines changed

6 files changed

+108
-74
lines changed

cmd/binder/app/app.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package app
55

66
import (
77
"context"
8+
"encoding/json"
89
"fmt"
910
"time"
1011

@@ -104,12 +105,21 @@ func New(options *Options, config *rest.Config) (*App, error) {
104105
kubeClient := draversionawareclient.NewDRAAwareClient(kubernetes.NewForConfigOrDie(config))
105106
informerFactory := informers.NewSharedInformerFactory(kubeClient, 0)
106107

108+
// Deserialize pod resources if provided
109+
var podResources *corev1.ResourceRequirements
110+
if options.ResourceReservationPodResourcesJSON != "" {
111+
podResources = &corev1.ResourceRequirements{}
112+
if err := json.Unmarshal([]byte(options.ResourceReservationPodResourcesJSON), podResources); err != nil {
113+
setupLog.Error(err, "failed to unmarshal resource reservation pod resources")
114+
return nil, fmt.Errorf("failed to unmarshal resource reservation pod resources: %w", err)
115+
}
116+
}
117+
107118
rrs := resourcereservation.NewService(options.FakeGPUNodes, clientWithWatch, options.ResourceReservationPodImage,
108119
time.Duration(options.ResourceReservationAllocationTimeout)*time.Second,
109120
options.ResourceReservationNamespace, options.ResourceReservationServiceAccount,
110121
options.ResourceReservationAppLabel, options.ScalingPodNamespace, options.RuntimeClassName,
111-
options.ResourceReservationPodCPURequest, options.ResourceReservationPodMemoryRequest,
112-
options.ResourceReservationPodCPULimit, options.ResourceReservationPodMemoryLimit)
122+
podResources)
113123

114124
reconcilerParams := &controllers.ReconcilerParams{
115125
MaxConcurrentReconciles: options.MaxConcurrentReconciles,

cmd/binder/app/options.go

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,7 @@ type Options struct {
1717
ResourceReservationPodImage string
1818
ResourceReservationAppLabel string
1919
ResourceReservationAllocationTimeout int
20-
ResourceReservationPodCPURequest string
21-
ResourceReservationPodMemoryRequest string
22-
ResourceReservationPodCPULimit string
23-
ResourceReservationPodMemoryLimit string
20+
ResourceReservationPodResourcesJSON string
2421
ScalingPodNamespace string
2522
QPS float64
2623
Burst int
@@ -61,18 +58,9 @@ func InitOptions(fs *pflag.FlagSet) *Options {
6158
fs.IntVar(&options.ResourceReservationAllocationTimeout,
6259
"resource-reservation-allocation-timeout", 40,
6360
"Resource reservation allocation timeout in seconds")
64-
fs.StringVar(&options.ResourceReservationPodCPURequest,
65-
"resource-reservation-pod-cpu-request", "",
66-
"CPU request for GPU reservation pods (optional, empty means not set)")
67-
fs.StringVar(&options.ResourceReservationPodMemoryRequest,
68-
"resource-reservation-pod-memory-request", "",
69-
"Memory request for GPU reservation pods (optional, empty means not set)")
70-
fs.StringVar(&options.ResourceReservationPodCPULimit,
71-
"resource-reservation-pod-cpu-limit", "",
72-
"CPU limit for GPU reservation pods (optional, empty means not set)")
73-
fs.StringVar(&options.ResourceReservationPodMemoryLimit,
74-
"resource-reservation-pod-memory-limit", "",
75-
"Memory limit for GPU reservation pods (optional, empty means not set)")
61+
fs.StringVar(&options.ResourceReservationPodResourcesJSON,
62+
"resource-reservation-pod-resources", "",
63+
"JSON-serialized ResourceRequirements for GPU reservation pods (optional, empty means not set)")
7664
fs.StringVar(&options.ScalingPodNamespace,
7765
"scale-adjust-namespace", constants.DefaultScaleAdjustName,
7866
"Scaling pods namespace")

pkg/apis/kai/v1/binder/binder.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ type ResourceReservation struct {
103103
// RuntimeClassName specifies the runtime class used by the reservation pods. Needs to allow access to the GPU
104104
RuntimeClassName *string `json:"runtimeClassName,omitempty"`
105105

106-
// PodResources specifies the CPU and memory resource requests and limits for GPU reservation pods
106+
// PodResources specifies the CPU and memory resource requests and limits for GPU reservation pods.
107+
// If not set, Kubernetes defaults will be used, which allows for better backward compatibility.
107108
// +kubebuilder:validation:Optional
108109
PodResources *common.Resources `json:"podResources,omitempty"`
109110
}
@@ -117,7 +118,4 @@ func (r *ResourceReservation) SetDefaultsWhereNeeded() {
117118
r.ServiceAccountName = common.SetDefault(r.ServiceAccountName, ptr.To(constants.DefaultResourceReservationName))
118119
r.AppLabel = common.SetDefault(r.AppLabel, ptr.To(constants.DefaultResourceReservationName))
119120
r.RuntimeClassName = common.SetDefault(r.RuntimeClassName, ptr.To(constants.DefaultRuntimeClassName))
120-
121-
// PodResources is optional - if not set, Kubernetes defaults will be used
122-
// This allows for better backward compatibility
123121
}

pkg/binder/binding/resourcereservation/resource_reservation.go

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,7 @@ type service struct {
5353
appLabelValue string
5454
scalingPodNamespace string
5555
runtimeClassName string
56-
podCPURequest string
57-
podMemoryRequest string
58-
podCPULimit string
59-
podMemoryLimit string
56+
podResources *v1.ResourceRequirements
6057
}
6158

6259
func NewService(
@@ -69,10 +66,7 @@ func NewService(
6966
appLabelValue string,
7067
scalingPodNamespace string,
7168
runtimeClassName string,
72-
podCPURequest string,
73-
podMemoryRequest string,
74-
podCPULimit string,
75-
podMemoryLimit string,
69+
podResources *v1.ResourceRequirements,
7670
) *service {
7771
return &service{
7872
fakeGPuNodes: fakeGPuNodes,
@@ -85,10 +79,7 @@ func NewService(
8579
appLabelValue: appLabelValue,
8680
scalingPodNamespace: scalingPodNamespace,
8781
runtimeClassName: runtimeClassName,
88-
podCPURequest: podCPURequest,
89-
podMemoryRequest: podMemoryRequest,
90-
podCPULimit: podCPULimit,
91-
podMemoryLimit: podMemoryLimit,
82+
podResources: podResources,
9283
}
9384
}
9485

@@ -395,8 +386,7 @@ func (rsc *service) createGPUReservationPod(ctx context.Context, nodeName, gpuGr
395386

396387
podName := fmt.Sprintf("%s-%s-%s", gpuReservationPodPrefix, nodeName, rand.String(reservationPodRandomCharacters))
397388

398-
// Build resource requirements with configured values
399-
// Only set CPU/Memory if explicitly configured (better backward compatibility)
389+
// Build resource requirements starting with GPU resources
400390
resources := v1.ResourceRequirements{
401391
Limits: v1.ResourceList{
402392
constants.GpuResource: *resource.NewQuantity(numberOfGPUsToReserve, resource.DecimalSI),
@@ -406,20 +396,24 @@ func (rsc *service) createGPUReservationPod(ctx context.Context, nodeName, gpuGr
406396
},
407397
}
408398

409-
// Only set CPU/Memory limits if configured
410-
if rsc.podCPULimit != "" {
411-
resources.Limits[v1.ResourceCPU] = resource.MustParse(rsc.podCPULimit)
412-
}
413-
if rsc.podMemoryLimit != "" {
414-
resources.Limits[v1.ResourceMemory] = resource.MustParse(rsc.podMemoryLimit)
415-
}
416-
417-
// Only set CPU/Memory requests if configured
418-
if rsc.podCPURequest != "" {
419-
resources.Requests[v1.ResourceCPU] = resource.MustParse(rsc.podCPURequest)
420-
}
421-
if rsc.podMemoryRequest != "" {
422-
resources.Requests[v1.ResourceMemory] = resource.MustParse(rsc.podMemoryRequest)
399+
// Merge in configured CPU/Memory resources if provided, but skip GPU resources to prevent override
400+
if rsc.podResources != nil {
401+
if rsc.podResources.Limits != nil {
402+
for resourceName, quantity := range rsc.podResources.Limits {
403+
// Skip GPU resources - they are already set correctly
404+
if resourceName != constants.GpuResource {
405+
resources.Limits[resourceName] = quantity
406+
}
407+
}
408+
}
409+
if rsc.podResources.Requests != nil {
410+
for resourceName, quantity := range rsc.podResources.Requests {
411+
// Skip GPU resources - they are already set correctly
412+
if resourceName != constants.GpuResource {
413+
resources.Requests[resourceName] = quantity
414+
}
415+
}
416+
}
423417
}
424418

425419
pod, err := rsc.createResourceReservationPod(nodeName, gpuGroup, podName, resources)

pkg/binder/binding/resourcereservation/resource_reservation_test.go

Lines changed: 60 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func initializeTestService(
4747
) *service {
4848
service := NewService(false, client, "", 40*time.Millisecond,
4949
resourceReservationNameSpace, resourceReservationServiceAccount, resourceReservationAppLabelValue, scalingPodsNamespace, constants.DefaultRuntimeClassName,
50-
"", "", "", "") // Empty resource configs to use defaults
50+
nil) // nil podResources to use defaults
5151

5252
return service
5353
}
@@ -952,10 +952,16 @@ var _ = Describe("ResourceReservationService", func() {
952952
reservationPodImage: "test-image:latest",
953953
kubeClient: fake.NewClientBuilder().Build(),
954954
runtimeClassName: "nvidia",
955-
podCPURequest: "2m",
956-
podMemoryRequest: "20Mi",
957-
podCPULimit: "100m",
958-
podMemoryLimit: "200Mi",
955+
podResources: &v1.ResourceRequirements{
956+
Requests: v1.ResourceList{
957+
v1.ResourceCPU: resource.MustParse("2m"),
958+
v1.ResourceMemory: resource.MustParse("20Mi"),
959+
},
960+
Limits: v1.ResourceList{
961+
v1.ResourceCPU: resource.MustParse("100m"),
962+
v1.ResourceMemory: resource.MustParse("200Mi"),
963+
},
964+
},
959965
scalingPodNamespace: scalingPodsNamespace,
960966
}
961967

@@ -986,10 +992,7 @@ var _ = Describe("ResourceReservationService", func() {
986992
reservationPodImage: "test-image:latest",
987993
kubeClient: fake.NewClientBuilder().Build(),
988994
runtimeClassName: "nvidia",
989-
podCPURequest: "",
990-
podMemoryRequest: "",
991-
podCPULimit: "",
992-
podMemoryLimit: "",
995+
podResources: nil,
993996
scalingPodNamespace: scalingPodsNamespace,
994997
}
995998

@@ -1025,10 +1028,14 @@ var _ = Describe("ResourceReservationService", func() {
10251028
reservationPodImage: "test-image:latest",
10261029
kubeClient: fake.NewClientBuilder().Build(),
10271030
runtimeClassName: "nvidia",
1028-
podCPURequest: "5m",
1029-
podMemoryRequest: "",
1030-
podCPULimit: "50m",
1031-
podMemoryLimit: "",
1031+
podResources: &v1.ResourceRequirements{
1032+
Requests: v1.ResourceList{
1033+
v1.ResourceCPU: resource.MustParse("5m"),
1034+
},
1035+
Limits: v1.ResourceList{
1036+
v1.ResourceCPU: resource.MustParse("50m"),
1037+
},
1038+
},
10321039
scalingPodNamespace: scalingPodsNamespace,
10331040
}
10341041

@@ -1048,6 +1055,46 @@ var _ = Describe("ResourceReservationService", func() {
10481055
Expect(memRequestExists).To(BeFalse())
10491056
Expect(memLimitExists).To(BeFalse())
10501057
})
1058+
1059+
It("should not allow GPU resources to be overridden by podResources", func() {
1060+
// This test ensures that even if podResources contains GPU configuration,
1061+
// it won't override the GPU resource value set by the service
1062+
rsc := &service{
1063+
namespace: "kai-resource-reservation",
1064+
appLabelValue: "kai-reservation",
1065+
serviceAccountName: "kai-sa",
1066+
reservationPodImage: "test-image:latest",
1067+
kubeClient: fake.NewClientBuilder().Build(),
1068+
runtimeClassName: "nvidia",
1069+
podResources: &v1.ResourceRequirements{
1070+
Requests: v1.ResourceList{
1071+
v1.ResourceCPU: resource.MustParse("10m"),
1072+
constants.GpuResource: resource.MustParse("999"), // This should be ignored
1073+
},
1074+
Limits: v1.ResourceList{
1075+
v1.ResourceCPU: resource.MustParse("100m"),
1076+
constants.GpuResource: resource.MustParse("999"), // This should be ignored
1077+
},
1078+
},
1079+
scalingPodNamespace: scalingPodsNamespace,
1080+
}
1081+
1082+
pod, err := rsc.createGPUReservationPod(context.TODO(), "test-node", "test-gpu-group")
1083+
Expect(err).To(BeNil())
1084+
Expect(pod).NotTo(BeNil())
1085+
1086+
container := pod.Spec.Containers[0]
1087+
1088+
// Verify CPU resources from podResources are set
1089+
Expect(container.Resources.Requests[v1.ResourceCPU]).To(Equal(resource.MustParse("10m")))
1090+
Expect(container.Resources.Limits[v1.ResourceCPU]).To(Equal(resource.MustParse("100m")))
1091+
1092+
// Verify GPU resources are NOT overridden - should always be 1
1093+
gpuRequest := container.Resources.Requests[constants.GpuResource]
1094+
gpuLimit := container.Resources.Limits[constants.GpuResource]
1095+
Expect(gpuRequest.Value()).To(Equal(int64(1)), "GPU request should be 1, not overridden by podResources")
1096+
Expect(gpuLimit.Value()).To(Equal(int64(1)), "GPU limit should be 1, not overridden by podResources")
1097+
})
10511098
})
10521099
})
10531100

pkg/operator/operands/binder/resources.go

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package binder
55

66
import (
77
"context"
8+
"encoding/json"
89
"fmt"
910

1011
appsv1 "k8s.io/api/apps/v1"
@@ -233,19 +234,15 @@ func buildArgsList(kaiConfig *kaiv1.Config, config *kaiv1binder.Binder, fakeGPU
233234
args = append(args, []string{fmt.Sprintf("--runtime-class-name=%s", *config.ResourceReservation.RuntimeClassName)}...)
234235
}
235236

236-
// Add GPU reservation pod resource configurations
237+
// Serialize and add GPU reservation pod resource configurations
237238
if config.ResourceReservation.PodResources != nil {
238-
if cpuRequest, found := config.ResourceReservation.PodResources.Requests[v1.ResourceCPU]; found {
239-
args = append(args, []string{"--resource-reservation-pod-cpu-request", cpuRequest.String()}...)
239+
resourceRequirements := v1.ResourceRequirements{
240+
Requests: config.ResourceReservation.PodResources.Requests,
241+
Limits: config.ResourceReservation.PodResources.Limits,
240242
}
241-
if memoryRequest, found := config.ResourceReservation.PodResources.Requests[v1.ResourceMemory]; found {
242-
args = append(args, []string{"--resource-reservation-pod-memory-request", memoryRequest.String()}...)
243-
}
244-
if cpuLimit, found := config.ResourceReservation.PodResources.Limits[v1.ResourceCPU]; found {
245-
args = append(args, []string{"--resource-reservation-pod-cpu-limit", cpuLimit.String()}...)
246-
}
247-
if memoryLimit, found := config.ResourceReservation.PodResources.Limits[v1.ResourceMemory]; found {
248-
args = append(args, []string{"--resource-reservation-pod-memory-limit", memoryLimit.String()}...)
243+
resourcesJSON, err := json.Marshal(resourceRequirements)
244+
if err == nil {
245+
args = append(args, []string{"--resource-reservation-pod-resources", string(resourcesJSON)}...)
249246
}
250247
}
251248

0 commit comments

Comments
 (0)