Skip to content

Commit 0767a6d

Browse files
authored
fix(operator): support latest gpu operator cdi detection (#641)
1 parent d65607a commit 0767a6d

File tree

3 files changed

+59
-18
lines changed

3 files changed

+59
-18
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2121
- Fixed a bug where the scheduler would not re-try updating podgroup status after failure
2222
- Fixed a bug where ray workloads gang scheduling would ignore `minReplicas` if autoscaling was not set
2323
- Fixed wrong KAI Config statuses when the Prometheus operand is enabled
24+
- Support for GPU Operator v25.10.0 in CDI-enabled environments
2425

2526
## [v0.9.1] - 2025-09-15
2627

pkg/operator/operands/binder/binder_test.go

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,26 +96,59 @@ var _ = Describe("Binder", func() {
9696
Expect(deployment.Spec.Template.Labels).To(HaveKeyWithValue("kai", "scheduler"))
9797
})
9898

99-
It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
100-
clusterPolicy := &nvidiav1.ClusterPolicy{
101-
ObjectMeta: metav1.ObjectMeta{
102-
Name: "test",
103-
},
104-
Spec: nvidiav1.ClusterPolicySpec{
105-
CDI: nvidiav1.CDIConfigSpec{
106-
Enabled: ptr.To(true),
107-
Default: ptr.To(true),
99+
Context("CDI Detection", func() {
100+
var (
101+
clusterPolicy *nvidiav1.ClusterPolicy
102+
)
103+
BeforeEach(func() {
104+
clusterPolicy = &nvidiav1.ClusterPolicy{
105+
ObjectMeta: metav1.ObjectMeta{
106+
Name: "test",
108107
},
109-
},
110-
}
108+
Spec: nvidiav1.ClusterPolicySpec{
109+
CDI: nvidiav1.CDIConfigSpec{
110+
Enabled: ptr.To(true),
111+
Default: ptr.To(true),
112+
},
113+
},
114+
}
115+
})
111116

112-
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
113-
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
114-
Expect(err).To(BeNil())
117+
It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
118+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
119+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
120+
Expect(err).To(BeNil())
115121

116-
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
117-
Expect(deploymentT).NotTo(BeNil())
118-
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
122+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
123+
Expect(deploymentT).NotTo(BeNil())
124+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
125+
})
126+
127+
It("sets CDI flag to false if not set by default cluser policy", func(ctx context.Context) {
128+
clusterPolicy.Spec.CDI.Default = ptr.To(false)
129+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
130+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
131+
Expect(err).To(BeNil())
132+
133+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
134+
Expect(deploymentT).NotTo(BeNil())
135+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false"))
136+
})
137+
138+
It("detects CDI state with GPU Operator >= v25.10.0", func(ctx context.Context) {
139+
clusterPolicy.Labels = map[string]string{
140+
versionLabelName: gpuOperatorVersionDefaultCDIDeprecated,
141+
}
142+
clusterPolicy.Spec.CDI.Default = ptr.To(false)
143+
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
144+
145+
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
146+
Expect(err).To(BeNil())
147+
148+
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
149+
Expect(deploymentT).NotTo(BeNil())
150+
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
151+
})
119152
})
120153
})
121154

pkg/operator/operands/binder/resources.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"k8s.io/apimachinery/pkg/api/meta"
1515
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1616
"k8s.io/apimachinery/pkg/util/intstr"
17+
"k8s.io/apimachinery/pkg/version"
1718
"sigs.k8s.io/controller-runtime/pkg/client"
1819
"sigs.k8s.io/controller-runtime/pkg/log"
1920

@@ -26,7 +27,9 @@ import (
2627
)
2728

2829
const (
29-
defaultResourceName = "binder"
30+
defaultResourceName = "binder"
31+
gpuOperatorVersionDefaultCDIDeprecated = "v25.10.0"
32+
versionLabelName = "app.kubernetes.io/version"
3033
)
3134

3235
func (b *Binder) deploymentForKAIConfig(
@@ -184,6 +187,10 @@ func isCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error)
184187

185188
nvidiaClusterPolicy := nvidiaClusterPolicies.Items[0]
186189
if nvidiaClusterPolicy.Spec.CDI.Enabled != nil && *nvidiaClusterPolicy.Spec.CDI.Enabled {
190+
gpuOperatorVersion, found := nvidiaClusterPolicy.Labels[versionLabelName]
191+
if found && version.CompareKubeAwareVersionStrings(gpuOperatorVersion, gpuOperatorVersionDefaultCDIDeprecated) >= 0 {
192+
return true, nil
193+
}
187194
if nvidiaClusterPolicy.Spec.CDI.Default != nil && *nvidiaClusterPolicy.Spec.CDI.Default {
188195
return true, nil
189196
}

0 commit comments

Comments
 (0)