Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Fixed a bug where the scheduler would not re-try updating podgroup status after failure
- Fixed a bug where ray workloads gang scheduling would ignore `minReplicas` if autoscaling was not set
- Fixed wrong KAI Config statuses when the prometheus operand is enabled
- GPU-Operator v25.10.0 support for CDI enabled environments

## [v0.9.1] - 2025-09-15

Expand Down
67 changes: 50 additions & 17 deletions pkg/operator/operands/binder/binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,59 @@ var _ = Describe("Binder", func() {
Expect(deployment.Spec.Template.Labels).To(HaveKeyWithValue("kai", "scheduler"))
})

It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
clusterPolicy := &nvidiav1.ClusterPolicy{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
},
Spec: nvidiav1.ClusterPolicySpec{
CDI: nvidiav1.CDIConfigSpec{
Enabled: ptr.To(true),
Default: ptr.To(true),
Context("CDI Detection", func() {
var (
clusterPolicy *nvidiav1.ClusterPolicy
)
BeforeEach(func() {
clusterPolicy = &nvidiav1.ClusterPolicy{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
},
},
}
Spec: nvidiav1.ClusterPolicySpec{
CDI: nvidiav1.CDIConfigSpec{
Enabled: ptr.To(true),
Default: ptr.To(true),
},
},
}
})

Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
Expect(err).To(BeNil())
It("sets CDI flag if set in cluser policy", func(ctx context.Context) {
Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())
objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
Expect(err).To(BeNil())

deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
Expect(deploymentT).NotTo(BeNil())
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
Expect(deploymentT).NotTo(BeNil())
Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true"))
})

// CDI is enabled in the ClusterPolicy but is not the default mode, and the
// policy carries no GPU Operator version label, so the generated Deployment's
// first container is expected to receive --cdi-enabled=false.
It("sets CDI flag to false if not set by default cluster policy", func(ctx context.Context) {
	clusterPolicy.Spec.CDI.Default = ptr.To(false)
	Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())

	objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
	Expect(err).To(BeNil())

	deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects)
	Expect(deploymentT).NotTo(BeNil())
	Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false"))
})

// The ClusterPolicy is labelled with the GPU Operator version threshold while
// CDI.Default is false; the generated Deployment's first container is still
// expected to receive --cdi-enabled=true.
It("detects CDI state with GPU Operator >= v25.10.0", func(ctx context.Context) {
	clusterPolicy.Spec.CDI.Default = ptr.To(false)
	clusterPolicy.Labels = map[string]string{versionLabelName: gpuOperatorVersionDefaultCDIDeprecated}
	Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed())

	desired, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig)
	Expect(err).To(BeNil())

	deployment := test_utils.FindTypeInObjects[*appsv1.Deployment](desired)
	Expect(deployment).NotTo(BeNil())

	containerArgs := (*deployment).Spec.Template.Spec.Containers[0].Args
	Expect(containerArgs).To(ContainElement("--cdi-enabled=true"))
})
})
})

Expand Down
9 changes: 8 additions & 1 deletion pkg/operator/operands/binder/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/version"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"

Expand All @@ -26,7 +27,9 @@ import (
)

const (
defaultResourceName = "binder"
// defaultResourceName is the literal "binder"; its use sites are outside this
// view (presumably the default name of the binder's resources — TODO confirm).
defaultResourceName                    = "binder"
// gpuOperatorVersionDefaultCDIDeprecated is the GPU Operator version threshold
// used by isCdiEnabled: when the ClusterPolicy has Spec.CDI.Enabled set to true
// and its version label compares >= this value, CDI is reported as enabled
// without consulting Spec.CDI.Default.
// NOTE(review): the comparison uses version.CompareKubeAwareVersionStrings,
// which is documented for kube-style versions (v1, v2beta1); confirm it orders
// semver-like strings such as "v25.10.0" as intended.
gpuOperatorVersionDefaultCDIDeprecated = "v25.10.0"
// versionLabelName is the ClusterPolicy label key from which isCdiEnabled
// reads the GPU Operator version.
versionLabelName                       = "app.kubernetes.io/version"
)

func (b *Binder) deploymentForKAIConfig(
Expand Down Expand Up @@ -184,6 +187,10 @@ func isCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error)

nvidiaClusterPolicy := nvidiaClusterPolicies.Items[0]
if nvidiaClusterPolicy.Spec.CDI.Enabled != nil && *nvidiaClusterPolicy.Spec.CDI.Enabled {
gpuOperatorVersion, found := nvidiaClusterPolicy.Labels[versionLabelName]
if found && version.CompareKubeAwareVersionStrings(gpuOperatorVersion, gpuOperatorVersionDefaultCDIDeprecated) >= 0 {
return true, nil
}
if nvidiaClusterPolicy.Spec.CDI.Default != nil && *nvidiaClusterPolicy.Spec.CDI.Default {
return true, nil
}
Expand Down
Loading