diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e4d8c57e..311333d2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - (Openshift only) - High CPU usage for the operator pod due to continues reconciles - Fixed a bug where the scheduler would not re-try updating podgroup status after failure - Added missing SCC for Openshift installations +- GPU-Operator v25.10.0 support for CDI enabled environments ## [v0.9.1] - 20250-09-15 diff --git a/pkg/operator/operands/binder/binder_test.go b/pkg/operator/operands/binder/binder_test.go index 2de7bb927..119f88c07 100644 --- a/pkg/operator/operands/binder/binder_test.go +++ b/pkg/operator/operands/binder/binder_test.go @@ -96,26 +96,59 @@ var _ = Describe("Binder", func() { Expect(deployment.Spec.Template.Labels).To(HaveKeyWithValue("kai", "scheduler")) }) - It("sets CDI flag if set in cluser policy", func(ctx context.Context) { - clusterPolicy := &nvidiav1.ClusterPolicy{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test", - }, - Spec: nvidiav1.ClusterPolicySpec{ - CDI: nvidiav1.CDIConfigSpec{ - Enabled: ptr.To(true), - Default: ptr.To(true), + Context("CDI Detection", func() { + var ( + clusterPolicy *nvidiav1.ClusterPolicy + ) + BeforeEach(func() { + clusterPolicy = &nvidiav1.ClusterPolicy{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", }, - }, - } + Spec: nvidiav1.ClusterPolicySpec{ + CDI: nvidiav1.CDIConfigSpec{ + Enabled: ptr.To(true), + Default: ptr.To(true), + }, + }, + } + }) - Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed()) - objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig) - Expect(err).To(BeNil()) + It("sets CDI flag if set in cluser policy", func(ctx context.Context) { + Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed()) + objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig) + Expect(err).To(BeNil()) - deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects) - Expect(deploymentT).NotTo(BeNil()) - Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true")) + deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects) + Expect(deploymentT).NotTo(BeNil()) + Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true")) + }) + + It("sets CDI flag to false if not set by default cluser policy", func(ctx context.Context) { + clusterPolicy.Spec.CDI.Default = ptr.To(false) + Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed()) + objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig) + Expect(err).To(BeNil()) + + deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects) + Expect(deploymentT).NotTo(BeNil()) + Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=false")) + }) + + It("detects CDI state with GPU Operator >= v25.10.0", func(ctx context.Context) { + clusterPolicy.Labels = map[string]string{ + versionLabelName: gpuOperatorVersionDefaultCDIDeprecated, + } + clusterPolicy.Spec.CDI.Default = ptr.To(false) + Expect(fakeKubeClient.Create(ctx, clusterPolicy)).To(Succeed()) + + objects, err := b.DesiredState(ctx, fakeKubeClient, kaiConfig) + Expect(err).To(BeNil()) + + deploymentT := test_utils.FindTypeInObjects[*appsv1.Deployment](objects) + Expect(deploymentT).NotTo(BeNil()) + Expect((*deploymentT).Spec.Template.Spec.Containers[0].Args).To(ContainElement("--cdi-enabled=true")) + }) }) }) diff --git a/pkg/operator/operands/binder/resources.go b/pkg/operator/operands/binder/resources.go index ecd95d398..c52682569 100644 --- a/pkg/operator/operands/binder/resources.go +++ b/pkg/operator/operands/binder/resources.go @@ -13,6 +13,7 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/version" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -25,7 +26,9 @@ import ( ) const ( - mainResourceName = "binder" + mainResourceName = "binder" + gpuOperatorVersionDefaultCDIDeprecated = "v25.10.0" + versionLabelName = "app.kubernetes.io/version" ) func deploymentForKAIConfig( @@ -183,6 +186,10 @@ func isCdiEnabled(ctx context.Context, readerClient client.Reader) (bool, error) nvidiaClusterPolicy := nvidiaClusterPolicies.Items[0] if nvidiaClusterPolicy.Spec.CDI.Enabled != nil && *nvidiaClusterPolicy.Spec.CDI.Enabled { + gpuOperatorVersion, found := nvidiaClusterPolicy.Labels[versionLabelName] + if found && version.CompareKubeAwareVersionStrings(gpuOperatorVersion, gpuOperatorVersionDefaultCDIDeprecated) >= 0 { + return true, nil + } if nvidiaClusterPolicy.Spec.CDI.Default != nil && *nvidiaClusterPolicy.Spec.CDI.Default { return true, nil }