From 741a49d7a151327a95635fdaae0879c2486bb521 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 15:50:38 -0700 Subject: [PATCH 1/8] Use the nvidia runtime instead of the OCI hook on cri-o This aligns with how we install NVIDIA Container Toolkit when containerd is the runtime. Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 10 +++++++--- controllers/transforms_test.go | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e10e3817..7378d7b91 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -147,8 +147,8 @@ const ( CDIEnabledEnvName = "CDI_ENABLED" // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH" - // CrioConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration - CrioConfigModeEnvName = "CRIO_CONFIG_MODE" + // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration + CRIOConfigModeEnvName = "CRIO_CONFIG_MODE" // DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY" // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix @@ -1258,7 +1258,6 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if config.CDI.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config") if config.CDI.IsDefault() { 
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") } @@ -1327,6 +1326,11 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config)) } + if runtime == gpuv1.CRIO.String() { + // We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook + setContainerEnv(mainContainer, CRIOConfigModeEnvName, "config") + } + // setup mounts for runtime config file runtimeConfigFile, err := getRuntimeConfigFile(mainContainer, runtime) if err != nil { diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 4b75e62a9..794721ede 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -353,6 +353,7 @@ func TestTransformForRuntime(t *testing.T) { Name: "test-ctr", Env: []corev1.EnvVar{ {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, + {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "RUNTIME_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, {Name: "CRIO_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, }, From d4be10a6631582823cca33f05e1ccd5197c416d6 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 16:00:17 -0700 Subject: [PATCH 2/8] Always set the nvidia runtime class for operands that rely on the toolkit The runtimeClassName field will always be set now regardless of whether cri-o or containerd is the container runtime.
Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 39 ++++++++++++---------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 7378d7b91..86d8239ed 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -952,8 +952,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1323,7 +1322,7 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, if runtime == gpuv1.Containerd.String() { // Set the runtime class name that is to be configured for containerd - setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config)) + setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) } if runtime == gpuv1.CRIO.String() { @@ -1433,8 +1432,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1528,8 +1526,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(mainContainer, 
config.MIG.Strategy) @@ -1637,8 +1634,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // mount configmap for custom metrics if provided by user if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { @@ -1760,8 +1756,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) return nil } @@ -1803,8 +1798,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2097,8 +2091,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return fmt.Errorf("%v", err) } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2431,20 +2424,16 @@ func setContainerEnv(c *corev1.Container, key, value string) { c.Env = append(c.Env, corev1.EnvVar{Name: key, Value: value}) } -func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string { +func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { if config.Operator.RuntimeClass != "" { return config.Operator.RuntimeClass } 
return DefaultRuntimeClass } -func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) { - if runtime == gpuv1.Containerd { - if runtimeClass == "" { - runtimeClass = DefaultRuntimeClass - } - podSpec.RuntimeClassName = &runtimeClass - } +func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) { + runtimeClassName := getRuntimeClassName(config) + podSpec.RuntimeClassName = &runtimeClassName } func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) { @@ -4741,7 +4730,7 @@ func transformRuntimeClassLegacy(n ClusterPolicyController, spec nodev1.RuntimeC // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } @@ -4788,7 +4777,7 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } From c52d2ec1365ed5a2762ef85561468efc3ed7697d Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 18 Feb 2025 14:50:10 -0800 Subject: [PATCH 3/8] Use native CDI support in runtimes for workload containers when cdi.enabled=true This commit updates the default behavior when cdi.enabled=true. We now leverage native CDI support in containerd / cri-o to inject GPU devices into workload containers. This means we no longer configure 'nvidia' as the default runtime. Our management containers will continue to use the 'nvidia' runtime to access GPUs by explicitly setting runtimeClassName=nvidia in their pod specs. 
Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 97 ++++++++++++++++++++-------- controllers/transforms_test.go | 114 ++++++++++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 28 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 86d8239ed..cf82e8e82 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -178,6 +178,8 @@ const ( // DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path // of the driver install dir mounted in the container DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH" + // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime + NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT" ) // ContainerProbe defines container probe types @@ -1222,8 +1224,38 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } +func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { + // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o + // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The + // 'nvidia' runtime will be set as the runtime class for our management containers so that + // they get access to all GPUs. + // + // Note: one could override this and continue to configure 'nvidia' as the default runtime + // by directly setting the 'NVIDIA_RUNTIME_SET_AS_DEFAULT' environment variable to 'true' in + // the toolkit container. One can leverage the 'toolkit.env' field in ClusterPolicy to + // directly configure environment variables for the toolkit container. 
+ setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") + + if config.CDI.IsDefault() { + setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") + } +} + // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-container-toolkit-ctr" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1234,10 +1266,10 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) // set image pull secrets if len(config.Toolkit.ImagePullSecrets) > 0 { @@ -1255,16 +1287,12 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/") - if config.CDI.IsDefault() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") - } + transformToolkitCtrForCDI(mainContainer, config) } // 
set install directory for the toolkit if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), ToolkitInstallDirEnvName, config.Toolkit.InstallDir) + setContainerEnv(mainContainer, ToolkitInstallDirEnvName, config.Toolkit.InstallDir) for i, volume := range obj.Spec.Template.Spec.Volumes { if volume.Name == "toolkit-install-dir" { @@ -1273,9 +1301,9 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n } } - for i, volumeMount := range obj.Spec.Template.Spec.Containers[0].VolumeMounts { + for i, volumeMount := range mainContainer.VolumeMounts { if volumeMount.Name == "toolkit-install-dir" { - obj.Spec.Template.Spec.Containers[0].VolumeMounts[i].MountPath = config.Toolkit.InstallDir + mainContainer.VolumeMounts[i].MountPath = config.Toolkit.InstallDir break } } @@ -1292,13 +1320,13 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if len(config.Toolkit.Env) > 0 { for _, env := range config.Toolkit.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } // configure runtime runtime := n.runtime.String() - err = transformForRuntime(obj, config, runtime, "nvidia-container-toolkit-ctr") + err = transformForRuntime(obj, config, runtime, mainContainerName) if err != nil { return fmt.Errorf("error transforming toolkit daemonset : %w", err) } @@ -1384,8 +1412,30 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return nil } +func transformDevicePluginCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { + setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, DeviceListStrategyEnvName, "cdi-annotations,cdi-cri") + setContainerEnv(container, CDIAnnotationPrefixEnvName, "cdi.k8s.io/") + + if config.Toolkit.IsEnabled() { + setContainerEnv(container, 
NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) + } +} + // TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-device-plugin" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1397,10 +1447,10 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) // set image pull secrets if len(config.DevicePlugin.ImagePullSecrets) > 0 { @@ -1417,13 +1467,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set arguments if specified for device-plugin container if len(config.DevicePlugin.Args) > 0 { - obj.Spec.Template.Spec.Containers[0].Args = config.DevicePlugin.Args + mainContainer.Args = config.DevicePlugin.Args } // add env to allow injection of /dev/nvidia-fs and /dev/infiniband devices for GDS if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), GDSEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MOFEDEnabledEnvName, "true") + 
setContainerEnv(mainContainer, GDSEnabledEnvName, "true") + setContainerEnv(mainContainer, MOFEDEnabledEnvName, "true") } // apply plugin configuration through ConfigMap if one is provided @@ -1435,16 +1485,11 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support - applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) + applyMIGConfiguration(mainContainer, config.MIG.Strategy) // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/") - if config.Toolkit.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) - } + transformDevicePluginCtrForCDI(mainContainer, config) } // update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured @@ -1458,12 +1503,12 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm") } } - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MPSRootEnvName, config.DevicePlugin.MPS.Root) + setContainerEnv(mainContainer, MPSRootEnvName, config.DevicePlugin.MPS.Root) } if len(config.DevicePlugin.Env) > 0 { for _, env := range config.DevicePlugin.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 794721ede..f0c4451fe 100644 --- 
a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -756,7 +756,7 @@ func TestTransformDevicePlugin(t *testing.T) { { description: "transform device plugin", ds: NewDaemonset(). - WithContainer(corev1.Container{Name: "nvidia-device-plugin-ctr"}). + WithContainer(corev1.Container{Name: "nvidia-device-plugin"}). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ DevicePlugin: gpuv1.DevicePluginSpec{ @@ -772,7 +772,7 @@ func TestTransformDevicePlugin(t *testing.T) { }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ - Name: "nvidia-device-plugin-ctr", + Name: "nvidia-device-plugin", Image: "nvcr.io/nvidia/cloud-native/nvidia-device-plugin:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--fail-on-init-error=false"}, @@ -1822,3 +1822,113 @@ func TestTransformDriver(t *testing.T) { }) } } + +func TestTransformToolkitCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "cdi enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + }, + }), + }, + { + description: "cdi enabled and cdi default", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + Default: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, 
Value: "cdi"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + transformToolkitCtrForCDI(mainContainer, tc.cpSpec) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + +func TestTransformDevicePluginCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "toolkit disabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + }, + }), + }, + { + description: "toolkit enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + transformDevicePluginCtrForCDI(mainContainer, tc.cpSpec) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} From e63a990340806abf87950fc50dc7b2fca1a056c8 Mon Sep 17 00:00:00 2001 From: Christopher 
Desiniotis Date: Fri, 1 Aug 2025 16:42:56 -0700 Subject: [PATCH 4/8] Make the cdi.default field a no-op Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 19 +++++------------- ...rator-certified.clusterserviceversion.yaml | 14 +++++++++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 10 ++++++---- .../crd/bases/nvidia.com_clusterpolicies.yaml | 10 ++++++---- controllers/object_controls.go | 9 +++------ controllers/transforms_test.go | 20 +------------------ .../crds/nvidia.com_clusterpolicies.yaml | 10 ++++++---- deployments/gpu-operator/values.yaml | 1 - 8 files changed, 41 insertions(+), 52 deletions(-) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 5b9535dba..537be53d9 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1658,20 +1658,20 @@ type VGPUDevicesConfigSpec struct { // CDIConfigSpec defines how the Container Device Interface is used in the cluster. type CDIConfigSpec struct { - // Enabled indicates whether CDI can be used to make GPUs accessible to containers. + // Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional // +kubebuilder:default=false // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as a mechanism for making GPUs accessible to containers" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` - // Default indicates whether to use CDI as the default mechanism for providing GPU access to containers. 
+ // Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional // +kubebuilder:default=false // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Configure CDI as the default mechanism for making GPUs accessible to containers" - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden" Default *bool `json:"default,omitempty"` } @@ -2075,15 +2075,6 @@ func (c *CDIConfigSpec) IsEnabled() bool { return *c.Enabled } -// IsDefault returns true if CDI is enabled as the default -// mechanism for providing GPU access to containers -func (c *CDIConfigSpec) IsDefault() bool { - if c.Default == nil { - return false - } - return *c.Default -} - // IsEnabled returns true if Kata Manager is enabled func (k *KataManagerSpec) IsEnabled() bool { if k.Enabled == nil { diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index cd0603f6a..0f6297464 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -531,6 +531,20 @@ spec: path: toolkit.imagePullPolicy x-descriptors: - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - displayName: CDI + description: Container Device Interface (CDI) Configuration + path: cdi + - displayName: Enabled + description: 'Enabled indicates whether CDI should be 
used as the mechanism for making GPUs accessible to containers.' + path: cdi.enabled + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' + - displayName: Default + description: 'Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers.' + path: cdi.default + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:hidden' + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' - displayName: NVIDIA DCGM config description: NVIDIA DCGM config path: dcgm diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. 
+ description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index cf82e8e82..41e6abac4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1224,7 +1224,7 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } -func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { +func transformToolkitCtrForCDI(container *corev1.Container) { // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The // 'nvidia' runtime will be set as the runtime class for our management containers so that @@ -1236,10 +1236,7 @@ func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.Cluste // directly configure environment variables for the toolkit container. 
setContainerEnv(container, CDIEnabledEnvName, "true") setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") - - if config.CDI.IsDefault() { - setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") - } + setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") } // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy @@ -1287,7 +1284,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - transformToolkitCtrForCDI(mainContainer, config) + transformToolkitCtrForCDI(mainContainer) } // set install directory for the toolkit diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index f0c4451fe..c347691e8 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1838,24 +1838,6 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { Enabled: newBoolPtr(true), }, }, - expectedDs: NewDaemonset().WithContainer( - corev1.Container{ - Name: "main-ctr", - Env: []corev1.EnvVar{ - {Name: CDIEnabledEnvName, Value: "true"}, - {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, - }, - }), - }, - { - description: "cdi enabled and cdi default", - ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), - cpSpec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: newBoolPtr(true), - Default: newBoolPtr(true), - }, - }, expectedDs: NewDaemonset().WithContainer( corev1.Container{ Name: "main-ctr", @@ -1871,7 +1853,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] - transformToolkitCtrForCDI(mainContainer, tc.cpSpec) + transformToolkitCtrForCDI(mainContainer) require.EqualValues(t, tc.expectedDs, tc.ds) }) } diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml 
b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 4f457dc42..672034cad 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -14,7 +14,6 @@ psa: cdi: enabled: false - default: false sandboxWorkloads: enabled: false From 67d2e125c9490124212f99133233879c900ff181 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 16:47:04 -0700 Subject: [PATCH 5/8] Set cdi.enabled to true by default Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 4 ++-- ...rator-certified.clusterserviceversion.yaml | 3 +++ .../manifests/nvidia.com_clusterpolicies.yaml | 2 +- .../crd/bases/nvidia.com_clusterpolicies.yaml | 2 +- controllers/transforms_test.go | 20 +++++++++++++++++++ .../crds/nvidia.com_clusterpolicies.yaml | 2 +- deployments/gpu-operator/values.yaml | 2 +- 7 files changed, 29 insertions(+), 6 deletions(-) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 537be53d9..42aaf4782 100644 --- 
a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1660,7 +1660,7 @@ type VGPUDevicesConfigSpec struct { type CDIConfigSpec struct { // Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional - // +kubebuilder:default=false + // +kubebuilder:default=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" @@ -2070,7 +2070,7 @@ func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool { // providing GPU access to containers func (c *CDIConfigSpec) IsEnabled() bool { if c.Enabled == nil { - return false + return true } return *c.Enabled } diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 0f6297464..238756fca 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -34,6 +34,9 @@ metadata: "initContainer": { } }, + "cdi": { + "enabled": true + }, "sandboxWorkloads": { "enabled": false, "defaultWorkload": "container" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' 
type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index c347691e8..8c0b1455a 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -643,6 +643,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "foo", Value: "bar"}, {Name: "RUNTIME", Value: "containerd"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -713,6 +716,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -770,6 +776,10 @@ func TestTransformDevicePlugin(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ 
Name: "nvidia-device-plugin", @@ -778,6 +788,10 @@ func TestTransformDevicePlugin(t *testing.T) { Args: []string{"--fail-on-init-error=false"}, Env: []corev1.EnvVar{ {Name: "NVIDIA_MIG_MONITOR_DEVICES", Value: "all"}, + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), @@ -867,6 +881,10 @@ func TestTransformMigManager(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "mig-manager", @@ -874,6 +892,8 @@ func TestTransformMigManager(t *testing.T) { ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--test-flag"}, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. 
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 672034cad..f9c01b1fd 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -13,7 +13,7 @@ psa: enabled: false cdi: - enabled: false + enabled: true sandboxWorkloads: enabled: false From 02c3e6c2d11669ae1f2c8c13279082e034b78f8f Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 5 Aug 2025 13:36:33 -0700 Subject: [PATCH 6/8] [ci] Bump containerd version to 1.7.27 in holodeck config Containerd 1.7.0 is the first release that supports CDI. CDI devices were exclusively passed to containerd via annotations. https://github.com/containerd/containerd/releases/tag/v1.7.0 In containerd 1.7.2 support was added for making use of the new CRI fields 'Config.CDIDevices' so that CDI devices could be passed through the CRI instead of annotations. https://github.com/containerd/containerd/releases/tag/v1.7.2 This commit updates our holodeck configuration to a version of containerd that supports CDI and the new CRI fields. containerd 1.7.27 is chosen since it is the latest release on the 1.7 branch that is available in the Ubuntu 22.04 repositories. Signed-off-by: Christopher Desiniotis --- tests/holodeck.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/holodeck.yaml b/tests/holodeck.yaml index e30131457..ce4d96568 100644 --- a/tests/holodeck.yaml +++ b/tests/holodeck.yaml @@ -27,6 +27,7 @@ spec: containerRuntime: install: true name: containerd + version: 1.7.27 kubernetes: install: true installer: kubeadm From d2d8d5db0bcf5514f563897fadc416990bbe72e5 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 29 Aug 2025 15:17:45 -0700 Subject: [PATCH 7/8] Add ld.so.cache file to the gpu-operator container image The distroless image does not contain the ld.so.cache file. The update-ldcache createContainer hook will skip updating the container's ldcache if this file does not exist. 
This can lead to issues running the CUDA vectorAdd sample if the NVIDIA libraries are not present in the default dynamic linker search path(s). Signed-off-by: Christopher Desiniotis --- docker/Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 47805845e..476f771f9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -121,6 +121,13 @@ COPY --from=sample-builder /build/vectorAdd /usr/bin/vectorAdd # Once new sample images are published that contain the compat libs, we can update the below. COPY --from=builder /usr/local/cuda/compat /usr/local/cuda/compat +# The distroless image does not contain the ld.so.cache file. The update-ldcache +# createContainer hook will skip updating the container's ldcache if this file +# does not exist. This can lead to issues running the CUDA vectorAdd sample +# if the NVIDIA libraries are not present in the default dynamic linker search +# path(s). +COPY --from=sample-builder /etc/ld.so.cache /etc/ + COPY assets /opt/gpu-operator/ COPY manifests /opt/gpu-operator/manifests COPY validator/manifests /opt/validator/manifests From ea036dbf3e11c4cfa8004d8054a92d46b038464f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 16:21:12 +0200 Subject: [PATCH 8/8] [no-relnote] Address lint comments Signed-off-by: Evan Lezar --- controllers/transforms_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 8c0b1455a..0afebe039 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1872,7 +1872,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] transformToolkitCtrForCDI(mainContainer) require.EqualValues(t, tc.expectedDs, tc.ds) }) @@ -1928,7 
+1928,7 @@ func TestTransformDevicePluginCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] transformDevicePluginCtrForCDI(mainContainer, tc.cpSpec) require.EqualValues(t, tc.expectedDs, tc.ds) })