From 741a49d7a151327a95635fdaae0879c2486bb521 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 15:50:38 -0700 Subject: [PATCH 1/8] Use the nvidia runtime instead of the OCI hook on cri-o This aligns with how we install NVIDIA Container Toolkit when containerd is the runtime. Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 10 +++++++--- controllers/transforms_test.go | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 0e10e3817..7378d7b91 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -147,8 +147,8 @@ const ( CDIEnabledEnvName = "CDI_ENABLED" // NvidiaCDIHookPathEnvName is the name of the envvar specifying the path to the 'nvidia-cdi-hook' binary NvidiaCDIHookPathEnvName = "NVIDIA_CDI_HOOK_PATH" - // CrioConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration - CrioConfigModeEnvName = "CRIO_CONFIG_MODE" + // CRIOConfigModeEnvName is the name of the envvar controlling how the toolkit container updates the cri-o configuration + CRIOConfigModeEnvName = "CRIO_CONFIG_MODE" // DeviceListStrategyEnvName is the name of the envvar for configuring the device-list-strategy in the device-plugin DeviceListStrategyEnvName = "DEVICE_LIST_STRATEGY" // CDIAnnotationPrefixEnvName is the name of the device-plugin envvar for configuring the CDI annotation prefix @@ -1258,7 +1258,6 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if config.CDI.IsEnabled() { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CrioConfigModeEnvName, "config") if config.CDI.IsDefault() { 
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") } @@ -1327,6 +1326,11 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config)) } + if runtime == gpuv1.CRIO.String() { + // We add the nvidia runtime to the cri-o config by default instead of installing the OCI prestart hook + setContainerEnv(mainContainer, CRIOConfigModeEnvName, "config") + } + // setup mounts for runtime config file runtimeConfigFile, err := getRuntimeConfigFile(mainContainer, runtime) if err != nil { diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 4b75e62a9..794721ede 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -353,6 +353,7 @@ func TestTransformForRuntime(t *testing.T) { Name: "test-ctr", Env: []corev1.EnvVar{ {Name: "RUNTIME", Value: gpuv1.CRIO.String()}, + {Name: CRIOConfigModeEnvName, Value: "config"}, {Name: "RUNTIME_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, {Name: "CRIO_CONFIG", Value: filepath.Join(DefaultRuntimeConfigTargetDir, filepath.Base(DefaultCRIOConfigFile))}, }, From d4be10a6631582823cca33f05e1ccd5197c416d6 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 16:00:17 -0700 Subject: [PATCH 2/8] Always set the nvidia runtime class for operands that rely on the toolkit The runtimeClassName field will always be set now regardless of whether cri-o or containerd is the container runtime.
Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 39 ++++++++++++---------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 7378d7b91..86d8239ed 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -952,8 +952,7 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1323,7 +1322,7 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, if runtime == gpuv1.Containerd.String() { // Set the runtime class name that is to be configured for containerd - setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClass(config)) + setContainerEnv(mainContainer, "CONTAINERD_RUNTIME_CLASS", getRuntimeClassName(config)) } if runtime == gpuv1.CRIO.String() { @@ -1433,8 +1432,7 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) @@ -1528,8 +1526,7 @@ func TransformMPSControlDaemon(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolic return err } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support applyMIGConfiguration(mainContainer, 
config.MIG.Strategy) @@ -1637,8 +1634,7 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // mount configmap for custom metrics if provided by user if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" { @@ -1760,8 +1756,7 @@ func TransformDCGM(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n Clu } } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) return nil } @@ -1803,8 +1798,7 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, obj.Spec.Template.Spec.Containers[0].Args = config.MIGManager.Args } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) // set ConfigMap name for "mig-parted-config" Volume for i, vol := range obj.Spec.Template.Spec.Volumes { @@ -2097,8 +2091,7 @@ func TransformValidator(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return fmt.Errorf("%v", err) } - // set RuntimeClass for supported runtimes - setRuntimeClass(&obj.Spec.Template.Spec, n.runtime, config.Operator.RuntimeClass) + setRuntimeClassName(&obj.Spec.Template.Spec, config) var validatorErr error // apply changes for individual component validators(initContainers) @@ -2431,20 +2424,16 @@ func setContainerEnv(c *corev1.Container, key, value string) { c.Env = append(c.Env, corev1.EnvVar{Name: key, Value: value}) } -func getRuntimeClass(config *gpuv1.ClusterPolicySpec) string { +func getRuntimeClassName(config *gpuv1.ClusterPolicySpec) string { if config.Operator.RuntimeClass != "" { return config.Operator.RuntimeClass } 
return DefaultRuntimeClass } -func setRuntimeClass(podSpec *corev1.PodSpec, runtime gpuv1.Runtime, runtimeClass string) { - if runtime == gpuv1.Containerd { - if runtimeClass == "" { - runtimeClass = DefaultRuntimeClass - } - podSpec.RuntimeClassName = &runtimeClass - } +func setRuntimeClassName(podSpec *corev1.PodSpec, config *gpuv1.ClusterPolicySpec) { + runtimeClassName := getRuntimeClassName(config) + podSpec.RuntimeClassName = &runtimeClassName } func setContainerProbe(container *corev1.Container, probe *gpuv1.ContainerProbeSpec, probeType ContainerProbe) { @@ -4741,7 +4730,7 @@ func transformRuntimeClassLegacy(n ClusterPolicyController, spec nodev1.RuntimeC // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } @@ -4788,7 +4777,7 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass) // apply runtime class name as per ClusterPolicy if obj.Name == "FILLED_BY_OPERATOR" { - runtimeClassName := getRuntimeClass(&n.singleton.Spec) + runtimeClassName := getRuntimeClassName(&n.singleton.Spec) obj.Name = runtimeClassName obj.Handler = runtimeClassName } From c52d2ec1365ed5a2762ef85561468efc3ed7697d Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 18 Feb 2025 14:50:10 -0800 Subject: [PATCH 3/8] Use native CDI support in runtimes for workload containers when cdi.enabled=true This commit updates the default behavior when cdi.enabled=true. We now leverage native CDI support in containerd / cri-o to inject GPU devices into workload containers. This means we no longer configure 'nvidia' as the default runtime. Our management containers will continue to use the 'nvidia' runtime to access GPUs by explicitly setting runtimeClassName=nvidia in their pod specs. 
Signed-off-by: Christopher Desiniotis --- controllers/object_controls.go | 97 ++++++++++++++++++++-------- controllers/transforms_test.go | 114 ++++++++++++++++++++++++++++++++- 2 files changed, 183 insertions(+), 28 deletions(-) diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 86d8239ed..cf82e8e82 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -178,6 +178,8 @@ const ( // DriverInstallDirCtrPathEnvName is the name of the envvar used by the driver-validator to represent the path // of the driver install dir mounted in the container DriverInstallDirCtrPathEnvName = "DRIVER_INSTALL_DIR_CTR_PATH" + // NvidiaRuntimeSetAsDefaultEnvName is the name of the toolkit container env for configuring NVIDIA Container Runtime as the default runtime + NvidiaRuntimeSetAsDefaultEnvName = "NVIDIA_RUNTIME_SET_AS_DEFAULT" ) // ContainerProbe defines container probe types @@ -1222,8 +1224,38 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } +func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { + // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o + // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The + // 'nvidia' runtime will be set as the runtime class for our management containers so that + // they get access to all GPUs. + // + // Note: one could override this and continue to configure 'nvidia' as the default runtime + // by directly setting the 'NVIDIA_RUNTIME_SET_AS_DEFAULT' environment variable to 'true' in + // the toolkit container. One can leverage the 'toolkit.env' field in ClusterPolicy to + // directly configure environment variables for the toolkit container. 
+ setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") + + if config.CDI.IsDefault() { + setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") + } +} + // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-container-toolkit-ctr" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1234,10 +1266,10 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.Toolkit.ImagePullPolicy) // set image pull secrets if len(config.Toolkit.ImagePullSecrets) > 0 { @@ -1255,16 +1287,12 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeCDIPrefixesEnvName, "nvidia.cdi.k8s.io/") - if config.CDI.IsDefault() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCtrRuntimeModeEnvName, "cdi") - } + transformToolkitCtrForCDI(mainContainer, config) } // 
set install directory for the toolkit if config.Toolkit.InstallDir != "" && config.Toolkit.InstallDir != DefaultToolkitInstallDir { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), ToolkitInstallDirEnvName, config.Toolkit.InstallDir) + setContainerEnv(mainContainer, ToolkitInstallDirEnvName, config.Toolkit.InstallDir) for i, volume := range obj.Spec.Template.Spec.Volumes { if volume.Name == "toolkit-install-dir" { @@ -1273,9 +1301,9 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n } } - for i, volumeMount := range obj.Spec.Template.Spec.Containers[0].VolumeMounts { + for i, volumeMount := range mainContainer.VolumeMounts { if volumeMount.Name == "toolkit-install-dir" { - obj.Spec.Template.Spec.Containers[0].VolumeMounts[i].MountPath = config.Toolkit.InstallDir + mainContainer.VolumeMounts[i].MountPath = config.Toolkit.InstallDir break } } @@ -1292,13 +1320,13 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n if len(config.Toolkit.Env) > 0 { for _, env := range config.Toolkit.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } // configure runtime runtime := n.runtime.String() - err = transformForRuntime(obj, config, runtime, "nvidia-container-toolkit-ctr") + err = transformForRuntime(obj, config, runtime, mainContainerName) if err != nil { return fmt.Errorf("error transforming toolkit daemonset : %w", err) } @@ -1384,8 +1412,30 @@ func transformForRuntime(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, return nil } +func transformDevicePluginCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { + setContainerEnv(container, CDIEnabledEnvName, "true") + setContainerEnv(container, DeviceListStrategyEnvName, "cdi-annotations,cdi-cri") + setContainerEnv(container, CDIAnnotationPrefixEnvName, "cdi.k8s.io/") + + if config.Toolkit.IsEnabled() { + setContainerEnv(container, 
NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) + } +} + // TransformDevicePlugin transforms k8s-device-plugin daemonset with required config as per ClusterPolicy func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { + var mainContainer *corev1.Container + mainContainerName := "nvidia-device-plugin" + for i, ctr := range obj.Spec.Template.Spec.Containers { + if ctr.Name == mainContainerName { + mainContainer = &obj.Spec.Template.Spec.Containers[i] + break + } + } + if mainContainer == nil { + return fmt.Errorf("failed to find main container %q", mainContainerName) + } + // update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1397,10 +1447,10 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe if err != nil { return err } - obj.Spec.Template.Spec.Containers[0].Image = image + mainContainer.Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) + mainContainer.ImagePullPolicy = gpuv1.ImagePullPolicy(config.DevicePlugin.ImagePullPolicy) // set image pull secrets if len(config.DevicePlugin.ImagePullSecrets) > 0 { @@ -1417,13 +1467,13 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe } // set arguments if specified for device-plugin container if len(config.DevicePlugin.Args) > 0 { - obj.Spec.Template.Spec.Containers[0].Args = config.DevicePlugin.Args + mainContainer.Args = config.DevicePlugin.Args } // add env to allow injection of /dev/nvidia-fs and /dev/infiniband devices for GDS if config.GPUDirectStorage != nil && config.GPUDirectStorage.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), GDSEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MOFEDEnabledEnvName, "true") + 
setContainerEnv(mainContainer, GDSEnabledEnvName, "true") + setContainerEnv(mainContainer, MOFEDEnabledEnvName, "true") } // apply plugin configuration through ConfigMap if one is provided @@ -1435,16 +1485,11 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe setRuntimeClassName(&obj.Spec.Template.Spec, config) // update env required for MIG support - applyMIGConfiguration(&(obj.Spec.Template.Spec.Containers[0]), config.MIG.Strategy) + applyMIGConfiguration(mainContainer, config.MIG.Strategy) // update env required for CDI support if config.CDI.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations") - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/") - if config.Toolkit.IsEnabled() { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook")) - } + transformDevicePluginCtrForCDI(mainContainer, config) } // update MPS volumes and set MPS_ROOT env var if a custom MPS root is configured @@ -1458,12 +1503,12 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe obj.Spec.Template.Spec.Volumes[i].HostPath.Path = filepath.Join(config.DevicePlugin.MPS.Root, "shm") } } - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), MPSRootEnvName, config.DevicePlugin.MPS.Root) + setContainerEnv(mainContainer, MPSRootEnvName, config.DevicePlugin.MPS.Root) } if len(config.DevicePlugin.Env) > 0 { for _, env := range config.DevicePlugin.Env { - setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + setContainerEnv(mainContainer, env.Name, env.Value) } } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 794721ede..f0c4451fe 100644 --- 
a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -756,7 +756,7 @@ func TestTransformDevicePlugin(t *testing.T) { { description: "transform device plugin", ds: NewDaemonset(). - WithContainer(corev1.Container{Name: "nvidia-device-plugin-ctr"}). + WithContainer(corev1.Container{Name: "nvidia-device-plugin"}). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ DevicePlugin: gpuv1.DevicePluginSpec{ @@ -772,7 +772,7 @@ func TestTransformDevicePlugin(t *testing.T) { }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ - Name: "nvidia-device-plugin-ctr", + Name: "nvidia-device-plugin", Image: "nvcr.io/nvidia/cloud-native/nvidia-device-plugin:v1.0.0", ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--fail-on-init-error=false"}, @@ -1822,3 +1822,113 @@ func TestTransformDriver(t *testing.T) { }) } } + +func TestTransformToolkitCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "cdi enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + }, + }), + }, + { + description: "cdi enabled and cdi default", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + CDI: gpuv1.CDIConfigSpec{ + Enabled: newBoolPtr(true), + Default: newBoolPtr(true), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, 
Value: "cdi"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + transformToolkitCtrForCDI(mainContainer, tc.cpSpec) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + +func TestTransformDevicePluginCtrForCDI(t *testing.T) { + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "toolkit disabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(false), + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + }, + }), + }, + { + description: "toolkit enabled", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, + }, + expectedDs: NewDaemonset().WithContainer( + corev1.Container{ + Name: "main-ctr", + Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, + }, + }), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + transformDevicePluginCtrForCDI(mainContainer, tc.cpSpec) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} From e63a990340806abf87950fc50dc7b2fca1a056c8 Mon Sep 17 00:00:00 2001 From: Christopher 
Desiniotis Date: Fri, 1 Aug 2025 16:42:56 -0700 Subject: [PATCH 4/8] Make the cdi.default field a no-op Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 19 +++++------------- ...rator-certified.clusterserviceversion.yaml | 14 +++++++++++++ .../manifests/nvidia.com_clusterpolicies.yaml | 10 ++++++---- .../crd/bases/nvidia.com_clusterpolicies.yaml | 10 ++++++---- controllers/object_controls.go | 9 +++------ controllers/transforms_test.go | 20 +------------------ .../crds/nvidia.com_clusterpolicies.yaml | 10 ++++++---- deployments/gpu-operator/values.yaml | 1 - 8 files changed, 41 insertions(+), 52 deletions(-) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 5b9535dba..537be53d9 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1658,20 +1658,20 @@ type VGPUDevicesConfigSpec struct { // CDIConfigSpec defines how the Container Device Interface is used in the cluster. type CDIConfigSpec struct { - // Enabled indicates whether CDI can be used to make GPUs accessible to containers. + // Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional // +kubebuilder:default=false // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as a mechanism for making GPUs accessible to containers" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` - // Default indicates whether to use CDI as the default mechanism for providing GPU access to containers. 
+ // Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional // +kubebuilder:default=false // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Configure CDI as the default mechanism for making GPUs accessible to containers" - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Deprecated: This field is no longer used" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch,urn:alm:descriptor:com.tectonic.ui:hidden" Default *bool `json:"default,omitempty"` } @@ -2075,15 +2075,6 @@ func (c *CDIConfigSpec) IsEnabled() bool { return *c.Enabled } -// IsDefault returns true if CDI is enabled as the default -// mechanism for providing GPU access to containers -func (c *CDIConfigSpec) IsDefault() bool { - if c.Default == nil { - return false - } - return *c.Default -} - // IsEnabled returns true if Kata Manager is enabled func (k *KataManagerSpec) IsEnabled() bool { if k.Enabled == nil { diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index cd0603f6a..0f6297464 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -531,6 +531,20 @@ spec: path: toolkit.imagePullPolicy x-descriptors: - 'urn:alm:descriptor:com.tectonic.ui:imagePullPolicy' + - displayName: CDI + description: Container Device Interface (CDI) Configuration + path: cdi + - displayName: Enabled + description: 'Enabled indicates whether CDI should be 
used as the mechanism for making GPUs accessible to containers.' + path: cdi.enabled + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' + - displayName: Default + description: 'Deprecated: This field is no longer used. Setting cdi.enabled=true will configure CDI as the default mechanism for making GPUs accessible to containers.' + path: cdi.default + x-descriptors: + - 'urn:alm:descriptor:com.tectonic.ui:hidden' + - 'urn:alm:descriptor:com.tectonic.ui:booleanSwitch' - displayName: NVIDIA DCGM config description: NVIDIA DCGM config path: dcgm diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. 
+ description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/controllers/object_controls.go b/controllers/object_controls.go index cf82e8e82..41e6abac4 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -1224,7 +1224,7 @@ func getProxyEnv(proxyConfig *apiconfigv1.Proxy) []corev1.EnvVar { return envVars } -func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.ClusterPolicySpec) { +func transformToolkitCtrForCDI(container *corev1.Container) { // When CDI is enabled in GPU Operator, we leverage native CDI support in containerd / cri-o // to inject GPUs into workloads. We do not configure 'nvidia' as the default runtime. The // 'nvidia' runtime will be set as the runtime class for our management containers so that @@ -1236,10 +1236,7 @@ func transformToolkitCtrForCDI(container *corev1.Container, config *gpuv1.Cluste // directly configure environment variables for the toolkit container. 
setContainerEnv(container, CDIEnabledEnvName, "true") setContainerEnv(container, NvidiaRuntimeSetAsDefaultEnvName, "false") - - if config.CDI.IsDefault() { - setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") - } + setContainerEnv(container, NvidiaCtrRuntimeModeEnvName, "cdi") } // TransformToolkit transforms Nvidia container-toolkit daemonset with required config as per ClusterPolicy @@ -1287,7 +1284,7 @@ func TransformToolkit(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n // update env required for CDI support if config.CDI.IsEnabled() { - transformToolkitCtrForCDI(mainContainer, config) + transformToolkitCtrForCDI(mainContainer) } // set install directory for the toolkit diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index f0c4451fe..c347691e8 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1838,24 +1838,6 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { Enabled: newBoolPtr(true), }, }, - expectedDs: NewDaemonset().WithContainer( - corev1.Container{ - Name: "main-ctr", - Env: []corev1.EnvVar{ - {Name: CDIEnabledEnvName, Value: "true"}, - {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, - }, - }), - }, - { - description: "cdi enabled and cdi default", - ds: NewDaemonset().WithContainer(corev1.Container{Name: "main-ctr"}), - cpSpec: &gpuv1.ClusterPolicySpec{ - CDI: gpuv1.CDIConfigSpec{ - Enabled: newBoolPtr(true), - Default: newBoolPtr(true), - }, - }, expectedDs: NewDaemonset().WithContainer( corev1.Container{ Name: "main-ctr", @@ -1871,7 +1853,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] - transformToolkitCtrForCDI(mainContainer, tc.cpSpec) + transformToolkitCtrForCDI(mainContainer) require.EqualValues(t, tc.expectedDs, tc.ds) }) } diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml 
b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index c032907c6..964c17fdf 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -136,13 +136,15 @@ spec: properties: default: default: false - description: Default indicates whether to use CDI as the default - mechanism for providing GPU access to containers. + description: 'Deprecated: This field is no longer used. Setting + cdi.enabled=true will configure CDI as the default mechanism + for making GPUs accessible to containers.' type: boolean enabled: default: false - description: Enabled indicates whether CDI can be used to make - GPUs accessible to containers. + description: Enabled indicates whether the Container Device Interface + (CDI) should be used as the mechanism for making GPUs accessible + to containers. type: boolean type: object daemonsets: diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 4f457dc42..672034cad 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -14,7 +14,6 @@ psa: cdi: enabled: false - default: false sandboxWorkloads: enabled: false From 67d2e125c9490124212f99133233879c900ff181 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 1 Aug 2025 16:47:04 -0700 Subject: [PATCH 5/8] Set cdi.enabled to true by default Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 4 ++-- ...rator-certified.clusterserviceversion.yaml | 3 +++ .../manifests/nvidia.com_clusterpolicies.yaml | 2 +- .../crd/bases/nvidia.com_clusterpolicies.yaml | 2 +- controllers/transforms_test.go | 20 +++++++++++++++++++ .../crds/nvidia.com_clusterpolicies.yaml | 2 +- deployments/gpu-operator/values.yaml | 2 +- 7 files changed, 29 insertions(+), 6 deletions(-) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 537be53d9..42aaf4782 100644 --- 
a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -1660,7 +1660,7 @@ type VGPUDevicesConfigSpec struct { type CDIConfigSpec struct { // Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. // +kubebuilder:validation:Optional - // +kubebuilder:default=false + // +kubebuilder:default=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable CDI as the mechanism for making GPUs accessible to containers" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" @@ -2070,7 +2070,7 @@ func (l *DriverLicensingConfigSpec) IsNLSEnabled() bool { // providing GPU access to containers func (c *CDIConfigSpec) IsEnabled() bool { if c.Enabled == nil { - return false + return true } return *c.Enabled } diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 0f6297464..238756fca 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -34,6 +34,9 @@ metadata: "initContainer": { } }, + "cdi": { + "enabled": true + }, "sandboxWorkloads": { "enabled": false, "defaultWorkload": "container" diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' 
type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index c347691e8..8c0b1455a 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -643,6 +643,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "foo", Value: "bar"}, {Name: "RUNTIME", Value: "containerd"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -713,6 +716,9 @@ func TestTransformToolkit(t *testing.T) { }, }, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaRuntimeSetAsDefaultEnvName, Value: "false"}, + {Name: NvidiaCtrRuntimeModeEnvName, Value: "cdi"}, {Name: "CONTAINERD_CONFIG", Value: "/runtime/config-dir/config.toml"}, {Name: "CONTAINERD_SOCKET", Value: "/runtime/sock-dir/containerd.sock"}, {Name: "CONTAINERD_RUNTIME_CLASS", Value: "nvidia"}, @@ -770,6 +776,10 @@ func TestTransformDevicePlugin(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ 
Name: "nvidia-device-plugin", @@ -778,6 +788,10 @@ func TestTransformDevicePlugin(t *testing.T) { Args: []string{"--fail-on-init-error=false"}, Env: []corev1.EnvVar{ {Name: "NVIDIA_MIG_MONITOR_DEVICES", Value: "all"}, + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: DeviceListStrategyEnvName, Value: "cdi-annotations,cdi-cri"}, + {Name: CDIAnnotationPrefixEnvName, Value: "cdi.k8s.io/"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithContainer(corev1.Container{Name: "dummy"}).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), @@ -867,6 +881,10 @@ func TestTransformMigManager(t *testing.T) { {Name: "foo", Value: "bar"}, }, }, + Toolkit: gpuv1.ToolkitSpec{ + Enabled: newBoolPtr(true), + InstallDir: "/path/to/install", + }, }, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "mig-manager", @@ -874,6 +892,8 @@ func TestTransformMigManager(t *testing.T) { ImagePullPolicy: corev1.PullIfNotPresent, Args: []string{"--test-flag"}, Env: []corev1.EnvVar{ + {Name: CDIEnabledEnvName, Value: "true"}, + {Name: NvidiaCDIHookPathEnvName, Value: "/path/to/install/toolkit/nvidia-cdi-hook"}, {Name: "foo", Value: "bar"}, }, }).WithPullSecret("pull-secret").WithRuntimeClassName("nvidia"), diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 964c17fdf..0ebfa1b58 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -141,7 +141,7 @@ spec: for making GPUs accessible to containers.' type: boolean enabled: - default: false + default: true description: Enabled indicates whether the Container Device Interface (CDI) should be used as the mechanism for making GPUs accessible to containers. 
diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 672034cad..f9c01b1fd 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -13,7 +13,7 @@ psa: enabled: false cdi: - enabled: false + enabled: true sandboxWorkloads: enabled: false From 02c3e6c2d11669ae1f2c8c13279082e034b78f8f Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Tue, 5 Aug 2025 13:36:33 -0700 Subject: [PATCH 6/8] [ci] Bump containerd version to 1.7.27 in holodeck config Containerd 1.7.0 is the first release that supports CDI. CDI devices were exclusively passed to containerd via annotations. https://github.com/containerd/containerd/releases/tag/v1.7.0 In containerd 1.7.2 support was added for making use of the new CRI fields 'Config.CDIDevices' so that CDI devices could be passed through the CRI instead of annotations. https://github.com/containerd/containerd/releases/tag/v1.7.2 This commit updates our holodeck configuration to a version of containerd that supports CDI and the new CRI fields. containerd 1.7.27 is chosen since it is the latest release on the 1.7 branch that is available in the Ubuntu 22.04 repositories. Signed-off-by: Christopher Desiniotis --- tests/holodeck.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/holodeck.yaml b/tests/holodeck.yaml index e30131457..ce4d96568 100644 --- a/tests/holodeck.yaml +++ b/tests/holodeck.yaml @@ -27,6 +27,7 @@ spec: containerRuntime: install: true name: containerd + version: 1.7.27 kubernetes: install: true installer: kubeadm From d2d8d5db0bcf5514f563897fadc416990bbe72e5 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Fri, 29 Aug 2025 15:17:45 -0700 Subject: [PATCH 7/8] Add ld.so.cache file to the gpu-operator container image The distroless image does not contain the ld.so.cache file. The update-ldcache createContainer hook will skip updating the container's ldcache if this file does not exist. 
This can lead to issues running the CUDA vectorAdd sample if the NVIDIA libraries are not present in the default dynamic linker search path(s). Signed-off-by: Christopher Desiniotis --- docker/Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 47805845e..476f771f9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -121,6 +121,13 @@ COPY --from=sample-builder /build/vectorAdd /usr/bin/vectorAdd # Once new sample images are published that contain the compat libs, we can update the below. COPY --from=builder /usr/local/cuda/compat /usr/local/cuda/compat +# The distroless image does not contain the ld.so.cache file. The update-ldcache +# createContainer hook will skip updating the container's ldcache if this file +# does not exist. This can lead to issues running the CUDA vectorAdd sample +# if the NVIDIA libraries are not present in the default dynamic linker search +# path(s). +COPY --from=sample-builder /etc/ld.so.cache /etc/ + COPY assets /opt/gpu-operator/ COPY manifests /opt/gpu-operator/manifests COPY validator/manifests /opt/validator/manifests From ea036dbf3e11c4cfa8004d8054a92d46b038464f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 17 Sep 2025 16:21:12 +0200 Subject: [PATCH 8/8] [no-relnote] Address lint comments Signed-off-by: Evan Lezar --- controllers/transforms_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 8c0b1455a..0afebe039 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1872,7 +1872,7 @@ func TestTransformToolkitCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] transformToolkitCtrForCDI(mainContainer) require.EqualValues(t, tc.expectedDs, tc.ds) }) @@ -1928,7 
+1928,7 @@ func TestTransformDevicePluginCtrForCDI(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - mainContainer := &tc.ds.DaemonSet.Spec.Template.Spec.Containers[0] + mainContainer := &tc.ds.Spec.Template.Spec.Containers[0] transformDevicePluginCtrForCDI(mainContainer, tc.cpSpec) require.EqualValues(t, tc.expectedDs, tc.ds) })