From 0c6cb431a6cc6bd9a3298cfddd2f829250e53d7b Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Mon, 21 Oct 2024 10:52:59 +0000 Subject: [PATCH 1/5] Move setting of NVIDIA_VISIBLE_DEVICES=void to standard CDI spec Signed-off-by: Kevin Klues --- cmd/nvidia-dra-plugin/cdi.go | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cmd/nvidia-dra-plugin/cdi.go b/cmd/nvidia-dra-plugin/cdi.go index 25d93a8ac..e76fc1811 100644 --- a/cmd/nvidia-dra-plugin/cdi.go +++ b/cmd/nvidia-dra-plugin/cdi.go @@ -172,6 +172,13 @@ func (cdi *CDIHandler) CreateStandardDeviceSpecFile(allocatable AllocatableDevic return fmt.Errorf("failed to get common CDI spec edits: %w", err) } + // Make sure that NVIDIA_VISIBLE_DEVICES is set to void to avoid the + // nvidia-container-runtime honoring it in addition to the underlying + // runtime honoring CDI. + commonEdits.ContainerEdits.Env = append( + commonEdits.ContainerEdits.Env, + "NVIDIA_VISIBLE_DEVICES=void") + // Generate device specs for all full GPUs and MIG devices. var deviceSpecs []cdispec.Device for _, device := range allocatable { @@ -223,25 +230,16 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, preparedDevices Prep // Generate claim specific specs for each device. var deviceSpecs []cdispec.Device for _, group := range preparedDevices { - // Include this per-device, rather than as a top-level edit so that - // each device spec is never empty and the spec file gets created - // without error. - claimDeviceEdits := cdiapi.ContainerEdits{ - ContainerEdits: &cdispec.ContainerEdits{ - Env: []string{ - "NVIDIA_VISIBLE_DEVICES=void", - }, - }, + // If there are no edits passed back as part of the device config state, skip it + if group.ConfigState.containerEdits == nil { + continue } - // Apply any edits passed back as part of the device config state. - claimDeviceEdits.Append(group.ConfigState.containerEdits) - - // Apply edits to all devices. 
+ // Apply any edits passed back as part of the device config state to all devices for _, device := range group.Devices { deviceSpec := cdispec.Device{ Name: fmt.Sprintf("%s-%s", claimUID, device.CanonicalName()), - ContainerEdits: *claimDeviceEdits.ContainerEdits, + ContainerEdits: *group.ConfigState.containerEdits.ContainerEdits, } deviceSpecs = append(deviceSpecs, deviceSpec) From 8bc42a5f1e410f94874716e628e0851f2fc9b847 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sun, 20 Oct 2024 18:11:41 +0000 Subject: [PATCH 2/5] Fix (minor) bug where deployment=nvidia and namespace=nvidia-dra-driver Signed-off-by: Kevin Klues --- demo/clusters/kind/install-dra-driver.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh index fc2aecc4b..d826cac70 100755 --- a/demo/clusters/kind/install-dra-driver.sh +++ b/demo/clusters/kind/install-dra-driver.sh @@ -25,12 +25,12 @@ source "${CURRENT_DIR}/scripts/common.sh" kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/dra.kubelet-plugin=true kubectl label node -l node-role.x-k8s.io/control-plane --overwrite nvidia.com/dra.controller=true -helm upgrade -i --create-namespace --namespace nvidia-dra-driver nvidia ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ +helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ ${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \ --wait set +x printf '\033[0;32m' echo "Driver installation complete:" -kubectl get pod -n nvidia-dra-driver +kubectl get pod -n nvidia printf '\033[0m' From 9ec8fd0e52569019e711c4f8ee4f1c3397beafdd Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Mon, 21 Oct 2024 13:42:38 +0000 Subject: [PATCH 3/5] Explicitly call plugin.Stop() on kubelet plugin shutdown Signed-off-by: Kevin Klues --- cmd/nvidia-dra-plugin/driver.go | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/cmd/nvidia-dra-plugin/driver.go b/cmd/nvidia-dra-plugin/driver.go index c7ca51330..da8654042 100644 --- a/cmd/nvidia-dra-plugin/driver.go +++ b/cmd/nvidia-dra-plugin/driver.go @@ -30,7 +30,6 @@ import ( type driver struct { sync.Mutex - doneCh chan struct{} client coreclientset.Interface plugin kubeletplugin.DRAPlugin state *DeviceState @@ -79,7 +78,7 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) { } func (d *driver) Shutdown(ctx context.Context) error { - close(d.doneCh) + d.plugin.Stop() return nil } From 9b16df126cdda3416d263e0a28b89df0a6ebf2a1 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sun, 20 Oct 2024 18:36:11 +0000 Subject: [PATCH 4/5] Fix nodeSelectors/affinities for control-plane/gpu-plugin helm charts Signed-off-by: Kevin Klues --- demo/clusters/kind/install-dra-driver.sh | 3 --- deployments/helm/k8s-dra-driver/values.yaml | 29 +++++++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh index d826cac70..2039b251e 100755 --- a/demo/clusters/kind/install-dra-driver.sh +++ b/demo/clusters/kind/install-dra-driver.sh @@ -22,9 +22,6 @@ set -o pipefail source "${CURRENT_DIR}/scripts/common.sh" -kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/dra.kubelet-plugin=true -kubectl label node -l node-role.x-k8s.io/control-plane --overwrite nvidia.com/dra.controller=true - helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ ${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \ --wait diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml index 512cdd7c9..aeb0e01de 100644 --- a/deployments/helm/k8s-dra-driver/values.yaml +++ b/deployments/helm/k8s-dra-driver/values.yaml @@ -54,8 +54,7 @@ controller: priorityClassName: "system-node-critical" podAnnotations: 
{} podSecurityContext: {} - nodeSelector: - nvidia.com/dra.controller: "true" + nodeSelector: {} tolerations: - key: node-role.kubernetes.io/master operator: Exists @@ -75,10 +74,8 @@ kubeletPlugin: type: RollingUpdate podAnnotations: {} podSecurityContext: {} - nodeSelector: - nvidia.com/dra.kubelet-plugin: "true" + nodeSelector: {} tolerations: [] - affinity: {} containers: init: securityContext: {} @@ -87,3 +84,25 @@ kubeletPlugin: securityContext: privileged: true resources: {} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # On discrete-GPU based systems NFD adds the following label where 10de is the NVIDIA PCI vendor ID + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - "true" + - matchExpressions: + # On some Tegra-based systems NFD detects the CPU vendor ID as NVIDIA + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - "NVIDIA" + - matchExpressions: + # We allow a GPU deployment to be forced by setting the following label to "true" + - key: "nvidia.com/gpu.present" + operator: In + values: + - "true" From e8e3e43555d3950d248f5b2185f93632a7910f4d Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Sun, 20 Oct 2024 21:47:10 +0000 Subject: [PATCH 5/5] Allow selection of device classes managed by driver Signed-off-by: Kevin Klues --- cmd/nvidia-dra-controller/main.go | 18 ++++- cmd/nvidia-dra-controller/types.go | 24 +++++++ cmd/nvidia-dra-plugin/device_state.go | 2 +- cmd/nvidia-dra-plugin/driver.go | 6 ++ cmd/nvidia-dra-plugin/main.go | 10 +++ cmd/nvidia-dra-plugin/nvlib.go | 68 ++++++++++++++----- cmd/nvidia-dra-plugin/types.go | 2 +- demo/clusters/kind/install-dra-driver.sh | 2 + .../k8s-dra-driver/templates/_helpers.tpl | 32 +++++++++ .../k8s-dra-driver/templates/controller.yaml | 19 ++++++ .../templates/deviceclass-gpu.yaml | 2 + .../templates/deviceclass-imex.yaml | 2 + .../templates/deviceclass-mig.yaml | 2 + 
.../templates/kubeletplugin.yaml | 2 + .../k8s-dra-driver/templates/validation.yaml | 63 +++++++++++++++++ .../k8s-dra-driver/templates/validation.yml | 17 ----- deployments/helm/k8s-dra-driver/values.yaml | 2 + 17 files changed, 235 insertions(+), 38 deletions(-) create mode 100644 cmd/nvidia-dra-controller/types.go create mode 100644 deployments/helm/k8s-dra-driver/templates/validation.yaml delete mode 100644 deployments/helm/k8s-dra-driver/templates/validation.yml diff --git a/cmd/nvidia-dra-controller/main.go b/cmd/nvidia-dra-controller/main.go index e11b3dc71..d4c78e8c8 100644 --- a/cmd/nvidia-dra-controller/main.go +++ b/cmd/nvidia-dra-controller/main.go @@ -28,6 +28,7 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/urfave/cli/v2" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/component-base/metrics/legacyregistry" "k8s.io/klog/v2" @@ -49,6 +50,8 @@ type Flags struct { httpEndpoint string metricsPath string profilePath string + + deviceClasses sets.Set[string] } type Config struct { @@ -105,6 +108,12 @@ func newApp() *cli.App { Destination: &flags.profilePath, EnvVars: []string{"PPROF_PATH"}, }, + &cli.StringSliceFlag{ + Name: "device-classes", + Usage: "The supported set of DRA device classes", + Value: cli.NewStringSlice(GpuDeviceType, MigDeviceType, ImexChannelType), + EnvVars: []string{"DEVICE_CLASSES"}, + }, } cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...) @@ -125,6 +134,7 @@ func newApp() *cli.App { Action: func(c *cli.Context) error { ctx := c.Context mux := http.NewServeMux() + flags.deviceClasses = sets.New[string](c.StringSlice("device-classes")...) 
clientSets, err := flags.kubeClientConfig.NewClientSets() if err != nil { @@ -144,9 +154,11 @@ func newApp() *cli.App { } } - err = StartIMEXManager(ctx, config) - if err != nil { - return fmt.Errorf("start IMEX manager: %w", err) + if flags.deviceClasses.Has(ImexChannelType) { + err = StartIMEXManager(ctx, config) + if err != nil { + return fmt.Errorf("start IMEX manager: %w", err) + } } <-ctx.Done() diff --git a/cmd/nvidia-dra-controller/types.go b/cmd/nvidia-dra-controller/types.go new file mode 100644 index 000000000..48ad32a09 --- /dev/null +++ b/cmd/nvidia-dra-controller/types.go @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +const ( + GpuDeviceType = "gpu" + MigDeviceType = "mig" + ImexChannelType = "imex" + UnknownDeviceType = "unknown" +) diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go index 11f42afd2..33499e9c3 100644 --- a/cmd/nvidia-dra-plugin/device_state.go +++ b/cmd/nvidia-dra-plugin/device_state.go @@ -61,7 +61,7 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { return nil, fmt.Errorf("failed to create device library: %w", err) } - allocatable, err := nvdevlib.enumerateAllPossibleDevices() + allocatable, err := nvdevlib.enumerateAllPossibleDevices(config) if err != nil { return nil, fmt.Errorf("error enumerating all possible devices: %w", err) } diff --git a/cmd/nvidia-dra-plugin/driver.go b/cmd/nvidia-dra-plugin/driver.go index da8654042..d3c5bed44 100644 --- a/cmd/nvidia-dra-plugin/driver.go +++ b/cmd/nvidia-dra-plugin/driver.go @@ -60,6 +60,12 @@ func NewDriver(ctx context.Context, config *Config) (*driver, error) { } driver.plugin = plugin + // If not responsible for advertising GPUs or MIG devices, we are done + if !(config.flags.deviceClasses.Has(GpuDeviceType) || config.flags.deviceClasses.Has(MigDeviceType)) { + return driver, nil + } + + // Otherwise, enumerate the set of GPU and MIG devices and publish them var resources kubeletplugin.Resources for _, device := range state.allocatable { // Explicitly exclude IMEX channels from being advertised here. 
They diff --git a/cmd/nvidia-dra-plugin/main.go b/cmd/nvidia-dra-plugin/main.go index 0e04402f1..2fbdc07a3 100644 --- a/cmd/nvidia-dra-plugin/main.go +++ b/cmd/nvidia-dra-plugin/main.go @@ -25,6 +25,7 @@ import ( "github.com/urfave/cli/v2" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" "github.com/NVIDIA/k8s-dra-driver/internal/info" @@ -50,6 +51,7 @@ type Flags struct { containerDriverRoot string hostDriverRoot string nvidiaCTKPath string + deviceClasses sets.Set[string] } type Config struct { @@ -112,6 +114,12 @@ func newApp() *cli.App { Destination: &flags.nvidiaCTKPath, EnvVars: []string{"NVIDIA_CTK_PATH"}, }, + &cli.StringSliceFlag{ + Name: "device-classes", + Usage: "The supported set of DRA device classes", + Value: cli.NewStringSlice(GpuDeviceType, MigDeviceType, ImexChannelType), + EnvVars: []string{"DEVICE_CLASSES"}, + }, } cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...) cliFlags = append(cliFlags, flags.loggingConfig.Flags()...) @@ -130,6 +138,8 @@ func newApp() *cli.App { }, Action: func(c *cli.Context) error { ctx := c.Context + flags.deviceClasses = sets.New[string](c.StringSlice("device-classes")...) 
+ clientSets, err := flags.kubeClientConfig.NewClientSets() if err != nil { return fmt.Errorf("create client: %w", err) } diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go index 71eae0514..c35bbbab0 100644 --- a/cmd/nvidia-dra-plugin/nvlib.go +++ b/cmd/nvidia-dra-plugin/nvlib.go @@ -108,36 +108,66 @@ func (l deviceLib) alwaysShutdown() { } } -func (l deviceLib) enumerateAllPossibleDevices() (AllocatableDevices, error) { +func (l deviceLib) enumerateAllPossibleDevices(config *Config) (AllocatableDevices, error) { + alldevices := make(AllocatableDevices) + deviceClasses := config.flags.deviceClasses + + if deviceClasses.Has(GpuDeviceType) || deviceClasses.Has(MigDeviceType) { + gms, err := l.enumerateGpusAndMigDevices(config) + if err != nil { + return nil, fmt.Errorf("error enumerating GPU and MIG devices: %w", err) + } + for k, v := range gms { + alldevices[k] = v + } + } + + if deviceClasses.Has(ImexChannelType) { + imex, err := l.enumerateImexChannels(config) + if err != nil { + return nil, fmt.Errorf("error enumerating IMEX devices: %w", err) + } + for k, v := range imex { + alldevices[k] = v + } + } + + return alldevices, nil +} + +func (l deviceLib) enumerateGpusAndMigDevices(config *Config) (AllocatableDevices, error) { if err := l.Init(); err != nil { return nil, err } defer l.alwaysShutdown() - alldevices := make(AllocatableDevices) + devices := make(AllocatableDevices) + deviceClasses := config.flags.deviceClasses err := l.VisitDevices(func(i int, d nvdev.Device) error { gpuInfo, err := l.getGpuInfo(i, d) if err != nil { return fmt.Errorf("error getting info for GPU %d: %w", i, err) } - migs, err := l.getMigDevices(gpuInfo) - if err != nil { - return fmt.Errorf("error getting MIG devices for GPU %d: %w", i, err) - } - - for _, migDeviceInfo := range migs { + if deviceClasses.Has(GpuDeviceType) && !gpuInfo.migEnabled { deviceInfo := &AllocatableDevice{ - Mig: migDeviceInfo, + Gpu: gpuInfo, } - alldevices[migDeviceInfo.CanonicalName()] = 
deviceInfo + devices[gpuInfo.CanonicalName()] = deviceInfo } - if !gpuInfo.migEnabled && len(migs) == 0 { - deviceInfo := &AllocatableDevice{ - Gpu: gpuInfo, + if deviceClasses.Has(MigDeviceType) { + migs, err := l.getMigDevices(gpuInfo) + if err != nil { + return fmt.Errorf("error getting MIG devices for GPU %d: %w", i, err) + } + + for _, migDeviceInfo := range migs { + deviceInfo := &AllocatableDevice{ + Mig: migDeviceInfo, + } + devices[migDeviceInfo.CanonicalName()] = deviceInfo } - alldevices[gpuInfo.CanonicalName()] = deviceInfo } return nil @@ -146,6 +176,12 @@ func (l deviceLib) enumerateAllPossibleDevices() (AllocatableDevices, error) { return nil, fmt.Errorf("error visiting devices: %w", err) } + return devices, nil +} + +func (l deviceLib) enumerateImexChannels(config *Config) (AllocatableDevices, error) { + devices := make(AllocatableDevices) + imexChannelCount, err := l.getImexChannelCount() if err != nil { return nil, fmt.Errorf("error getting IMEX channel count: %w", err) @@ -157,10 +193,10 @@ func (l deviceLib) enumerateAllPossibleDevices() (AllocatableDevices, error) { deviceInfo := &AllocatableDevice{ ImexChannel: imexChannelInfo, } - alldevices[imexChannelInfo.CanonicalName()] = deviceInfo + devices[imexChannelInfo.CanonicalName()] = deviceInfo } - return alldevices, nil + return devices, nil } func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) { diff --git a/cmd/nvidia-dra-plugin/types.go b/cmd/nvidia-dra-plugin/types.go index 2ca4a1ad8..df20c54fe 100644 --- a/cmd/nvidia-dra-plugin/types.go +++ b/cmd/nvidia-dra-plugin/types.go @@ -19,7 +19,7 @@ package main const ( GpuDeviceType = "gpu" MigDeviceType = "mig" - ImexChannelType = "imex-channel" + ImexChannelType = "imex" UnknownDeviceType = "unknown" ) diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh index 2039b251e..4a2533d34 100755 --- a/demo/clusters/kind/install-dra-driver.sh +++ 
b/demo/clusters/kind/install-dra-driver.sh @@ -22,7 +22,9 @@ set -o pipefail source "${CURRENT_DIR}/scripts/common.sh" +deviceClasses=${1:-"gpu,mig,imex"} helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ + --set deviceClasses="{${deviceClasses}}" \ ${NVIDIA_DRIVER_ROOT:+--set nvidiaDriverRoot=${NVIDIA_DRIVER_ROOT}} \ --wait diff --git a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl index 4c1d0c2cf..7cf4ea012 100644 --- a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl +++ b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl @@ -95,3 +95,35 @@ Create the name of the service account to use {{- default "default" .Values.serviceAccount.name }} {{- end }} {{- end }} + +{{/* +Check for the existence of an element in a list +*/}} +{{- define "k8s-dra-driver.listHas" -}} + {{- $listToCheck := index . 0 }} + {{- $valueToCheck := index . 1 }} + + {{- $found := "" -}} + {{- range $listToCheck}} + {{- if eq . $valueToCheck }} + {{- $found = "true" -}} + {{- end }} + {{- end }} + {{- $found -}} +{{- end }} + +{{/* +Filter a list by a set of valid values +*/}} +{{- define "k8s-dra-driver.filterList" -}} + {{- $listToFilter := index . 0 }} + {{- $validValues := index . 1 }} + + {{- $result := list -}} + {{- range $validValues}} + {{- if include "k8s-dra-driver.listHas" (list $listToFilter .) }} + {{- $result = append $result . 
}} + {{- end }} + {{- end }} + {{- $result -}} +{{- end -}} diff --git a/deployments/helm/k8s-dra-driver/templates/controller.yaml b/deployments/helm/k8s-dra-driver/templates/controller.yaml index b8f77a291..440526831 100644 --- a/deployments/helm/k8s-dra-driver/templates/controller.yaml +++ b/deployments/helm/k8s-dra-driver/templates/controller.yaml @@ -1,3 +1,19 @@ +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if (include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "imex")) }} +{{- $deviceClasses := include "k8s-dra-driver.filterList" (list $.Values.deviceClasses (list "imex")) }} --- apiVersion: apps/v1 kind: Deployment @@ -40,6 +56,8 @@ spec: resources: {{- toYaml .Values.controller.containers.controller.resources | nindent 10 }} env: + - name: DEVICE_CLASSES + value: {{ .Values.deviceClasses | join "," }} - name: POD_NAME valueFrom: fieldRef: @@ -60,3 +78,4 @@ spec: tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-gpu.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-gpu.yaml index ffd1382eb..fa1c35107 100644 --- a/deployments/helm/k8s-dra-driver/templates/deviceclass-gpu.yaml +++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-gpu.yaml @@ -1,3 +1,4 @@ +{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "gpu") }} --- apiVersion: resource.k8s.io/v1alpha3 kind: DeviceClass @@ -7,3 +8,4 @@ spec: selectors: - cel: expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-imex.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-imex.yaml index c076e112c..9d4446b8f 100644 --- a/deployments/helm/k8s-dra-driver/templates/deviceclass-imex.yaml +++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-imex.yaml @@ -1,3 +1,4 @@ +{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "imex") }} --- apiVersion: resource.k8s.io/v1alpha3 kind: DeviceClass @@ -7,3 +8,4 @@ spec: selectors: - cel: expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'" +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-mig.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-mig.yaml index 00d1c2312..66dbe7ed1 100644 --- a/deployments/helm/k8s-dra-driver/templates/deviceclass-mig.yaml +++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-mig.yaml @@ -1,3 +1,4 @@ +{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "mig") }} --- apiVersion: resource.k8s.io/v1alpha3 kind: DeviceClass @@ -7,3 +8,4 @@ spec: selectors: - cel: expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'" +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml 
b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml index 582d2f8c1..c189cca02 100644 --- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml +++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml @@ -75,6 +75,8 @@ spec: value: /var/run/cdi - name: NVIDIA_MIG_CONFIG_DEVICES value: all + - name: DEVICE_CLASSES + value: {{ .Values.deviceClasses | join "," }} - name: NODE_NAME valueFrom: fieldRef: diff --git a/deployments/helm/k8s-dra-driver/templates/validation.yaml b/deployments/helm/k8s-dra-driver/templates/validation.yaml new file mode 100644 index 000000000..ce2dbe689 --- /dev/null +++ b/deployments/helm/k8s-dra-driver/templates/validation.yaml @@ -0,0 +1,63 @@ +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- $validDeviceClasses := list "gpu" "mig" "imex" }} + +{{- if not (kindIs "slice" .Values.deviceClasses) }} +{{- $error := "" }} +{{- $error = printf "%s\nValue 'deviceClasses' must be a list: %v" $error .Values.deviceClasses }} +{{- $error = printf "%s\nValid device classes are: %v" $error $validDeviceClasses }} +{{- fail $error }} +{{- end }} + +{{- if eq (len .Values.deviceClasses) 0 }} +{{- $error := "" }} +{{- $error = printf "%s\nAt least one 'deviceClass' must be specified." $error }} +{{- $error = printf "%s\nValid device classes are: %v" $error $validDeviceClasses }} +{{- fail $error }} +{{- end }} + +{{- range .Values.deviceClasses }} + {{- $deviceClass := . 
}} + {{- $found := false }} + {{- range $validDeviceClasses }} + {{- if eq . $deviceClass }} + {{- $found = true }} + {{- end }} + {{- end }} + {{- if not $found }} + {{- $error := "" }} + {{- $error = printf "%s\nInvalid value in 'deviceClasses': %s" $error $deviceClass }} + {{- $error = printf "%s\nValid device classes are: %v" $error $validDeviceClasses }} + {{- fail $error }} + {{- end }} +{{- end }} + +{{- if .Values.namespace }} +{{- $error := "" }} +{{- $error = printf "%s\nValue 'namespace' set to %s" $error .Values.namespace }} +{{- $error = printf "%s\nSetting an explicit 'namespace' in values.yaml or via --set on the command line is no longer supported." $error }} +{{- $error = printf "%s\nUse --namespace (with --create-namespace as necessary) instead." $error }} +{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} +{{- fail $error }} +{{- end }} + +{{- if and (eq (include "k8s-dra-driver.namespace" .) "default") ( eq .Values.namespaceOverride "") (not .Values.allowDefaultNamespace) }} +{{- $error := "" }} +{{- $error = printf "%s\nRunning in the 'default' namespace is not recommended." $error }} +{{- $error = printf "%s\nSet 'allowDefaultNamespace=true' to bypass this error." $error }} +{{- $error = printf "%s\nOtherwise, use --namespace (with --create-namespace as necessary) to run in a specific namespace." 
$error }} +{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} +{{- fail $error }} +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/validation.yml b/deployments/helm/k8s-dra-driver/templates/validation.yml deleted file mode 100644 index a87325621..000000000 --- a/deployments/helm/k8s-dra-driver/templates/validation.yml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.namespace }} -{{- $error := "" }} -{{- $error = printf "%s\nValue 'namespace' set to %s" $error .Values.namespace }} -{{- $error = printf "%s\nSetting an explicit 'namespace' in values.yaml or via --set on the command line is no longer supported." $error }} -{{- $error = printf "%s\nUse --namespace (with --create-namespace as necessary) instead." $error }} -{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} -{{- fail $error }} -{{- end }} - -{{- if and (eq (include "k8s-dra-driver.namespace" .) "default") ( eq .Values.namespaceOverride "") (not .Values.allowDefaultNamespace) }} -{{- $error := "" }} -{{- $error = printf "%s\nRunning in the 'default' namespace is not recommended." $error }} -{{- $error = printf "%s\nSet 'allowDefaultNamespace=true' to bypass this error." $error }} -{{- $error = printf "%s\nOtherwise, use --namespace (with --create-namespace as necessary) to run in a specific namespace." $error }} -{{- $error = printf "%s\nSee: https://helm.sh/docs/helm/helm_install/#options" $error }} -{{- fail $error }} -{{- end }} diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml index aeb0e01de..09ce8d3be 100644 --- a/deployments/helm/k8s-dra-driver/values.yaml +++ b/deployments/helm/k8s-dra-driver/values.yaml @@ -34,6 +34,8 @@ selectorLabelsOverride: {} allowDefaultNamespace: false +deviceClasses: ["gpu", "mig", "imex"] + imagePullSecrets: [] image: repository: nvcr.io/nvidia/cloud-native/k8s-dra-driver