Skip to content

Commit 7132fe1

Browse files
authored
Merge pull request #1419 from NVIDIA/cpicks1
[release-25.3] Cherrypick commits
2 parents b2df634 + c8acd8b commit 7132fe1

File tree

21 files changed

+192
-45
lines changed

21 files changed

+192
-45
lines changed

.nvidia-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ sign:ngc-gpu-operator-validator:
237237
OPERATOR_IMAGE: "${STAGING_REGISTRY}/gpu-operator"
238238
VALIDATOR_VERSION: "${CI_COMMIT_SHORT_SHA}"
239239
VALIDATOR_IMAGE: "${STAGING_REGISTRY}/gpu-operator-validator"
240-
TARGET_DRIVER_VERSION: "550.144.03"
240+
TARGET_DRIVER_VERSION: "550.163.01"
241241

242242
.e2e_tests:
243243
extends:

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,11 @@ type DCGMExporterSpec struct {
910910
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
911911
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
912912
ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
913+
914+
// Optional: Service configuration for NVIDIA DCGM Exporter
915+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
916+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Service configuration for NVIDIA DCGM Exporter"
917+
ServiceSpec *DCGMExporterServiceConfig `json:"service,omitempty"`
913918
}
914919

915920
// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -922,6 +927,21 @@ type DCGMExporterMetricsConfig struct {
922927
Name string `json:"name,omitempty"`
923928
}
924929

930+
// DCGMExporterServiceConfig defines the configuration options for the Kubernetes Service deployed for DCGM Exporter
//
// Both fields are optional and are omitted from the serialized form when unset
// (json "omitempty"). TransformDCGMExporterService in controllers/object_controls.go
// copies a non-empty Type and a non-nil InternalTrafficPolicy onto the Service spec;
// unset fields leave the Service spec unchanged.
931+
type DCGMExporterServiceConfig struct {
932+
// Type represents the ServiceType which describes ingress methods for a service
933+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
934+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceType for the DCGM Exporter K8s Service"
935+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
936+
Type corev1.ServiceType `json:"type,omitempty"`
937+
938+
// InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP.
939+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
940+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Internal Traffic Policy for the DCGM Exporter K8s Service"
941+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
942+
InternalTrafficPolicy *corev1.ServiceInternalTrafficPolicy `json:"internalTrafficPolicy,omitempty"`
943+
}
944+
925945
// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
926946
// deployed for DCGM Exporter
927947
type DCGMExporterServiceMonitorConfig struct {

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ metadata:
161161
"driverType": "gpu",
162162
"repository": "nvcr.io/nvidia",
163163
"image": "driver",
164-
"version": "sha256:b93f1aab82ceb081919f58e13a66142c5857a0a5e68e13661a3e74696434c1d1",
164+
"version": "sha256:f581bc75cd8aece5dd9fb93be17e1e91d0d533a01af5a1880cf962eca7890662",
165165
"nodeSelector": {},
166166
"manager": {},
167167
"repoConfig": {
@@ -203,21 +203,21 @@ spec:
203203
- name: gpu-operator-image
204204
image: ghcr.io/nvidia/gpu-operator:main-latest
205205
- name: dcgm-exporter-image
206-
image: nvcr.io/nvidia/k8s/dcgm-exporter@sha256:624b1014b03abbd04a20664c5ced46685185110b41d5dfdcf63ede71c62d4225
206+
image: nvcr.io/nvidia/k8s/dcgm-exporter@sha256:b848747435dfecb216484d16363a9897f64232b3c3ae7f171dde06525d8606b4
207207
- name: dcgm-image
208-
image: nvcr.io/nvidia/cloud-native/dcgm@sha256:7d4afc8530a983a1f036dce6654603b5b6883f2214aa102118fb4491880e21d7
208+
image: nvcr.io/nvidia/cloud-native/dcgm@sha256:ec473ac9f8e4f638e97ec5ffd6f6d3dbbfc3a53bdd07514745c8868676979bba
209209
- name: container-toolkit-image
210-
image: nvcr.io/nvidia/k8s/container-toolkit@sha256:0c97336dbab95ef1f412b9cd6a4063809b8b578fad6ee795c0b19143d1c7463f
210+
image: nvcr.io/nvidia/k8s/container-toolkit@sha256:9f82c554a34dc612c5b02a3583c02eed4c0fd04bdfe5015cf8d457a80a3d7a4b
211211
- name: driver-image
212-
image: nvcr.io/nvidia/driver@sha256:b93f1aab82ceb081919f58e13a66142c5857a0a5e68e13661a3e74696434c1d1
212+
image: nvcr.io/nvidia/driver@sha256:f581bc75cd8aece5dd9fb93be17e1e91d0d533a01af5a1880cf962eca7890662
213213
- name: driver-image-550
214-
image: nvcr.io/nvidia/driver@sha256:6fe74322562c726c8fade184d8c45ebae3da7b1ea0a21f0ff9dc42c66c65e692
214+
image: nvcr.io/nvidia/driver@sha256:8b89435d54a2e6a33c480dd0659e9a4a73f872a6187f9f9eadd934ecb45ac273
215215
- name: driver-image-535
216-
image: nvcr.io/nvidia/driver@sha256:026f8f2d29b7058ecaaa5b98666e28e1b3646b73e7d7cda3dc8026d47a929152
216+
image: nvcr.io/nvidia/driver@sha256:5bc9bf943a6240853f3effecbc7ec9ebdfe98fba40ad9a0dc2aab9bf519c9a10
217217
- name: device-plugin-image
218-
image: nvcr.io/nvidia/k8s-device-plugin@sha256:af31e2b7c7f89834c4e5219860def7ac2e49a207b3d4e8610d5a26772b7738e5
218+
image: nvcr.io/nvidia/k8s-device-plugin@sha256:037160a36de0f060fc21cc0cb2f795d980282ff1471b55530433ca4350b24c4f
219219
- name: gpu-feature-discovery-image
220-
image: nvcr.io/nvidia/k8s-device-plugin@sha256:af31e2b7c7f89834c4e5219860def7ac2e49a207b3d4e8610d5a26772b7738e5
220+
image: nvcr.io/nvidia/k8s-device-plugin@sha256:037160a36de0f060fc21cc0cb2f795d980282ff1471b55530433ca4350b24c4f
221221
- name: mig-manager-image
222222
image: nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:d959c62e5098320744acd1b9d4869fc84074adc8e49b4b5defa2d6c4be57a6dc
223223
- name: init-container-image
@@ -233,7 +233,7 @@ spec:
233233
- name: vgpu-device-manager-image
234234
image: nvcr.io/nvidia/cloud-native/vgpu-device-manager@sha256:f899f870a1ef59611a8c3b3e9f58fa3c4d89d930c776d161737d017a3bb2feb0
235235
- name: gdrcopy-image
236-
image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:5024bbdb11698d8d867bced6078b82ef28e78144ed8f9b91bb2f8dedbc09a8d8
236+
image: nvcr.io/nvidia/cloud-native/gdrdrv@sha256:8d4ed9f70bab1553b62d1d92fd3410f9cdf6205d8e8d88fd96228ff5a34a7052
237237
customresourcedefinitions:
238238
owned:
239239
- name: nvidiadrivers.nvidia.com
@@ -897,21 +897,21 @@ spec:
897897
- name: "VALIDATOR_IMAGE"
898898
value: "ghcr.io/nvidia/gpu-operator/gpu-operator-validator:main-latest"
899899
- name: "GFD_IMAGE"
900-
value: "nvcr.io/nvidia/k8s-device-plugin@sha256:af31e2b7c7f89834c4e5219860def7ac2e49a207b3d4e8610d5a26772b7738e5"
900+
value: "nvcr.io/nvidia/k8s-device-plugin@sha256:037160a36de0f060fc21cc0cb2f795d980282ff1471b55530433ca4350b24c4f"
901901
- name: "CONTAINER_TOOLKIT_IMAGE"
902-
value: "nvcr.io/nvidia/k8s/container-toolkit@sha256:0c97336dbab95ef1f412b9cd6a4063809b8b578fad6ee795c0b19143d1c7463f"
902+
value: "nvcr.io/nvidia/k8s/container-toolkit@sha256:9f82c554a34dc612c5b02a3583c02eed4c0fd04bdfe5015cf8d457a80a3d7a4b"
903903
- name: "DCGM_IMAGE"
904-
value: "nvcr.io/nvidia/cloud-native/dcgm@sha256:7d4afc8530a983a1f036dce6654603b5b6883f2214aa102118fb4491880e21d7"
904+
value: "nvcr.io/nvidia/cloud-native/dcgm@sha256:ec473ac9f8e4f638e97ec5ffd6f6d3dbbfc3a53bdd07514745c8868676979bba"
905905
- name: "DCGM_EXPORTER_IMAGE"
906-
value: "nvcr.io/nvidia/k8s/dcgm-exporter@sha256:624b1014b03abbd04a20664c5ced46685185110b41d5dfdcf63ede71c62d4225"
906+
value: "nvcr.io/nvidia/k8s/dcgm-exporter@sha256:b848747435dfecb216484d16363a9897f64232b3c3ae7f171dde06525d8606b4"
907907
- name: "DEVICE_PLUGIN_IMAGE"
908-
value: "nvcr.io/nvidia/k8s-device-plugin@sha256:af31e2b7c7f89834c4e5219860def7ac2e49a207b3d4e8610d5a26772b7738e5"
908+
value: "nvcr.io/nvidia/k8s-device-plugin@sha256:037160a36de0f060fc21cc0cb2f795d980282ff1471b55530433ca4350b24c4f"
909909
- name: "DRIVER_IMAGE"
910-
value: "nvcr.io/nvidia/driver@sha256:b93f1aab82ceb081919f58e13a66142c5857a0a5e68e13661a3e74696434c1d1"
910+
value: "nvcr.io/nvidia/driver@sha256:f581bc75cd8aece5dd9fb93be17e1e91d0d533a01af5a1880cf962eca7890662"
911911
- name: "DRIVER_IMAGE-550"
912-
value: "nvcr.io/nvidia/driver@sha256:6fe74322562c726c8fade184d8c45ebae3da7b1ea0a21f0ff9dc42c66c65e692"
912+
value: "nvcr.io/nvidia/driver@sha256:8b89435d54a2e6a33c480dd0659e9a4a73f872a6187f9f9eadd934ecb45ac273"
913913
- name: "DRIVER_IMAGE-535"
914-
value: "nvcr.io/nvidia/driver@sha256:026f8f2d29b7058ecaaa5b98666e28e1b3646b73e7d7cda3dc8026d47a929152"
914+
value: "nvcr.io/nvidia/driver@sha256:5bc9bf943a6240853f3effecbc7ec9ebdfe98fba40ad9a0dc2aab9bf519c9a10"
915915
- name: "DRIVER_MANAGER_IMAGE"
916916
value: "nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:c525320fd1e771b911b68f8e760b83e8fccf1beea43bf9b009c4f0c591e193ea"
917917
- name: "MIG_MANAGER_IMAGE"
@@ -925,7 +925,7 @@ spec:
925925
- name: "VGPU_DEVICE_MANAGER_IMAGE"
926926
value: "nvcr.io/nvidia/cloud-native/vgpu-device-manager@sha256:f899f870a1ef59611a8c3b3e9f58fa3c4d89d930c776d161737d017a3bb2feb0"
927927
- name: "GDRCOPY_IMAGE"
928-
value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:5024bbdb11698d8d867bced6078b82ef28e78144ed8f9b91bb2f8dedbc09a8d8"
928+
value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:8d4ed9f70bab1553b62d1d92fd3410f9cdf6205d8e8d88fd96228ff5a34a7052"
929929
terminationGracePeriodSeconds: 10
930930
volumes:
931931
- hostPath:

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,19 @@ spec:
381381
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
382382
type: object
383383
type: object
384+
service:
385+
description: 'Optional: Service configuration for NVIDIA DCGM
386+
Exporter'
387+
properties:
388+
internalTrafficPolicy:
389+
description: InternalTrafficPolicy describes how nodes distribute
390+
service traffic they receive on the ClusterIP.
391+
type: string
392+
type:
393+
description: Type represents the ServiceType which describes
394+
ingress methods for a service
395+
type: string
396+
type: object
384397
serviceMonitor:
385398
description: 'Optional: ServiceMonitor configuration for NVIDIA
386399
DCGM Exporter'

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,19 @@ spec:
381381
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
382382
type: object
383383
type: object
384+
service:
385+
description: 'Optional: Service configuration for NVIDIA DCGM
386+
Exporter'
387+
properties:
388+
internalTrafficPolicy:
389+
description: InternalTrafficPolicy describes how nodes distribute
390+
service traffic they receive on the ClusterIP.
391+
type: string
392+
type:
393+
description: Type represents the ServiceType which describes
394+
ingress methods for a service
395+
type: string
396+
type: object
384397
serviceMonitor:
385398
description: 'Optional: ServiceMonitor configuration for NVIDIA
386399
DCGM Exporter'

config/samples/nvidia_v1alpha1_nvidiadriver.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ spec:
88
driverType: gpu
99
repository: nvcr.io/nvidia
1010
image: driver
11-
version: "570.124.06"
11+
version: "570.133.20"
1212
imagePullPolicy: IfNotPresent
1313
imagePullSecrets: []
1414
nodeSelector: {}

controllers/object_controls.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,27 @@ func kernelFullVersion(n ClusterPolicyController) (string, string, string) {
703703
return kFVersion, osTag, osVersion
704704
}
705705

706+
func preprocessService(obj *corev1.Service, n ClusterPolicyController) error {
707+
logger := n.logger.WithValues("Service", obj.Name)
708+
transformations := map[string]func(*corev1.Service, *gpuv1.ClusterPolicySpec) error{
709+
"nvidia-dcgm-exporter": TransformDCGMExporterService,
710+
}
711+
712+
t, ok := transformations[obj.Name]
713+
if !ok {
714+
logger.V(2).Info(fmt.Sprintf("No transformation for Service '%s'", obj.Name))
715+
return nil
716+
}
717+
718+
err := t(obj, &n.singleton.Spec)
719+
if err != nil {
720+
logger.Error(err, "Failed to apply transformation", "Service", obj.Name)
721+
return err
722+
}
723+
724+
return nil
725+
}
726+
706727
func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error {
707728
logger := n.logger.WithValues("Daemonset", obj.Name)
708729
transformations := map[string]func(*appsv1.DaemonSet, *gpuv1.ClusterPolicySpec, ClusterPolicyController) error{
@@ -966,6 +987,20 @@ func parseOSRelease() (map[string]string, error) {
966987
return release, nil
967988
}
968989

990+
func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPolicySpec) error {
991+
serviceConfig := config.DCGMExporter.ServiceSpec
992+
if serviceConfig != nil {
993+
if len(serviceConfig.Type) > 0 {
994+
obj.Spec.Type = serviceConfig.Type
995+
}
996+
997+
if serviceConfig.InternalTrafficPolicy != nil {
998+
obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy
999+
}
1000+
}
1001+
return nil
1002+
}
1003+
9691004
// TransformDriver transforms Nvidia driver daemonset with required config as per ClusterPolicy
9701005
func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
9711006
// update validation container
@@ -4477,8 +4512,14 @@ func Service(n ClusterPolicyController) (gpuv1.State, error) {
44774512
return gpuv1.NotReady, err
44784513
}
44794514

4515+
err := preprocessService(obj, n)
4516+
if err != nil {
4517+
logger.Info("Couldn't preprocess Service", "Error", err)
4518+
return gpuv1.NotReady, err
4519+
}
4520+
44804521
found := &corev1.Service{}
4481-
err := n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
4522+
err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
44824523
if err != nil && apierrors.IsNotFound(err) {
44834524
logger.Info("Not found, creating...")
44844525
err = n.client.Create(ctx, obj)
deployments/gpu-operator/Chart.lock

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
dependencies:
22
- name: node-feature-discovery
33
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
4-
version: 0.17.2
5-
digest: sha256:4c55d30d958027ef8997a2976449326de3c90049025c3ebb9bee017cad32cc3f
6-
generated: "2025-02-25T09:08:49.128088-08:00"
4+
version: 0.17.3
5+
digest: sha256:744f561438647c0094558f855a03b853f1d2eba8ae42e7faa843264de23c5b87
6+
generated: "2025-04-24T16:35:17.28334-07:00"

deployments/gpu-operator/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,6 @@ keywords:
1919

2020
dependencies:
2121
- name: node-feature-discovery
22-
version: v0.17.2
22+
version: v0.17.3
2323
repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts
2424
condition: nfd.enabled

0 commit comments

Comments
 (0)