Skip to content

Commit c8acd8b

Browse files
committed
[dcgm-exporter] add support for setting dcgmexporter service type and internalTrafficPolicy
Signed-off-by: Tariq Ibrahim <[email protected]> (cherry picked from commit b4be47a)
1 parent 0a05d53 commit c8acd8b

File tree

8 files changed

+131
-1
lines changed

8 files changed

+131
-1
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -910,6 +910,11 @@ type DCGMExporterSpec struct {
910910
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
911911
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
912912
ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
913+
914+
// Optional: Service configuration for NVIDIA DCGM Exporter
915+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
916+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Service configuration for NVIDIA DCGM Exporter"
917+
ServiceSpec *DCGMExporterServiceConfig `json:"service,omitempty"`
913918
}
914919

915920
// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -922,6 +927,21 @@ type DCGMExporterMetricsConfig struct {
922927
Name string `json:"name,omitempty"`
923928
}
924929

930+
// DCGMExporterServiceConfig defines the configuration options for the Kubernetes Service deployed for DCGM Exporter
931+
type DCGMExporterServiceConfig struct {
932+
// Type represents the ServiceType which describes ingress methods for a service
933+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
934+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceType for the DCGM Exporter K8s Service"
935+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
936+
Type corev1.ServiceType `json:"type,omitempty"`
937+
938+
// InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP.
939+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
940+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Internal Traffic Policy for the DCGM Exporter K8s Service"
941+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
942+
InternalTrafficPolicy *corev1.ServiceInternalTrafficPolicy `json:"internalTrafficPolicy,omitempty"`
943+
}
944+
925945
// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
926946
// deployed for DCGM Exporter
927947
type DCGMExporterServiceMonitorConfig struct {

api/nvidia/v1/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,19 @@ spec:
381381
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
382382
type: object
383383
type: object
384+
service:
385+
description: 'Optional: Service configuration for NVIDIA DCGM
386+
Exporter'
387+
properties:
388+
internalTrafficPolicy:
389+
description: InternalTrafficPolicy describes how nodes distribute
390+
service traffic they receive on the ClusterIP.
391+
type: string
392+
type:
393+
description: Type represents the ServiceType which describes
394+
ingress methods for a service
395+
type: string
396+
type: object
384397
serviceMonitor:
385398
description: 'Optional: ServiceMonitor configuration for NVIDIA
386399
DCGM Exporter'

config/crd/bases/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,19 @@ spec:
381381
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
382382
type: object
383383
type: object
384+
service:
385+
description: 'Optional: Service configuration for NVIDIA DCGM
386+
Exporter'
387+
properties:
388+
internalTrafficPolicy:
389+
description: InternalTrafficPolicy describes how nodes distribute
390+
service traffic they receive on the ClusterIP.
391+
type: string
392+
type:
393+
description: Type represents the ServiceType which describes
394+
ingress methods for a service
395+
type: string
396+
type: object
384397
serviceMonitor:
385398
description: 'Optional: ServiceMonitor configuration for NVIDIA
386399
DCGM Exporter'

controllers/object_controls.go

Lines changed: 42 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -703,6 +703,27 @@ func kernelFullVersion(n ClusterPolicyController) (string, string, string) {
703703
return kFVersion, osTag, osVersion
704704
}
705705

706+
func preprocessService(obj *corev1.Service, n ClusterPolicyController) error {
707+
logger := n.logger.WithValues("Service", obj.Name)
708+
transformations := map[string]func(*corev1.Service, *gpuv1.ClusterPolicySpec) error{
709+
"nvidia-dcgm-exporter": TransformDCGMExporterService,
710+
}
711+
712+
t, ok := transformations[obj.Name]
713+
if !ok {
714+
logger.V(2).Info(fmt.Sprintf("No transformation for Service '%s'", obj.Name))
715+
return nil
716+
}
717+
718+
err := t(obj, &n.singleton.Spec)
719+
if err != nil {
720+
logger.Error(err, "Failed to apply transformation", "Service", obj.Name)
721+
return err
722+
}
723+
724+
return nil
725+
}
726+
706727
func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error {
707728
logger := n.logger.WithValues("Daemonset", obj.Name)
708729
transformations := map[string]func(*appsv1.DaemonSet, *gpuv1.ClusterPolicySpec, ClusterPolicyController) error{
@@ -966,6 +987,20 @@ func parseOSRelease() (map[string]string, error) {
966987
return release, nil
967988
}
968989

990+
func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPolicySpec) error {
991+
serviceConfig := config.DCGMExporter.ServiceSpec
992+
if serviceConfig != nil {
993+
if len(serviceConfig.Type) > 0 {
994+
obj.Spec.Type = serviceConfig.Type
995+
}
996+
997+
if serviceConfig.InternalTrafficPolicy != nil {
998+
obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy
999+
}
1000+
}
1001+
return nil
1002+
}
1003+
9691004
// TransformDriver transforms Nvidia driver daemonset with required config as per ClusterPolicy
9701005
func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
9711006
// update validation container
@@ -4477,8 +4512,14 @@ func Service(n ClusterPolicyController) (gpuv1.State, error) {
44774512
return gpuv1.NotReady, err
44784513
}
44794514

4515+
err := preprocessService(obj, n)
4516+
if err != nil {
4517+
logger.Info("Couldn't preprocess Service", "Error", err)
4518+
return gpuv1.NotReady, err
4519+
}
4520+
44804521
found := &corev1.Service{}
4481-
err := n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
4522+
err = n.client.Get(ctx, types.NamespacedName{Namespace: obj.Namespace, Name: obj.Name}, found)
44824523
if err != nil && apierrors.IsNotFound(err) {
44834524
logger.Info("Not found, creating...")
44844525
err = n.client.Create(ctx, obj)

deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -381,6 +381,19 @@ spec:
381381
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
382382
type: object
383383
type: object
384+
service:
385+
description: 'Optional: Service configuration for NVIDIA DCGM
386+
Exporter'
387+
properties:
388+
internalTrafficPolicy:
389+
description: InternalTrafficPolicy describes how nodes distribute
390+
service traffic they receive on the ClusterIP.
391+
type: string
392+
type:
393+
description: Type represents the ServiceType which describes
394+
ingress methods for a service
395+
type: string
396+
type: object
384397
serviceMonitor:
385398
description: 'Optional: ServiceMonitor configuration for NVIDIA
386399
DCGM Exporter'

deployments/gpu-operator/templates/clusterpolicy.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -515,6 +515,9 @@ spec:
515515
{{- if .Values.dcgmExporter.serviceMonitor }}
516516
serviceMonitor: {{ toYaml .Values.dcgmExporter.serviceMonitor | nindent 6 }}
517517
{{- end }}
518+
{{- if .Values.dcgmExporter.service }}
519+
service: {{ toYaml .Values.dcgmExporter.service | nindent 6 }}
520+
{{- end }}
518521
gfd:
519522
enabled: {{ .Values.gfd.enabled }}
520523
{{- if .Values.gfd.repository }}

deployments/gpu-operator/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -323,6 +323,8 @@ dcgmExporter:
323323
- name: DCGM_EXPORTER_COLLECTORS
324324
value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
325325
resources: {}
326+
service:
327+
internalTrafficPolicy: Cluster
326328
serviceMonitor:
327329
enabled: false
328330
interval: 15s

0 commit comments

Comments
 (0)