deployments: Add Helm chart support for CDI mode

ArangoGutierrez · ArangoGutierrez · commit 8db72b7c6816 · 2025-11-11T20:17:21.000+01:00
Enhance Helm chart to support both default and CDI operating modes.

Changes:
- values.yaml: Add CDI configuration section
  - cdi.enabled: Enable CDI spec mode (default: false)
  - cdi.configMapName: Reference external CDI spec ConfigMap
  - cdi.inlineSpec: Embed CDI spec in Helm values
  - cdi.architectureOverride: Manual architecture selection
  - mockDriver.architecture: Default mode architecture (dgxa100)

- mock-driver-daemonset.yaml: Wire CDI configuration
  - Mount CDI spec ConfigMap when enabled
  - Set environment variables (MOCK_GPU_ARCH, MOCK_NVML_NUM_DEVICES)
  - Use entrypoint.sh instead of direct gpu-mockctl call
  - Pass CDI_SPEC_PATH when CDI mode enabled

- cdi-configmap.yaml: New template for inline CDI specs
  - Creates ConfigMap from Helm values when cdi.enabled is true and cdi.inlineSpec is set

Deployment scenarios:
1. Default mode (zero-config):
   helm install gpu-mock ./helm/gpu-mock
   → 8 A100 GPUs, no configuration needed

2. CDI from ConfigMap:
   kubectl create configmap my-spec --from-file=spec.yaml=my-cdi.yaml
   helm install gpu-mock ./helm/gpu-mock --set cdi.enabled=true \
     --set cdi.configMapName=my-spec
   → Custom GPU topology from CDI spec

3. CDI inline:
   helm install gpu-mock ./helm/gpu-mock --set cdi.enabled=true \
     --set-file cdi.inlineSpec=my-cdi.yaml
   → CDI spec embedded in Helm release

This provides flexibility while maintaining backward compatibility
with existing zero-config deployments.
diff --git a/deployments/devel/gpu-mock/helm/gpu-mock/templates/cdi-configmap.yaml b/deployments/devel/gpu-mock/helm/gpu-mock/templates/cdi-configmap.yaml
@@ -0,0 +1,13 @@
+{{- if and .Values.cdi.enabled .Values.cdi.inlineSpec -}}
+apiVersion: v1
+kind: ConfigMap  # rendered only when cdi.enabled and cdi.inlineSpec are both set
+metadata:
+  name: {{ include "gpu-mock.fullname" . }}-cdi-spec
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gpu-mock.labels" . | nindent 4 }}
+data:  # the inline CDI spec is stored under the single key "spec.yaml"
+  spec.yaml: |
+{{ .Values.cdi.inlineSpec | indent 4 }}
+{{- end }}
+
diff --git a/deployments/devel/gpu-mock/helm/gpu-mock/templates/mock-driver-daemonset.yaml b/deployments/devel/gpu-mock/helm/gpu-mock/templates/mock-driver-daemonset.yaml
@@ -41,6 +41,12 @@ spec:
         - name: host-dev
           hostPath:
             path: /dev
+        {{- if .Values.cdi.enabled }}
+        - name: cdi-spec  # CDI spec source: cdi.configMapName if set, else "<fullname>-cdi-spec"
+          configMap:
+            name: {{ .Values.cdi.configMapName | default (printf "%s-cdi-spec" (include "gpu-mock.fullname" .)) | quote }}
+            optional: true  # the pod still starts when the ConfigMap does not exist
+        {{- end }}
             
       initContainers:
         # Create mock driver filesystem and device nodes
@@ -55,15 +61,37 @@ spec:
             - name: DEBUG
               value: "true"
             {{- end }}
+            # Architecture to mock: cdi.architectureOverride when set, else mockDriver.architecture
+            - name: MOCK_GPU_ARCH
+              value: {{ .Values.cdi.architectureOverride | default .Values.mockDriver.architecture | quote }}
+            # Number of GPUs (for default mode)
+            - name: MOCK_NVML_NUM_DEVICES
+              value: {{ .Values.mockDriver.gpuCount | quote }}
+            # Driver root and device paths (same paths as the host-driver / host-dev mounts)
+            - name: DRIVER_ROOT
+              value: "/host/var/lib/nvidia-mock/driver"
+            - name: HOST_DEV
+              value: "/host/dev"
+            {{- if .Values.cdi.enabled }}
+            # CDI spec file: the ConfigMap key "spec.yaml" as it appears under the /config mount
+            - name: CDI_SPEC_PATH
+              value: "/config/spec.yaml"
+            {{- end }}
           securityContext:
             {{- toYaml .Values.global.securityContext | nindent 12 }}
           volumeMounts:
             - name: host-driver
               mountPath: /host/var/lib/nvidia-mock/driver
             - name: host-dev
               mountPath: /host/dev
-          command: ["/usr/local/bin/gpu-mockctl"]
+            {{- if .Values.cdi.enabled }}
+            - name: cdi-spec
+              mountPath: /config  # each key of the CDI spec ConfigMap shows up as a file in this directory
+              readOnly: true
+            {{- end }}
+          command: ["/usr/local/bin/entrypoint.sh"]
           args:
+            - "/usr/local/bin/gpu-mockctl"
             - "driver"
             - "--driver-root"
             - "/host/var/lib/nvidia-mock/driver"
diff --git a/deployments/devel/gpu-mock/helm/gpu-mock/values.yaml b/deployments/devel/gpu-mock/helm/gpu-mock/values.yaml
@@ -15,12 +15,16 @@ mockDriver:
     tag: dev
     pullPolicy: IfNotPresent
   
-  # Number of mock GPUs to create
+  # Number of mock GPUs to create (used in default mode)
   gpuCount: 8
   
-  # Mock GPU model
+  # Mock GPU model (used in default mode)
   gpuModel: "NVIDIA A100-SXM4-40GB"
   
+  # Mock GPU architecture (used in default mode)
+  # Options: dgxa100 (default), h100, h200, b200 (when available)
+  architecture: dgxa100
+  
   # Resources for the mock driver pod
   resources:
     requests:
@@ -97,6 +101,23 @@ nodeLabeling:
   # Apply feature.node.kubernetes.io/pci-10de.present=true label
   pciPresent: true
 
+# CDI (Container Device Interface) configuration
+cdi:
+  # Enable CDI spec mode (when false, the default mode configured under mockDriver is used)
+  enabled: false
+  
+  # Option 1: Name of an existing ConfigMap holding the CDI spec (used instead of Option 2 when set)
+  # The ConfigMap must have a key "spec.yaml" containing the CDI spec
+  configMapName: ""
+  
+  # Option 2: Inline CDI spec; the chart renders it into a "<fullname>-cdi-spec" ConfigMap
+  # Leave empty to use Option 1 or the default mode
+  inlineSpec: ""
+  
+  # Architecture passed as MOCK_GPU_ARCH in place of mockDriver.architecture
+  # Leave empty to auto-detect from CDI spec annotations (TODO confirm: mockDriver.architecture is still sent)
+  architectureOverride: ""
+
 # Development/debugging options
 debug:
   # Enable debug logging