deployments: Add CDI spec examples

ArangoGutierrez · ArangoGutierrez · commit 1e83a87aa8c3 · 2025-11-11T20:18:26.000+01:00
Add example CDI specifications for testing and reference.

Files:
- cdi-spec-a100-2gpu.yaml: Minimal 2-GPU configuration
  - Lightweight setup for development/testing
  - Includes environment variable examples
  - Demonstrates basic CDI structure

- cdi-spec-a100-8gpu.yaml: DGX A100 simulation
  - 8x NVIDIA A100-SXM4-40GB GPUs
  - Matches default mode topology
  - Shows complete device enumeration
  - Includes usage instructions in comments

Both specs follow CDI v0.5.0 specification and include:
- Device nodes (/dev/nvidiaN, one per GPU; nvidiactl and nvidia-uvm* are not declared in these examples)
- GPU model annotations for architecture detection
- UUID and index metadata
- Proper device major/minor numbers

Usage:
  # From file:
  kubectl create configmap my-spec \
    --from-file=spec.yaml=cdi-spec-a100-2gpu.yaml
  helm install gpu-mock ./helm/gpu-mock \
    --set cdi.enabled=true --set cdi.configMapName=my-spec

  # Inline:
  helm install gpu-mock ./helm/gpu-mock \
    --set cdi.enabled=true \
    --set-file cdi.inlineSpec=cdi-spec-a100-8gpu.yaml

These examples serve as templates for creating custom GPU topologies.
diff --git a/deployments/devel/gpu-mock/examples/cdi-spec-a100-2gpu.yaml b/deployments/devel/gpu-mock/examples/cdi-spec-a100-2gpu.yaml
@@ -0,0 +1,47 @@
+# Example CDI Specification for 2x NVIDIA A100 GPUs
+# Simpler configuration for testing and development
+# Declares CDI specification version 0.6.0 (first version with device annotations)
+#
+# Usage:
+#   kubectl create configmap gpu-cdi-spec --from-file=spec.yaml=cdi-spec-a100-2gpu.yaml -n gpu-mock
+#   helm upgrade --install gpu-mock ../../helm/gpu-mock \
+#     --set cdi.enabled=true \
+#     --set cdi.configMapName=gpu-cdi-spec \
+#     --namespace gpu-mock
+
+cdiVersion: "0.6.0"  # per-device annotations below need at least CDI 0.6.0
+kind: nvidia.com/gpu
+
+devices:
+  # GPU 0: one character device node plus an example environment variable
+  - name: "gpu0"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000000"
+      nvidia.com/gpu.index: "0"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia0
+          type: c
+          major: 195
+          minor: 0
+          fileMode: 0666  # integer; YAML 1.1 loaders read the leading 0 as octal
+      env:
+        - NVIDIA_VISIBLE_DEVICES=0
+
+  # GPU 1: same layout as GPU 0, with minor number and index set to 1
+  - name: "gpu1"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000001"
+      nvidia.com/gpu.index: "1"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia1
+          type: c
+          major: 195
+          minor: 1
+          fileMode: 0666
+      env:
+        - NVIDIA_VISIBLE_DEVICES=1
+
diff --git a/deployments/devel/gpu-mock/examples/cdi-spec-a100-8gpu.yaml b/deployments/devel/gpu-mock/examples/cdi-spec-a100-8gpu.yaml
@@ -0,0 +1,127 @@
+# Example CDI Specification for 8x NVIDIA A100 GPUs
+# This mimics the NVIDIA DGX A100 configuration
+# Declares CDI specification version 0.6.0 (first version with device annotations)
+#
+# Usage:
+#   kubectl create configmap gpu-cdi-spec --from-file=spec.yaml=cdi-spec-a100-8gpu.yaml -n gpu-mock
+#   helm upgrade --install gpu-mock ../../helm/gpu-mock \
+#     --set cdi.enabled=true \
+#     --set cdi.configMapName=gpu-cdi-spec \
+#     --namespace gpu-mock
+
+cdiVersion: "0.6.0"  # per-device annotations below need at least CDI 0.6.0
+kind: nvidia.com/gpu
+
+devices:
+  # GPU 0: each device entry exposes exactly one /dev/nvidiaN character node
+  - name: "gpu0"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000000"
+      nvidia.com/gpu.index: "0"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia0
+          type: c
+          major: 195
+          minor: 0
+          fileMode: 0666  # integer; YAML 1.1 loaders read the leading 0 as octal
+
+  # GPU 1
+  - name: "gpu1"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000001"
+      nvidia.com/gpu.index: "1"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia1
+          type: c
+          major: 195
+          minor: 1
+          fileMode: 0666
+
+  # GPU 2
+  - name: "gpu2"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000002"
+      nvidia.com/gpu.index: "2"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia2
+          type: c
+          major: 195
+          minor: 2
+          fileMode: 0666
+
+  # GPU 3
+  - name: "gpu3"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000003"
+      nvidia.com/gpu.index: "3"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia3
+          type: c
+          major: 195
+          minor: 3
+          fileMode: 0666
+
+  # GPU 4
+  - name: "gpu4"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000004"
+      nvidia.com/gpu.index: "4"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia4
+          type: c
+          major: 195
+          minor: 4
+          fileMode: 0666
+
+  # GPU 5
+  - name: "gpu5"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000005"
+      nvidia.com/gpu.index: "5"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia5
+          type: c
+          major: 195
+          minor: 5
+          fileMode: 0666
+
+  # GPU 6
+  - name: "gpu6"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000006"
+      nvidia.com/gpu.index: "6"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia6
+          type: c
+          major: 195
+          minor: 6
+          fileMode: 0666
+
+  # GPU 7
+  - name: "gpu7"
+    annotations:
+      nvidia.com/gpu.model: "NVIDIA A100-SXM4-40GB"
+      nvidia.com/gpu.uuid: "GPU-00000000-0000-0000-0000-000000000007"
+      nvidia.com/gpu.index: "7"
+    containerEdits:
+      deviceNodes:
+        - path: /dev/nvidia7
+          type: c
+          major: 195
+          minor: 7
+          fileMode: 0666
+
diff --git a/pkg/gpu/mocknvml/NVML_SYMBOLS_ANALYSIS.md b/pkg/gpu/mocknvml/NVML_SYMBOLS_ANALYSIS.md