Commit bccdaf5

Add init container to plugin pod (GPU driver dependency check)
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent f537b96 commit bccdaf5

File tree: 1 file changed (+64, -0)

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 64 additions & 0 deletions
@@ -49,6 +49,70 @@ spec:
       serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
       securityContext:
         {{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
+      initContainers:
+      - name: validate-dependencies
+        image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
+        command: ["bash", "-c"]
+        args:
+        - |-
+          # Ensure that
+          #
+          # 1) GPU driver binaries are in the NVIDIA_DRIVER_ROOT directory.
+          # 2) the GPU driver is set up (nvidia-smi reports health).
+          #
+          # Main intent is to help troubleshoot a setup that does not have the
+          # GPU driver set up properly before installing this DRA driver. In
+          # that case, this container log is meant to yield an actionable error
+          # message. Another goal is to crash-loop (retry periodically) and
+          # actively wait for the GPU driver to be set up properly (this init
+          # container is meant to be a long-running init container that only
+          # exits upon success). That allows for auto-healing the DRA driver
+          # installation right after the GPU driver setup has been fixed.
+          #
+          # For checking GPU driver health: for now, rely on nvidia-smi's exit
+          # code. Rely on code 0 meaning that the driver is properly set up. For
+          # example, code 9 means that the GPU driver is not loaded; see section
+          # 'RETURN VALUE' in the man page.
+          #
+          echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}"
+          echo "command -v nvidia-smi (path in container): $(command -v nvidia-smi)"
+          while :
+          do
+            if ! command -v nvidia-smi &>/dev/null
+            then
+              printf '%b' \
+                "Could not find 'nvidia-smi'. Has the NVIDIA GPU driver been set up? " \
+                "On the host, 'nvidia-smi' is expected to be placed in directory " \
+                "NVIDIA_DRIVER_ROOT (current value: '${NVIDIA_DRIVER_ROOT}'). " \
+                "If that value is unexpected: " \
+                "review and adjust the 'nvidiaDriverRoot' Helm chart variable. " \
+                "If the value is expected: make sure that the GPU driver gets " \
+                "placed at that location in the host file system. " \
+                "If you chose not to use a host-provided GPU driver, " \
+                "that might imply having to (re)provision the NVIDIA GPU Operator.\n"
+            else
+              # This may be slow or hang, in a bad setup.
+              echo "invoking nvidia-smi"
+              nvidia-smi
+              RCODE=$?
+              if [ ${RCODE} -eq 0 ]
+              then
+                echo "nvidia-smi returned with code 0: success, leave"
+                exit 0
+              else
+                # Do we worry about false-negatives here (where nvidia-smi fails
+                # e.g. because of one bad GPU but where we could still proceed)?
+                printf '%b' \
+                  "nvidia-smi returned an error (code ${RCODE}). " \
+                  "Verify that the GPU driver is set up correctly.\n"
+              fi
+            fi
+            echo "retry in 30 s"
+            sleep 30
+          done
+        env:
+        - name: NVIDIA_DRIVER_ROOT
+          value: "{{ .Values.nvidiaDriverRoot }}"
       containers:
       {{- if .Values.resources.computeDomains.enabled }}
       - name: compute-domains
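
For illustration, here is a minimal sketch of how to interact with this check after deploying the chart. The namespace, pod name, release name, and driver-root value are placeholders, not values defined by this commit; the container name, the chart path, and the 'nvidiaDriverRoot' variable come from the diff above.

    # Find the kubelet-plugin pod and follow the dependency-check log while it
    # waits for the GPU driver (names in angle brackets are placeholders):
    kubectl get pods -n <namespace>
    kubectl logs -n <namespace> <kubelet-plugin-pod> -c validate-dependencies --follow

    # If the log shows an unexpected NVIDIA_DRIVER_ROOT, set the corresponding
    # Helm chart variable on install/upgrade (the value shown is only an example):
    helm upgrade --install <release-name> ./deployments/helm/nvidia-dra-driver-gpu \
      --set nvidiaDriverRoot=/run/nvidia/driver

Because the init container only exits on success, the plugin containers are held back until nvidia-smi reports a healthy driver; once the GPU driver setup is fixed, the pod proceeds without manual intervention.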
