|
49 | 49 | serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }} |
50 | 50 | securityContext: |
51 | 51 | {{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }} |
| 52 | + initContainers: |
| 53 | + - name: validate-dependencies
| 54 | + image: {{ include "nvidia-dra-driver-gpu.fullimage" . }} |
| 55 | + command: ["bash", "-c"] |
| 56 | + args: |
| 57 | + - |- |
| 58 | + # Ensure that |
| 59 | + # |
| 60 | + # 1) GPU driver binaries are in the NVIDIA_DRIVER_ROOT directory. |
| 61 | + # 2) the GPU driver is set up (nvidia-smi reports health). |
| 62 | + # |
| 63 | + # Main intent is to help troubleshoot a setup that does not have the
| 64 | + # GPU driver set up properly before installing this DRA driver. In |
| 65 | + # that case, this container log is meant to yield an actionable error |
| 66 | + # message. Another goal is to retry periodically and
| 67 | + # actively wait for the GPU driver to be set up properly (this init |
| 68 | + # container is meant to be a long-running init container that only |
| 69 | + # exits upon success). That allows for auto-healing the DRA driver |
| 70 | + # installation right after the GPU driver setup has been fixed.
| 71 | + # |
| 72 | + # For checking GPU driver health: for now, rely on nvidia-smi's exit |
| 73 | + # code. Treat code 0 as meaning that the driver is properly set up. For
| 74 | + # example, code 9 means that the GPU driver is not loaded; see section |
| 75 | + # 'RETURN VALUE' in the man page. |
| 76 | + # |
| 77 | + echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}" |
| 78 | + echo "command -v nvidia-smi (path in container): $(command -v nvidia-smi)" |
| 79 | + while : |
| 80 | + do |
| 81 | + if ! command -v nvidia-smi &>/dev/null |
| 82 | + then |
| 83 | + printf '%b' \ |
| 84 | + "Could not find 'nvidia-smi'. Has the NVIDIA GPU driver been set up? " \ |
| 85 | + "On the host, 'nvidia-smi' is expected to be placed in directory " \ |
| 86 | + "NVIDIA_DRIVER_ROOT (current value: '${NVIDIA_DRIVER_ROOT}'). " \ |
| 87 | + "If that value is unexpected: " \ |
| 88 | + "review and adjust the 'nvidiaDriverRoot' Helm chart variable. " \ |
| 89 | + "If the value is expected: make sure that the GPU driver gets " \ |
| 90 | + "placed at that location in the host file system. " \ |
| 91 | + "If you chose not to use a host-provided GPU driver, " \ |
| 92 | + "that might imply having to (re)provision the NVIDIA GPU Operator.\n" |
| 93 | + else |
| 94 | + # This may be slow or hang in a bad setup.
| 95 | + echo "invoking nvidia-smi" |
| 96 | + nvidia-smi |
| 97 | + RCODE=$? |
| 98 | + if [ ${RCODE} -eq 0 ] |
| 99 | + then |
| 100 | + echo "nvidia-smi returned with code 0: success, exiting"
| 101 | + exit 0 |
| 102 | + else |
| 103 | + # Do we worry about false-negatives here (where nvidia-smi fails |
| 104 | + # e.g. because of one bad GPU but where we could still proceed)? |
| 105 | + printf '%b' \ |
| 106 | + "nvidia-smi returned an error (code ${RCODE}). " \ |
| 107 | + "Verify that the GPU driver is set up correctly.\n"
| 108 | + fi |
| 109 | + fi |
| 110 | + echo "retry in 30 s" |
| 111 | + sleep 30 |
| 112 | + done |
| 113 | + env: |
| 114 | + - name: NVIDIA_DRIVER_ROOT |
| 115 | + value: "{{ .Values.nvidiaDriverRoot }}" |
52 | 116 | containers: |
53 | 117 | {{- if .Values.resources.computeDomains.enabled }} |
54 | 118 | - name: compute-domains |
|
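For context, a minimal sketch of how the 'nvidiaDriverRoot' value read by this init container could be set at install time. The release name, chart reference, and driver path below are assumptions for illustration; only the 'nvidiaDriverRoot' key itself comes from this change:

  # Point the chart at a host driver root (path shown is an assumed example,
  # e.g. a GPU Operator-managed driver install).
  helm upgrade --install nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
    --set nvidiaDriverRoot=/run/nvidia/driver

With that value, the init container expects the host-provided 'nvidia-smi' under that directory (as mounted into the container) and keeps retrying every 30 s until nvidia-smi exits with code 0.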