|
#!/usr/bin/env bash

# Main intent: help users to self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
#
# Crash-loop (retry periodically) and actively wait for the GPU driver to come
# online (unblock the DRA driver plugin daemonset pods from progressing right
# after the GPU driver setup has been fixed).

# Deliberately no `set -e`: a non-zero exit from nvidia-smi must not terminate
# this script -- we inspect its exit code below and retry. `nounset` and
# `pipefail` are safe and catch genuine scripting mistakes.
set -o nounset -o pipefail

if [ -z "${NVIDIA_DRIVER_ROOT:-}" ]; then
  # Not set, or set to empty string (not distinguishable).
  # Normalize to "/" (treated as such elsewhere).
  export NVIDIA_DRIVER_ROOT="/"
fi

echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}"

while true; do
  if ! command -v nvidia-smi &>/dev/null; then
    # Actionable guidance: point at the most likely misconfiguration first.
    printf '%b' \
      "Not in PATH: 'nvidia-smi'. " \
      "Has the NVIDIA GPU driver been set up? " \
      "The GPU driver is expected to be installed under " \
      "NVIDIA_DRIVER_ROOT ('${NVIDIA_DRIVER_ROOT}') in the host filesystem. " \
      "If NVIDIA_DRIVER_ROOT appears to be unexpected: " \
      "review and adjust the 'nvidiaDriverRoot' Helm chart variable. " \
      "If the value is expected: review if the GPU driver has " \
      "actually been installed under NVIDIA_DRIVER_ROOT. " \
      "If you chose the NVIDIA GPU Operator to manage the GPU driver " \
      "(NVIDIA_DRIVER_ROOT set to /run/nvidia/driver): " \
      "make sure that Operator is deployed and healthy.\n"

    # Provide hint for a specific, common mistake: driver managed by the GPU
    # Operator (lives under /run/nvidia/driver) while the chart was installed
    # with the default root "/".
    if [[ "${NVIDIA_DRIVER_ROOT}" == "/" && -f /run/nvidia/driver/usr/bin/nvidia-smi ]]; then
      printf '%b' \
        "Note: /run/nvidia/driver/usr/bin/nvidia-smi exists on the host, you " \
        "may want to re-install the DRA driver Helm chart with " \
        "--set nvidiaDriverRoot=/run/nvidia/driver\n"
    fi
  else
    # Note: the following path is in-container, after chroot to /driver-root (and
    # /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
    # This typically outputs /usr/bin/nvidia-smi.
    echo "command -v nvidia-smi: $(command -v nvidia-smi)"

    # This may be slow or hang, in a bad setup.
    echo "invoking nvidia-smi"
    nvidia-smi
    RCODE="$?"

    # For checking GPU driver health: for now, rely on nvidia-smi's exit code.
    # Rely on code 0 meaning that the driver is properly set up. For example,
    # code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
    # in the man page.
    if [ "${RCODE}" -eq 0 ]; then
      echo "nvidia-smi returned with code 0: success, leave"
      exit 0
    else
      # Do we worry about false-negatives here (where nvidia-smi fails
      # e.g. because of one bad GPU but where we could still proceed)?
      printf '%b' \
        "nvidia-smi returned an error (code ${RCODE}). " \
        "Verify that the GPU driver is set up correctly\n"
    fi
  fi
  echo "retry in 30 s"
  sleep 30
done
0 commit comments