|
3 | 3 | # Main intent: help users to self-troubleshoot when the GPU driver is not set up |
4 | 4 | # properly before installing this DRA driver. In that case, the log of the init |
5 | 5 | # container running this script is meant to yield an actionable error message. |
6 | | -# |
7 | | -# Crash-loop (retry periodically) and actively wait for the GPU driver to come |
8 | | -# online (unblock the DRA driver plugin daemonset pods from progressing right |
9 | | -# after the GPU driver setup has been fixed). |
| 6 | +# For now, rely on k8s to implement a high-level retry with back-off. |
10 | 7 |
|
11 | 8 | if [ -z "$NVIDIA_DRIVER_ROOT" ]; then |
12 | 9 | # Not set, or set to empty string (not distinguishable). |
|
16 | 13 |
|
17 | 14 | echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}" |
18 | 15 |
|
19 | | -while true |
20 | | -do |
21 | 16 | if ! command -v nvidia-smi &>/dev/null |
22 | 17 | then |
23 | 18 | printf '%b' \ |
|
41 | 36 | "--set nvidiaDriverRoot=/run/nvidia/driver\n" |
42 | 37 | fi |
43 | 38 |
|
44 | | - # /driver-root (which we chrooted to) is empty (qrely on the fact |
| 39 | + # /driver-root (which we chrooted to) is empty (rely on the fact |
45 | 40 | # that this actually exists on the host: k8s hostPath type: Directory). |
46 | 41 | if [ -z "$( ls -A / )" ]; then |
47 | 42 | echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host appears to be empty" |
48 | 43 | fi |
49 | | -else |
50 | | - # Note: the following path is in-container, after chroot to /driver-root (and |
51 | | - # /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem). |
52 | | - # This typically outputs /usr/bin/nvidia-smi. |
53 | | - echo "command -v nvidia-smi: $(command -v nvidia-smi)" |
54 | 44 |
|
55 | | - # This may be slow or hang, in a bad setup. |
56 | | - echo "invoking nvidia-smi" |
57 | | - nvidia-smi |
58 | | - RCODE="$?" |
| 45 | + exit 1 |
| 46 | +fi |
59 | 47 |
|
60 | | - # For checking GPU driver health: for now, rely on nvidia-smi's exit code. |
61 | | - # Rely on code 0 meaning that the driver is properly set up. For example, |
62 | | - # code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE' |
63 | | - # in the man page. |
64 | | - if [ ${RCODE} -eq 0 ] |
65 | | - then |
66 | | - echo "nvidia-smi returned with code 0: success, leave" |
67 | | - exit 0 |
68 | | - else |
69 | | - # Do we worry about false-negatives here (where nvidia-smi fails |
70 | | - # e.g. because of one bad GPU but where we could still proceed)? |
71 | | - printf '%b' \ |
72 | | - "nvidia-smi returned an error (code ${RCODE}). " \ |
73 | | - "Verify that the GPU driver is set up correctly\n" |
74 | | - fi |
| 48 | + |
| 49 | +# Note: the following path is in-container, after chroot to /driver-root (and |
| 50 | +# /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem). |
| 51 | +# This typically outputs /usr/bin/nvidia-smi. |
| 52 | +echo "command -v nvidia-smi: $(command -v nvidia-smi)" |
| 53 | + |
| 54 | +# This may be slow or hang, in a bad setup. |
| 55 | +echo "invoking nvidia-smi" |
| 56 | +nvidia-smi |
| 57 | +RCODE="$?" |
| 58 | + |
| 59 | +# For checking GPU driver health: for now, rely on nvidia-smi's exit code. |
| 60 | +# Rely on code 0 meaning that the driver is properly set up. For example, |
| 61 | +# code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE' |
| 62 | +# in the man page. |
| 63 | +if [ ${RCODE} -eq 0 ] |
| 64 | +then |
| 65 | + echo "nvidia-smi returned with code 0: success, leave" |
| 66 | + exit 0 |
| 67 | +else |
| 68 | + # Do we worry about false-negatives here (where nvidia-smi fails |
| 69 | + # e.g. because of one bad GPU but where we could still proceed)? |
| 70 | + printf '%b' \ |
| 71 | + "nvidia-smi returned an error (code ${RCODE}). " \ |
| 72 | + "Verify that the GPU driver is set up correctly\n" |
75 | 73 | fi |
76 | | -echo "retry in 30 s" |
77 | | -sleep 30 |
78 | | -done |
0 commit comments