|
16 | 16 |
|
17 | 17 | echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}" |
18 | 18 |
|
19 | | -# Note: the following path is in-container, after chroot to /driver-root (and |
20 | | -# /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem). |
21 | | -# This typically outputs /usr/bin/nvidia-smi. |
22 | | -echo "command -v nvidia-smi: $(command -v nvidia-smi)" |
23 | | - |
24 | 19 | while true |
25 | 20 | do |
26 | 21 | if ! command -v nvidia-smi &>/dev/null |
27 | 22 | then |
28 | 23 | printf '%b' \ |
29 | | - "Command not found: 'nvidia-smi'. " \ |
30 | | - "Has the NVIDIA GPU driver been set up on the host? " \ |
| 24 | + "Not in PATH: 'nvidia-smi'. " \ |
| 25 | + "Has the NVIDIA GPU driver been set up? " \ |
31 | 26 | "The GPU driver is expected to be installed under " \ |
32 | | - "NVIDIA_DRIVER_ROOT in the host filesystem. " \ |
33 | | - "NVIDIA_DRIVER_ROOT is currently set to: '${NVIDIA_DRIVER_ROOT}'. " \ |
34 | | - "If that value appears to be unexpected: " \ |
| 27 | + "NVIDIA_DRIVER_ROOT ('${NVIDIA_DRIVER_ROOT}') in the host filesystem. " \ |
| 28 | + "If NVIDIA_DRIVER_ROOT appears to be unexpected: " \ |
35 | 29 | "review and adjust the 'nvidiaDriverRoot' Helm chart variable. " \ |
36 | 30 | "If the value is expected: review if the GPU driver has " \ |
37 | 31 | "actually been installed under NVIDIA_DRIVER_ROOT. " \ |
38 | | - "If you chose the NVIDIA GPU Operator to manage the GPU driver: " \ |
| 32 | + "If you chose the NVIDIA GPU Operator to manage the GPU driver " \ |
| 33 | + "(NVIDIA_DRIVER_ROOT set to /run/nvidia/driver): "\ |
39 | 34 | "make sure that Operator is deployed and healthy.\n" |
| 35 | + |
| 36 | + # Provide hint for a specific, common mistake. |
| 37 | + if [ "$NVIDIA_DRIVER_ROOT" == "/" ] && [ -f /run/nvidia/driver/usr/bin/nvidia-smi ]; then |
| 38 | + printf '%b' \ |
| 39 | + "Note: /run/nvidia/driver/usr/bin/nvidia-smi exists on the host, you " \ |
| 40 | + "may want to re-install the DRA driver Helm chart with " \ |
| 41 | + "--set nvidiaDriverRoot=/run/nvidia/driver\n" |
| 42 | + fi |
40 | 43 | else |
| 44 | + # Note: the following path is in-container, after chroot to /driver-root (and |
| 45 | + # /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem). |
| 46 | + # This typically outputs /usr/bin/nvidia-smi. |
| 47 | + echo "command -v nvidia-smi: $(command -v nvidia-smi)" |
| 48 | + |
41 | 49 | # This may be slow or hang, in a bad setup. |
42 | 50 | echo "invoking nvidia-smi" |
43 | 51 | nvidia-smi |
|
0 commit comments