#!/usr/bin/env bash

# Main intent: help users self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the
# init container running this script is meant to yield an actionable error
# message. For now, rely on k8s to implement a high-level retry with back-off.

if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
  # Not set, or set to empty string (not distinguishable).
  # Normalize to "/" (treated as such elsewhere).
  export NVIDIA_DRIVER_ROOT="/"
fi
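
# Note: the checks below assume that the host directory NVIDIA_DRIVER_ROOT
# points to is mounted at /driver-root in this init container (see the log
# line emitted in validate_and_exit_on_success below).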

emit_common_err () {
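  # A single '%b' format with multiple arguments: printf reuses the format
  # for each remaining argument, so the strings below are concatenated and
  # backslash escapes (like the trailing \n) are interpreted.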
  printf '%b' \
    "Check failed. Has the NVIDIA GPU driver been set up? " \
    "It is expected to be installed under " \
    "NVIDIA_DRIVER_ROOT (currently set to '${NVIDIA_DRIVER_ROOT}') " \
    "in the host filesystem. If that path is unexpected: " \
    "review the DRA driver's 'nvidiaDriverRoot' Helm chart variable. " \
    "Otherwise, review whether the GPU driver has " \
    "actually been installed under that path.\n"
}

validate_and_exit_on_success () {
  echo -n "$(date -u +"%Y-%m-%dT%H:%M:%SZ") /driver-root (${NVIDIA_DRIVER_ROOT} on host): "

  # Search specific set of directories (not recursively: not required, and
  # /driver-root may be a big tree). Limit to first result (multiple results
  # are a bit of a pathological state, but continue with validation logic).
  # Suppress find stderr: some search directories are expected to be "not
  # found".

  NV_PATH=$( \
    find \
      /driver-root/bin \
      /driver-root/sbin \
      /driver-root/usr/bin \
      /driver-root/usr/sbin \
      -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
  )
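
  # Typical match (assumption, distro-dependent): /driver-root/usr/bin/nvidia-smi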

  # Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
  # maxdepth 1 also protects against any potential symlink loop (we're
  # suppressing find's stderr, so we'd never see messages like 'Too many
  # levels of symbolic links').
  NV_LIB_PATH=$( \
    find -L \
      /driver-root/usr/lib64 \
      /driver-root/usr/lib/x86_64-linux-gnu \
      /driver-root/usr/lib/aarch64-linux-gnu \
      /driver-root/lib64 \
      /driver-root/lib/x86_64-linux-gnu \
      /driver-root/lib/aarch64-linux-gnu \
      -maxdepth 1 -type f -name "libnvidia-ml.so.1" 2> /dev/null | head -n1
  )
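
  # Typical matches (assumption): /driver-root/usr/lib64/libnvidia-ml.so.1 on
  # RHEL-like distros, or /driver-root/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1
  # on Debian-like distros.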

  if [ -z "${NV_PATH}" ]; then
    echo -n "nvidia-smi: not found, "
  else
    echo -n "nvidia-smi: '${NV_PATH}', "
  fi

  if [ -z "${NV_LIB_PATH}" ]; then
    echo -n "libnvidia-ml.so.1: not found, "
  else
    echo -n "libnvidia-ml.so.1: '${NV_LIB_PATH}', "
  fi

  # Log top-level entries in /driver-root (this may be valuable debug info).
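  # GNU ls flags: -A includes hidden entries (except . and ..); -x with -w0
  # (no line width limit) lays all entries out on a single line.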
  echo "current contents: [$(/bin/ls -1xAw0 /driver-root 2>/dev/null)]."

  if [ -n "${NV_PATH}" ] && [ -n "${NV_LIB_PATH}" ]; then

    # Run with clean environment (only LD_PRELOAD; nvidia-smi has only this
    # dependency). Emit message before invocation (nvidia-smi may be slow or
    # hang).
    echo "invoke: env -i LD_PRELOAD=${NV_LIB_PATH} ${NV_PATH}"
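
    # Note: env -i drops the inherited environment, and LD_PRELOAD makes the
    # dynamic linker load the libnvidia-ml.so.1 found above, independent of
    # the container's ldconfig state.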

    # Always show stderr; stdout could potentially be hidden or filtered in
    # the future.
    env -i LD_PRELOAD="${NV_LIB_PATH}" "${NV_PATH}"
    RCODE="$?"

    # For checking GPU driver health, rely on nvidia-smi's exit code: code 0
    # signals that the driver is properly set up. See section 'RETURN VALUE'
    # in the nvidia-smi man page for the meaning of error codes.
    if [ ${RCODE} -eq 0 ]; then
      echo "nvidia-smi returned with code 0: success, leave"

      # Exit the script indicating success (leave the init container).
      exit 0
    else
      echo "exit code: ${RCODE}"
    fi
  fi

  # Reduce log volume: log hints only every Nth attempt.
  if [ $((_ATTEMPT % 6)) -ne 0 ]; then
    return
  fi
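
  # With _WAIT_S=10 (see bottom of script), hints appear roughly once per
  # minute.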

  # nvidia-smi binaries not found, or execution failed. First, provide a
  # generic error message. Then, try to provide actionable hints for common
  # problems.
  echo
  emit_common_err

  # For a host-provided driver not at "/", provide feedback for two special
  # cases.
  if [ "${NVIDIA_DRIVER_ROOT}" != "/" ]; then
    if [ -z "$( ls -A /driver-root )" ]; then
      echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host is empty"
    else
      # Not empty, but at least one of the binaries not found: this is a
      # rather pathological state.
      if [ -z "${NV_PATH}" ] || [ -z "${NV_LIB_PATH}" ]; then
        echo "Hint: Directory $NVIDIA_DRIVER_ROOT is not empty, but at least one of the binaries wasn't found."
      fi
    fi
  fi

  # Common mistake: driver container, but forgot `--set nvidiaDriverRoot`.
  if [ "${NVIDIA_DRIVER_ROOT}" == "/" ] && [ -f /driver-root/run/nvidia/driver/usr/bin/nvidia-smi ]; then
    printf '%b' \
      "Hint: '/run/nvidia/driver/usr/bin/nvidia-smi' exists on the host; you " \
      "may want to re-install the DRA driver Helm chart with " \
      "--set nvidiaDriverRoot=/run/nvidia/driver\n"
  fi

  if [ "${NVIDIA_DRIVER_ROOT}" == "/run/nvidia/driver" ]; then
    printf '%b' \
      "Hint: NVIDIA_DRIVER_ROOT is set to '/run/nvidia/driver', " \
      "which typically means that the NVIDIA GPU Operator " \
      "manages the GPU driver. Make sure that the GPU Operator " \
      "is deployed and healthy.\n"
  fi
  echo
}

# DaemonSet pods may get deleted (terminated with SIGTERM) and re-created when
# the GPU Operator driver container creates a mount at /run/nvidia. Make that
# explicit in the log.
log_sigterm() {
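  # GNU date: %N expands to nanoseconds; the %3N width truncates to three
  # digits, i.e. a millisecond-resolution timestamp.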
  echo "$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ"): received SIGTERM"
  exit 0
}
trap 'log_sigterm' SIGTERM


# Design goal: long-running init container that retries at constant frequency,
# and exits only upon success (with code 0).
_WAIT_S=10
_ATTEMPT=0

while true
do
  validate_and_exit_on_success
  sleep ${_WAIT_S}
  _ATTEMPT=$((_ATTEMPT+1))
done
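
# Wiring sketch (hypothetical, for illustration): this script is meant to run
# as a long-lived init container, with NVIDIA_DRIVER_ROOT passed in as an env
# var and the corresponding host directory mounted at /driver-root (e.g. via a
# hostPath volume in the pod spec).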