#!/usr/bin/env bash

# Main intent: help users to self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
# For now, rely on k8s to implement a high-level retry with back-off.

if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
  # Not set, or set to empty string (not distinguishable).
  # Normalize to "/" (treated as such elsewhere).
  export NVIDIA_DRIVER_ROOT="/"
fi

# Remove trailing slash (if existing) and get last path element.
# Note: for NVIDIA_DRIVER_ROOT="/" the stripped value is the empty string and
# basename of "" is empty, so this yields "/driver-root-parent/" — i.e., the
# parent mount itself (presumably the host root is mounted there; confirm
# against the pod spec).
_driver_root_path="/driver-root-parent/$(basename "${NVIDIA_DRIVER_ROOT%/}")"

# Create in-container path /driver-root as a symlink. Expectation: link may be
# broken initially (e.g., if the GPU operator isn't deployed yet). The link
# heals once the driver becomes mounted (e.g., once GPU operator provides the
# driver on the host at /run/nvidia/driver).
# Use -sfn so that a pre-existing link (script/container restart) is replaced
# instead of making ln fail.
echo "create symlink: /driver-root -> ${_driver_root_path}"
ln -sfn "${_driver_root_path}" /driver-root
| 23 | + |
# Emit the generic "check failed" guidance message to stdout.
# Reads the global NVIDIA_DRIVER_ROOT (interpolated into the message).
emit_common_err () {
  local _msg
  _msg="Check failed. Has the NVIDIA GPU driver been set up? "
  _msg+="It is expected to be installed under "
  _msg+="NVIDIA_DRIVER_ROOT (currently set to '${NVIDIA_DRIVER_ROOT}') "
  _msg+="in the host filesystem. If that path appears to be unexpected: "
  _msg+="review the DRA driver's 'nvidiaDriverRoot' Helm chart variable. "
  _msg+="Otherwise, review if the GPU driver has "
  _msg+="actually been installed under that path."
  printf '%s\n' "${_msg}"
}
| 34 | + |
# Probe /driver-root for the GPU driver's user-space pieces and run nvidia-smi
# as a health check.
# Globals read: NVIDIA_DRIVER_ROOT (for log/hint text), _ATTEMPT (hint
#   throttling: hints are emitted only on every 6th attempt).
# Exits the whole script with code 0 when nvidia-smi succeeds (init container
# done); otherwise returns so the caller can retry.
validate_and_exit_on_success () {
  echo -n "$(date -u +"%Y-%m-%dT%H:%M:%SZ") /driver-root (${NVIDIA_DRIVER_ROOT} on host): "

  # Search specific set of directories (not recursively: not required, and
  # /driver-root may be a big tree). Limit to first result (multiple results
  # are a bit of a pathological state, but continue with validation logic).
  # Suppress find stderr: some search directories are expected to be "not
  # found".
  # Note: the fourth entry was previously a duplicate of /driver-root/sbin;
  # it is corrected to /driver-root/usr/sbin (mirroring the bin/usr/bin pair).

  NV_PATH=$( \
    find \
      /driver-root/bin \
      /driver-root/sbin \
      /driver-root/usr/bin \
      /driver-root/usr/sbin \
      -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
  )

  # Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
  # maxdepth 1 also protects against any potential symlink loop (we're
  # suppressing find's stderr, so we'd never see messages like 'Too many
  # levels of symbolic links').
  NV_LIB_PATH=$( \
    find -L \
      /driver-root/usr/lib64 \
      /driver-root/usr/lib/x86_64-linux-gnu \
      /driver-root/usr/lib/aarch64-linux-gnu \
      /driver-root/lib64 \
      /driver-root/lib/x86_64-linux-gnu \
      /driver-root/lib/aarch64-linux-gnu \
      -maxdepth 1 -type f -name "libnvidia-ml.so.1" 2> /dev/null | head -n1
  )

  if [ -z "${NV_PATH}" ]; then
    echo -n "nvidia-smi: not found, "
  else
    echo -n "nvidia-smi: '${NV_PATH}', "
  fi

  if [ -z "${NV_LIB_PATH}" ]; then
    echo -n "libnvidia-ml.so.1: not found, "
  else
    echo -n "libnvidia-ml.so.1: '${NV_LIB_PATH}', "
  fi

  # Log top-level entries in /driver-root (this may be valuable debug info).
  echo "current contents: [$(/bin/ls -1xAw0 /driver-root 2>/dev/null)]."

  if [ -n "${NV_PATH}" ] && [ -n "${NV_LIB_PATH}" ]; then
    # Run with clean environment (only LD_PRELOAD; nvidia-smi has only this
    # dependency). Emit message before invocation (nvidia-smi may be slow or
    # hang).
    echo "invoke: env -i LD_PRELOAD=${NV_LIB_PATH} ${NV_PATH}"

    # Always show stderr, maybe hide or filter stdout?
    env -i LD_PRELOAD="${NV_LIB_PATH}" "${NV_PATH}"
    RCODE="$?"

    # For checking GPU driver health: rely on nvidia-smi's exit code. Rely
    # on code 0 signaling that the driver is properly set up. See section
    # 'RETURN VALUE' in the nvidia-smi man page for meaning of error codes.
    if [ "${RCODE}" -eq 0 ]; then
      echo "nvidia-smi returned with code 0: success, leave"

      # Exit script indicating success (leave init container).
      exit 0
    fi
    echo "exit code: ${RCODE}"
  fi

  # Reduce log volume: log hints only every Nth attempt.
  if [ $((_ATTEMPT % 6)) -ne 0 ]; then
    return
  fi

  # nvidia-smi binaries not found, or execution failed. First, provide generic
  # error message. Then, try to provide actionable hints for common problems.
  echo
  emit_common_err

  # For host-provided driver not at / provide feedback for two special cases.
  if [ "${NVIDIA_DRIVER_ROOT}" != "/" ]; then
    if [ -z "$( ls -A /driver-root )" ]; then
      echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host is empty"
    else
      # Not empty, but at least one of the binaries not found: this is a
      # rather pathological state.
      if [ -z "${NV_PATH}" ] || [ -z "${NV_LIB_PATH}" ]; then
        echo "Hint: Directory $NVIDIA_DRIVER_ROOT is not empty but at least one of the binaries wasn't found."
      fi
    fi
  fi

  # Common mistake: driver container, but forgot `--set nvidiaDriverRoot`
  if [ "${NVIDIA_DRIVER_ROOT}" == "/" ] && [ -f /driver-root/run/nvidia/driver/usr/bin/nvidia-smi ]; then
    printf '%b' \
      "Hint: '/run/nvidia/driver/usr/bin/nvidia-smi' exists on the host, you " \
      "may want to re-install the DRA driver Helm chart with " \
      "--set nvidiaDriverRoot=/run/nvidia/driver\n"
  fi

  if [ "${NVIDIA_DRIVER_ROOT}" == "/run/nvidia/driver" ]; then
    printf '%b' \
      "Hint: NVIDIA_DRIVER_ROOT is set to '/run/nvidia/driver' " \
      "which typically means that the NVIDIA GPU Operator " \
      "manages the GPU driver. Make sure that the GPU Operator " \
      "is deployed and healthy.\n"
  fi
  echo
}
| 145 | + |
# DS pods may get deleted (terminated with SIGTERM) and re-created when the GPU
# Operator driver container creates a mount at /run/nvidia. Make that explicit
# by logging a timestamped line before exiting cleanly.
log_sigterm() {
  local _ts
  _ts="$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ")"
  printf '%s: received SIGTERM\n' "${_ts}"
  exit 0
}
trap 'log_sigterm' SIGTERM
| 153 | + |

# Design goal: long-running init container that retries at constant frequency,
# and leaves only upon success (with code 0). The loop never terminates on its
# own: validate_and_exit_on_success exits the process when the check passes.
_WAIT_S=10
_ATTEMPT=0

while :; do
  validate_and_exit_on_success
  sleep "${_WAIT_S}"
  _ATTEMPT=$((_ATTEMPT + 1))
done