do not make init container long-running

jgehrcke · jgehrcke · commit b9d821c1c5df · 2025-06-05T17:41:45.000+02:00
Signed-off-by: Dr. Jan-Philip Gehrcke <jgehrcke@nvidia.com>
diff --git a/hack/kubelet-plugin-prestart.sh b/hack/kubelet-plugin-prestart.sh
@@ -3,10 +3,7 @@
 # Main intent: help users to self-troubleshoot when the GPU driver is not set up
 # properly before installing this DRA driver. In that case, the log of the init
 # container running this script is meant to yield an actionable error message.
-#
-# Crash-loop (retry periodically) and actively wait for the GPU driver to come
-# online (unblock the DRA driver plugin daemonset pods from progressing right
-# after the GPU driver setup has been fixed).
+# For now, rely on k8s to implement a high-level retry with back-off.
 
 if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
     # Not set, or set to empty string (not distinguishable).
@@ -16,8 +13,6 @@ fi
 
 echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}"
 
-while true
-do
 if ! command -v nvidia-smi &>/dev/null
 then
     printf '%b' \
@@ -41,38 +36,38 @@ then
         "--set nvidiaDriverRoot=/run/nvidia/driver\n"
     fi
 
-    # /driver-root (which we chrooted to) is empty (qrely on the fact
+    # /driver-root (which we chrooted to) is empty (rely on the fact
     # that this actually exists on the host: k8s hostPath type: Directory).
     if [ -z "$( ls -A / )" ]; then
         echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host appears to be empty"
     fi
-else
-    # Note: the following path is in-container, after chroot to /driver-root (and
-    # /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
-    # This typically outputs /usr/bin/nvidia-smi.
-    echo "command -v nvidia-smi: $(command -v nvidia-smi)"
 
-    # This may be slow or hang, in a bad setup.
-    echo "invoking nvidia-smi"
-    nvidia-smi
-    RCODE="$?"
+    exit 1
+fi
 
-    # For checking GPU driver health: for now, rely on nvidia-smi's exit code.
-    # Rely on code 0 meaning that the driver is properly set up. For example,
-    # code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
-    # in the man page.
-    if [ ${RCODE} -eq 0 ]
-    then
-        echo "nvidia-smi returned with code 0: success, leave"
-        exit 0
-    else
-        # Do we worry about false-negatives here (where nvidia-smi fails
-        # e.g. because of one bad GPU but where we could still proceed)?
-        printf '%b' \
-        "nvidia-smi returned an error (code ${RCODE}). " \
-        "Verify that the GPU driver is set up correctly\n"
-    fi
+
+# Note: the following path is in-container, after chroot to /driver-root (and
+# /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
+# This typically outputs /usr/bin/nvidia-smi.
+echo "command -v nvidia-smi: $(command -v nvidia-smi)"
+
+# This may be slow or hang, in a bad setup.
+echo "invoking nvidia-smi"
+nvidia-smi
+RCODE="$?"
+
+# For checking GPU driver health: for now, rely on nvidia-smi's exit code.
+# Rely on code 0 meaning that the driver is properly set up. For example,
+# code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
+# in the man page.
+if [ ${RCODE} -eq 0 ]
+then
+    echo "nvidia-smi returned with code 0: success, leave"
+    exit 0
+else
+    # Do we worry about false-negatives here (where nvidia-smi fails
+    # e.g. because of one bad GPU but where we could still proceed)?
+    printf '%b' \
+    "nvidia-smi returned an error (code ${RCODE}). " \
+    "Verify that the GPU driver is set up correctly\n"
 fi
-echo "retry in 30 s"
-sleep 30
-done