Skip to content

Commit b9d821c

Browse files
committed
do not make init container long-running
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent e201a27 commit b9d821c

File tree

1 file changed

+29
-34
lines changed

1 file changed

+29
-34
lines changed

hack/kubelet-plugin-prestart.sh

Lines changed: 29 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -3,10 +3,7 @@
3 3
# Main intent: help users to self-troubleshoot when the GPU driver is not set up
4 4
# properly before installing this DRA driver. In that case, the log of the init
5 5
# container running this script is meant to yield an actionable error message.
6-
#
7-
# Crash-loop (retry periodically) and actively wait for the GPU driver to come
8-
# online (unblock the DRA driver plugin daemonset pods from progressing right
9-
# after the GPU driver setup has been fixed).
6+
# For now, rely on k8s to implement a high-level retry with back-off.
10 7

11 8
if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
12 9
# Not set, or set to empty string (not distinguishable).
@@ -16,8 +13,6 @@ fi
16 13

17 14
echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}"
18 15

19-
while true
20-
do
21 16
if ! command -v nvidia-smi &>/dev/null
22 17
then
23 18
printf '%b' \
@@ -41,38 +36,38 @@ then
41 36
"--set nvidiaDriverRoot=/run/nvidia/driver\n"
42 37
fi
43 38

44-
# /driver-root (which we chrooted to) is empty (qrely on the fact
39+
# /driver-root (which we chrooted to) is empty (rely on the fact
45 40
# that this actually exists on the host: k8s hostPath type: Directory).
46 41
if [ -z "$( ls -A / )" ]; then
47 42
echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host appears to be empty"
48 43
fi
49-
else
50-
# Note: the following path is in-container, after chroot to /driver-root (and
51-
# /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
52-
# This typically outputs /usr/bin/nvidia-smi.
53-
echo "command -v nvidia-smi: $(command -v nvidia-smi)"
54 44

55-
# This may be slow or hang, in a bad setup.
56-
echo "invoking nvidia-smi"
57-
nvidia-smi
58-
RCODE="$?"
45+
exit 1
46+
fi
59 47

60-
# For checking GPU driver health: for now, rely on nvidia-smi's exit code.
61-
# Rely on code 0 meaning that the driver is properly set up. For example,
62-
# code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
63-
# in the man page.
64-
if [ ${RCODE} -eq 0 ]
65-
then
66-
echo "nvidia-smi returned with code 0: success, leave"
67-
exit 0
68-
else
69-
# Do we worry about false-negatives here (where nvidia-smi fails
70-
# e.g. because of one bad GPU but where we could still proceed)?
71-
printf '%b' \
72-
"nvidia-smi returned an error (code ${RCODE}). " \
73-
"Verify that the GPU driver is set up correctly\n"
74-
fi
48+
49+
# Note: the following path is in-container, after chroot to /driver-root (and
50+
# /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
51+
# This typically outputs /usr/bin/nvidia-smi.
52+
echo "command -v nvidia-smi: $(command -v nvidia-smi)"
53+
54+
# This may be slow or hang, in a bad setup.
55+
echo "invoking nvidia-smi"
56+
nvidia-smi
57+
RCODE="$?"
58+
59+
# For checking GPU driver health: for now, rely on nvidia-smi's exit code.
60+
# Rely on code 0 meaning that the driver is properly set up. For example,
61+
# code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
62+
# in the man page.
63+
if [ ${RCODE} -eq 0 ]
64+
then
65+
echo "nvidia-smi returned with code 0: success, leave"
66+
exit 0
67+
else
68+
# Do we worry about false-negatives here (where nvidia-smi fails
69+
# e.g. because of one bad GPU but where we could still proceed)?
70+
printf '%b' \
71+
"nvidia-smi returned an error (code ${RCODE}). " \
72+
"Verify that the GPU driver is set up correctly\n"
75 73
fi
76-
echo "retry in 30 s"
77-
sleep 30
78-
done

0 commit comments

Comments
 (0)