Skip to content

Commit 71bde39

Browse files
committed
Add init container to plugin pod (GPU driver dependency check)
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent f537b96 commit 71bde39

File tree

3 files changed

+91
-0
lines changed

3 files changed

+91
-0
lines changed

deployments/container/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,5 @@ COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-do
9494
COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
9595
COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
9696
COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
97+
COPY --from=build /build/hack/kubelet-plugin-prestart.sh /usr/bin/kubelet-plugin-prestart.sh
9798
COPY --from=build /build/templates /templates

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,24 @@ spec:
4949
serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
5050
securityContext:
5151
{{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
52+
initContainers:
53+
- name: init-container
54+
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
55+
securityContext:
56+
privileged: true
57+
# Bashisms are used in kubelet-plugin-prestart.sh. Feed logic from
58+
# stdin: script path does not exist in chroot environment.
59+
command: [sh, "-c", "chroot /driver-root bash -s < /usr/bin/kubelet-plugin-prestart.sh"]
60+
env:
61+
- name: NVIDIA_DRIVER_ROOT
62+
value: "{{ .Values.nvidiaDriverRoot }}"
63+
# Explicit "void"; otherwise we inherit "all".
64+
- name: NVIDIA_VISIBLE_DEVICES
65+
value: void
66+
volumeMounts:
67+
- name: driver-root
68+
mountPath: /driver-root
69+
readOnly: true
5270
containers:
5371
{{- if .Values.resources.computeDomains.enabled }}
5472
- name: compute-domains

hack/kubelet-plugin-prestart.sh

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env bash

# Main intent: help users to self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
#
# Crash-loop (retry periodically) and actively wait for the GPU driver to come
# online (unblock the DRA driver plugin daemonset pods from progressing right
# after the GPU driver setup has been fixed).

# Emit actionable guidance when nvidia-smi is not found in PATH, including a
# hint for a specific, common misconfiguration (GPU Operator-managed driver
# while nvidiaDriverRoot was left at "/").
emit_missing_smi_help() {
  printf '%b' \
    "Not in PATH: 'nvidia-smi'. " \
    "Has the NVIDIA GPU driver been set up? " \
    "The GPU driver is expected to be installed under " \
    "NVIDIA_DRIVER_ROOT ('${NVIDIA_DRIVER_ROOT}') in the host filesystem. " \
    "If NVIDIA_DRIVER_ROOT appears to be unexpected: " \
    "review and adjust the 'nvidiaDriverRoot' Helm chart variable. " \
    "If the value is expected: review if the GPU driver has " \
    "actually been installed under NVIDIA_DRIVER_ROOT. " \
    "If you chose the NVIDIA GPU Operator to manage the GPU driver " \
    "(NVIDIA_DRIVER_ROOT set to /run/nvidia/driver): "\
    "make sure that Operator is deployed and healthy.\n"

  # Provide hint for a specific, common mistake.
  if [[ "${NVIDIA_DRIVER_ROOT}" == "/" ]] && [[ -f /run/nvidia/driver/usr/bin/nvidia-smi ]]; then
    printf '%b' \
      "Note: /run/nvidia/driver/usr/bin/nvidia-smi exists on the host, you " \
      "may want to re-install the DRA driver Helm chart with " \
      "--set nvidiaDriverRoot=/run/nvidia/driver\n"
  fi
}

# Run nvidia-smi and report. Returns 0 when the driver looks healthy,
# non-zero otherwise.
probe_driver() {
  # Note: the following path is in-container, after chroot to /driver-root (and
  # /driver-root is where NVIDIA_DRIVER_ROOT is mounted from the host filesystem).
  # This typically outputs /usr/bin/nvidia-smi.
  echo "command -v nvidia-smi: $(command -v nvidia-smi)"

  # This may be slow or hang, in a bad setup.
  echo "invoking nvidia-smi"
  nvidia-smi
  local smi_status=$?

  # For checking GPU driver health: for now, rely on nvidia-smi's exit code.
  # Rely on code 0 meaning that the driver is properly set up. For example,
  # code 9 means that the GPU driver is not loaded; see section 'RETURN VALUE'
  # in the man page.
  if (( smi_status == 0 )); then
    echo "nvidia-smi returned with code 0: success, leave"
    return 0
  fi

  # Do we worry about false-negatives here (where nvidia-smi fails
  # e.g. because of one bad GPU but where we could still proceed)?
  printf '%b' \
    "nvidia-smi returned an error (code ${smi_status}). " \
    "Verify that the GPU driver is set up correctly\n"
  return 1
}

# Not set, or set to empty string (not distinguishable).
# Normalize to "/" (treated as such elsewhere).
[[ -n "${NVIDIA_DRIVER_ROOT}" ]] || export NVIDIA_DRIVER_ROOT="/"

echo "NVIDIA_DRIVER_ROOT (path on host): ${NVIDIA_DRIVER_ROOT}"

while :; do
  if command -v nvidia-smi &>/dev/null; then
    probe_driver && exit 0
  else
    emit_missing_smi_help
  fi
  echo "retry in 30 s"
  sleep 30
done

0 commit comments

Comments
 (0)