Commit 9c7c0c5

Add init container to plugin pod (GPU driver dependency check)
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent 06ea908 commit 9c7c0c5

3 files changed: 188 additions & 3 deletions

deployments/container/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -94,4 +94,5 @@ COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-do
 COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
 COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
 COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
+COPY --from=build /build/hack/kubelet-plugin-prestart.sh /usr/bin/kubelet-plugin-prestart.sh
 COPY --from=build /build/templates /templates
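The new COPY step bakes the check script into the image next to the plugin binaries. A quick way to confirm the file really lands at the expected path (the image reference is a placeholder for whatever tag your build of this Dockerfile produces):

    # <image> is a placeholder; substitute your locally built image tag.
    docker run --rm --entrypoint ls <image> -l /usr/bin/kubelet-plugin-prestart.sh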

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 30 additions & 3 deletions
@@ -49,6 +49,30 @@ spec:
       serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
       securityContext:
         {{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
+      initContainers:
+      - name: init-container
+        image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
+        securityContext:
+          privileged: true
+        command: [bash, /usr/bin/kubelet-plugin-prestart.sh]
+        env:
+        - name: NVIDIA_DRIVER_ROOT
+          value: "{{ .Values.nvidiaDriverRoot }}"
+        # Use runc: explicit "void"; otherwise we inherit "all".
+        - name: NVIDIA_VISIBLE_DEVICES
+          value: void
+        # In case of the operator-provided driver, another container mounts the
+        # driver onto the host using `mountPropagation: Bidirectional`
+        # (out-of-band of the lifecycle of _this_ pod here). For us to see that
+        # mount, `mountPropagation: HostToContainer` is required (docs: "if any
+        # Pod with Bidirectional mount propagation to the same volume mounts
+        # anything there, the container with HostToContainer mount propagation
+        # will see it.").
+        volumeMounts:
+        - name: driver-root
+          mountPath: /driver-root
+          readOnly: true
+          mountPropagation: HostToContainer
       containers:
       {{- if .Values.resources.computeDomains.enabled }}
       - name: compute-domains
@@ -98,11 +122,13 @@ spec:
           mountPropagation: Bidirectional
         - name: cdi
           mountPath: /var/run/cdi
-        # We always mount the driver root at /driver-root in the container.
         - name: driver-root
           mountPath: /driver-root
           readOnly: true
-        # Pragmatic solution for host-managed drivers located not at /.
+          mountPropagation: HostToContainer
+        # For host-managed drivers located not at /.
+        # TODO: make this more surgical, see discussion in
+        # https://github.com/NVIDIA/k8s-dra-driver-gpu/pull/307.
         - name: host-dev
           mountPath: /dev
       {{- end }}
@@ -156,10 +182,10 @@ spec:
           mountPropagation: Bidirectional
         - name: cdi
           mountPath: /var/run/cdi
-        # We always mount the driver root at /driver-root in the container.
         - name: driver-root
           mountPath: /driver-root
           readOnly: true
+          mountPropagation: HostToContainer
       {{- end }}
       volumes:
       - name: plugins-registry
@@ -174,6 +200,7 @@ spec:
       - name: driver-root
         hostPath:
          path: {{ .Values.nvidiaDriverRoot }}
+          type: DirectoryOrCreate
       - name: host-dev
         hostPath:
           path: /dev
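With this in place, a node whose GPU driver is not (yet) set up correctly keeps the plugin pod in Init:0/1, and the actionable output of the check script ends up in the init container's log. A minimal way to inspect it (namespace and pod name below are placeholders; the container name `init-container` is defined in the template above):

    # Namespace and pod name are placeholders; adjust to your installation.
    kubectl get pods -n <namespace>
    kubectl logs -n <namespace> <kubelet-plugin-pod> -c init-container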

hack/kubelet-plugin-prestart.sh

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
#!/usr/bin/env bash

# Main intent: help users self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
# For now, rely on k8s to implement a high-level retry with back-off.

if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
    # Not set, or set to empty string (not distinguishable).
    # Normalize to "/" (treated as such elsewhere).
    export NVIDIA_DRIVER_ROOT="/"
fi

emit_common_err () {
    printf '%b' \
        "Check failed. Has the NVIDIA GPU driver been set up? " \
        "It is expected to be installed under " \
        "NVIDIA_DRIVER_ROOT (currently set to '${NVIDIA_DRIVER_ROOT}') " \
        "in the host filesystem. If that path appears to be unexpected: " \
        "review the DRA driver's 'nvidiaDriverRoot' Helm chart variable. " \
        "Otherwise, review whether the GPU driver has " \
        "actually been installed under that path.\n"
}

validate_and_exit_on_success () {
    echo -n "$(date -u +"%Y-%m-%dT%H:%M:%SZ") /driver-root (${NVIDIA_DRIVER_ROOT} on host): "

    # Search a specific set of directories (not recursively: not required, and
    # /driver-root may be a big tree). Limit to first result (multiple results
    # are a bit of a pathological state, but continue with validation logic).
    # Suppress find stderr: some search directories are expected to be "not
    # found".

    NV_PATH=$( \
        find \
            /driver-root/bin \
            /driver-root/sbin \
            /driver-root/usr/bin \
            /driver-root/usr/sbin \
            -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
    )

    # Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
    # maxdepth 1 also protects against any potential symlink loop (we're
    # suppressing find's stderr, so we'd never see messages like 'Too many
    # levels of symbolic links').
    NV_LIB_PATH=$( \
        find -L \
            /driver-root/usr/lib64 \
            /driver-root/usr/lib/x86_64-linux-gnu \
            /driver-root/usr/lib/aarch64-linux-gnu \
            /driver-root/lib64 \
            /driver-root/lib/x86_64-linux-gnu \
            /driver-root/lib/aarch64-linux-gnu \
            -maxdepth 1 -type f -name "libnvidia-ml.so.1" 2> /dev/null | head -n1
    )

    if [ -z "${NV_PATH}" ]; then
        echo -n "nvidia-smi: not found, "
    else
        echo -n "nvidia-smi: '${NV_PATH}', "
    fi

    if [ -z "${NV_LIB_PATH}" ]; then
        echo -n "libnvidia-ml.so.1: not found, "
    else
        echo -n "libnvidia-ml.so.1: '${NV_LIB_PATH}', "
    fi

    # Log top-level entries in /driver-root (this may be valuable debug info).
    echo "current contents: [$(/bin/ls -1xAw0 /driver-root 2>/dev/null)]."

    if [ -n "${NV_PATH}" ] && [ -n "${NV_LIB_PATH}" ]; then

        # Run with a clean environment (only LD_PRELOAD; nvidia-smi has only this
        # dependency). Emit message before invocation (nvidia-smi may be slow or
        # hang).
        echo "invoke: env -i LD_PRELOAD=${NV_LIB_PATH} ${NV_PATH}"

        # Always show stderr, maybe hide or filter stdout?
        env -i LD_PRELOAD="${NV_LIB_PATH}" "${NV_PATH}"
        RCODE="$?"

        # For checking GPU driver health: rely on nvidia-smi's exit code. Rely
        # on code 0 signaling that the driver is properly set up. See section
        # 'RETURN VALUE' in the nvidia-smi man page for the meaning of error codes.
        if [ ${RCODE} -eq 0 ]; then
            echo "nvidia-smi returned with code 0: success, leave"

            # Exit script indicating success (leave init container).
            exit 0
        else
            echo "exit code: ${RCODE}"
        fi
    fi

    # Reduce log volume: log hints only every Nth attempt.
    if [ $((_ATTEMPT % 6)) -ne 0 ]; then
        return
    fi

    # nvidia-smi binary or library not found, or execution failed. First, provide a
    # generic error message. Then, try to provide actionable hints for common problems.
    echo
    emit_common_err

    # For a host-provided driver not at /, provide feedback for two special cases.
    if [ "${NVIDIA_DRIVER_ROOT}" != "/" ]; then
        if [ -z "$( ls -A /driver-root )" ]; then
            echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host is empty."
        else
            # Not empty, but at least one of the required files not found: this is
            # a rather pathological state.
            if [ -z "${NV_PATH}" ] || [ -z "${NV_LIB_PATH}" ]; then
                echo "Hint: Directory $NVIDIA_DRIVER_ROOT is not empty, but at least one of nvidia-smi / libnvidia-ml.so.1 was not found."
            fi
        fi
    fi

    # Common mistake: driver container, but forgot `--set nvidiaDriverRoot`.
    if [ "${NVIDIA_DRIVER_ROOT}" == "/" ] && [ -f /driver-root/run/nvidia/driver/usr/bin/nvidia-smi ]; then
        printf '%b' \
            "Hint: '/run/nvidia/driver/usr/bin/nvidia-smi' exists on the host, you " \
            "may want to re-install the DRA driver Helm chart with " \
            "--set nvidiaDriverRoot=/run/nvidia/driver\n"
    fi

    if [ "${NVIDIA_DRIVER_ROOT}" == "/run/nvidia/driver" ]; then
        printf '%b' \
            "Hint: NVIDIA_DRIVER_ROOT is set to '/run/nvidia/driver', " \
            "which typically means that the NVIDIA GPU Operator " \
            "manages the GPU driver. Make sure that the GPU Operator " \
            "is deployed and healthy.\n"
    fi
    echo
}

# DS pods may get deleted (terminated with SIGTERM) and re-created when the GPU
# Operator driver container creates a mount at /run/nvidia. Make that explicit.
log_sigterm() {
    echo "$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ"): received SIGTERM"
    exit 0
}
trap 'log_sigterm' SIGTERM


# Design goal: long-running init container that retries at constant frequency,
# and leaves only upon success (with code 0).
_WAIT_S=10
_ATTEMPT=0

while true
do
    validate_and_exit_on_success
    sleep ${_WAIT_S}
    _ATTEMPT=$((_ATTEMPT+1))
done
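The check can also be exercised outside of Kubernetes by mirroring the init container definition above (privileged, NVIDIA_VISIBLE_DEVICES=void, driver root mounted read-only at /driver-root). A rough sketch only; <image> and the host path /run/nvidia/driver are placeholders for your environment:

    # Sketch: run the prestart check directly against a host-managed driver root.
    # <image> and /run/nvidia/driver are placeholders.
    docker run --rm --privileged \
      -e NVIDIA_DRIVER_ROOT=/run/nvidia/driver \
      -e NVIDIA_VISIBLE_DEVICES=void \
      -v /run/nvidia/driver:/driver-root:ro \
      --entrypoint bash \
      <image> /usr/bin/kubelet-plugin-prestart.sh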
