
Commit 0a18f1a

Merge pull request #389 from jgehrcke/jp/plugin-init-cont
Add init container to plugin pod (GPU driver dependency check)
2 parents 9584691 + b33f9fd commit 0a18f1a

File tree: 3 files changed (+211, -3 lines)


deployments/container/Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -94,4 +94,5 @@ COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-do
 COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
 COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
 COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
+COPY --from=build /build/hack/kubelet-plugin-prestart.sh /usr/bin/kubelet-plugin-prestart.sh
 COPY --from=build /build/templates /templates

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 45 additions & 3 deletions
@@ -49,6 +49,33 @@ spec:
       serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
       securityContext:
         {{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
+      initContainers:
+        - name: init-container
+          image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
+          securityContext:
+            privileged: true
+          command: [bash, /usr/bin/kubelet-plugin-prestart.sh]
+          env:
+            - name: NVIDIA_DRIVER_ROOT
+              value: "{{ .Values.nvidiaDriverRoot }}"
+            # Use runc: explicit "void"; otherwise we inherit "all".
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: void
+          volumeMounts:
+            - name: driver-root-parent
+              mountPath: /driver-root-parent
+              {{- if eq "/" .Values.nvidiaDriverRoot }}
+              readOnly: true
+              {{- else }}
+              # In case of the operator-provided driver, another container mounts
+              # the driver onto the host using `mountPropagation: Bidirectional`
+              # (out-of-band of the lifecycle of _this_ pod here). For us to see
+              # that mount, `mountPropagation: HostToContainer` is required (docs:
+              # "if any Pod with Bidirectional mount propagation to the same volume
+              # mounts anything there, the container with HostToContainer mount
+              # propagation will see it.").
+              mountPropagation: HostToContainer
+              {{- end }}
       containers:
       {{- if .Values.resources.computeDomains.enabled }}
         - name: compute-domains
@@ -98,11 +125,13 @@ spec:
               mountPropagation: Bidirectional
             - name: cdi
               mountPath: /var/run/cdi
-            # We always mount the driver root at /driver-root in the container.
             - name: driver-root
               mountPath: /driver-root
               readOnly: true
-            # Pragmatic solution for host-managed drivers located not at /.
+              mountPropagation: HostToContainer
+            # For host-managed drivers located not at /.
+            # TODO: make this more surgical, see discussion in
+            # https://github.com/NVIDIA/k8s-dra-driver-gpu/pull/307.
             - name: host-dev
               mountPath: /dev
       {{- end }}
@@ -156,10 +185,10 @@ spec:
               mountPropagation: Bidirectional
             - name: cdi
               mountPath: /var/run/cdi
-            # We always mount the driver root at /driver-root in the container.
             - name: driver-root
               mountPath: /driver-root
               readOnly: true
+              mountPropagation: HostToContainer
       {{- end }}
       volumes:
         - name: plugins-registry
@@ -171,9 +200,22 @@ spec:
         - name: cdi
           hostPath:
             path: /var/run/cdi
+        - name: driver-root-parent
+          hostPath:
+            # If nvidiaDriverRoot == "/" then its parent is itself. Otherwise, get
+            # its parent by removing any trailing slashes as well as the last path
+            # element with sprig template function `dir`. Examples: /a/b/ -> /a,
+            # /a/b/c -> /a/b.
+            {{- if eq "/" .Values.nvidiaDriverRoot }}
+            path: "/"
+            {{- else }}
+            path: {{ dir (trimSuffix "/" .Values.nvidiaDriverRoot) }}
+            {{- end }}
+            type: DirectoryOrCreate
         - name: driver-root
           hostPath:
             path: {{ .Values.nvidiaDriverRoot }}
+            type: DirectoryOrCreate
         - name: host-dev
           hostPath:
             path: /dev
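
For illustration only (not part of the chart): with a non-"/" driver root, the `dir (trimSuffix "/" ...)` expression above and the basename-based symlink created by the prestart script (see below) line up as in this sketch. The value /run/nvidia/driver is an assumed example.

#!/usr/bin/env bash
# Sketch: path arithmetic for an assumed nvidiaDriverRoot of /run/nvidia/driver.
NVIDIA_DRIVER_ROOT="/run/nvidia/driver"
parent="$(dirname "${NVIDIA_DRIVER_ROOT%/}")"   # -> /run/nvidia, the hostPath of the driver-root-parent volume
leaf="$(basename "${NVIDIA_DRIVER_ROOT%/}")"    # -> driver, the last path element
echo "host path ${parent} is mounted at /driver-root-parent in the init container"
echo "the prestart script links /driver-root -> /driver-root-parent/${leaf}"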

hack/kubelet-plugin-prestart.sh

Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
#!/usr/bin/env bash

# Main intent: help users self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
# For now, rely on k8s to implement a high-level retry with back-off.

if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
    # Not set, or set to empty string (not distinguishable).
    # Normalize to "/" (treated as such elsewhere).
    export NVIDIA_DRIVER_ROOT="/"
fi

# Remove trailing slash (if existing) and get last path element.
_driver_root_path="/driver-root-parent/$(basename "${NVIDIA_DRIVER_ROOT%/}")"

# Create in-container path /driver-root as a symlink. Expectation: the link may
# be broken initially (e.g., if the GPU operator isn't deployed yet). The link
# heals once the driver becomes mounted (e.g., once the GPU operator provides
# the driver on the host at /run/nvidia/driver).
echo "create symlink: /driver-root -> ${_driver_root_path}"
ln -s "${_driver_root_path}" /driver-root

emit_common_err () {
    printf '%b' \
        "Check failed. Has the NVIDIA GPU driver been set up? " \
        "It is expected to be installed under " \
        "NVIDIA_DRIVER_ROOT (currently set to '${NVIDIA_DRIVER_ROOT}') " \
        "in the host filesystem. If that path appears to be unexpected: " \
        "review the DRA driver's 'nvidiaDriverRoot' Helm chart variable. " \
        "Otherwise, review whether the GPU driver has " \
        "actually been installed under that path.\n"
}

validate_and_exit_on_success () {
    echo -n "$(date -u +"%Y-%m-%dT%H:%M:%SZ") /driver-root (${NVIDIA_DRIVER_ROOT} on host): "

    # Search a specific set of directories (not recursively: not required, and
    # /driver-root may be a big tree). Limit to the first result (multiple
    # results are a bit of a pathological state, but continue with the
    # validation logic). Suppress find's stderr: some search directories are
    # expected to be "not found".
    NV_PATH=$( \
        find \
            /driver-root/bin \
            /driver-root/sbin \
            /driver-root/usr/bin \
            /driver-root/usr/sbin \
            -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
    )

    # Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
    # maxdepth 1 also protects against any potential symlink loop (we're
    # suppressing find's stderr, so we'd never see messages like 'Too many
    # levels of symbolic links').
    NV_LIB_PATH=$( \
        find -L \
            /driver-root/usr/lib64 \
            /driver-root/usr/lib/x86_64-linux-gnu \
            /driver-root/usr/lib/aarch64-linux-gnu \
            /driver-root/lib64 \
            /driver-root/lib/x86_64-linux-gnu \
            /driver-root/lib/aarch64-linux-gnu \
            -maxdepth 1 -type f -name "libnvidia-ml.so.1" 2> /dev/null | head -n1
    )

    if [ -z "${NV_PATH}" ]; then
        echo -n "nvidia-smi: not found, "
    else
        echo -n "nvidia-smi: '${NV_PATH}', "
    fi

    if [ -z "${NV_LIB_PATH}" ]; then
        echo -n "libnvidia-ml.so.1: not found, "
    else
        echo -n "libnvidia-ml.so.1: '${NV_LIB_PATH}', "
    fi

    # Log top-level entries in /driver-root (this may be valuable debug info).
    echo "current contents: [$(/bin/ls -1xAw0 /driver-root 2>/dev/null)]."

    if [ -n "${NV_PATH}" ] && [ -n "${NV_LIB_PATH}" ]; then
        # Run with a clean environment (only LD_PRELOAD; nvidia-smi has only
        # this dependency). Emit the message before invocation (nvidia-smi may
        # be slow or hang).
        echo "invoke: env -i LD_PRELOAD=${NV_LIB_PATH} ${NV_PATH}"

        # Always show stderr, maybe hide or filter stdout?
        env -i LD_PRELOAD="${NV_LIB_PATH}" "${NV_PATH}"
        RCODE="$?"

        # For checking GPU driver health: rely on nvidia-smi's exit code. Rely
        # on code 0 signaling that the driver is properly set up. See section
        # 'RETURN VALUE' in the nvidia-smi man page for the meaning of error
        # codes.
        if [ ${RCODE} -eq 0 ]; then
            echo "nvidia-smi returned with code 0: success, leave"

            # Exit script indicating success (leave init container).
            exit 0
        fi
        echo "exit code: ${RCODE}"
    fi

    # Reduce log volume: log hints only every Nth attempt.
    if [ $((_ATTEMPT % 6)) -ne 0 ]; then
        return
    fi

    # nvidia-smi binaries not found, or execution failed. First, provide a
    # generic error message. Then, try to provide actionable hints for common
    # problems.
    echo
    emit_common_err

    # For a host-provided driver not at /, provide feedback for two special cases.
    if [ "${NVIDIA_DRIVER_ROOT}" != "/" ]; then
        if [ -z "$( ls -A /driver-root )" ]; then
            echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host is empty"
        else
            # Not empty, but at least one of the binaries not found: this is a
            # rather pathological state.
            if [ -z "${NV_PATH}" ] || [ -z "${NV_LIB_PATH}" ]; then
                echo "Hint: Directory $NVIDIA_DRIVER_ROOT is not empty but at least one of the binaries wasn't found."
            fi
        fi
    fi

    # Common mistake: driver container, but forgot `--set nvidiaDriverRoot`.
    if [ "${NVIDIA_DRIVER_ROOT}" == "/" ] && [ -f /driver-root/run/nvidia/driver/usr/bin/nvidia-smi ]; then
        printf '%b' \
            "Hint: '/run/nvidia/driver/usr/bin/nvidia-smi' exists on the host, you " \
            "may want to re-install the DRA driver Helm chart with " \
            "--set nvidiaDriverRoot=/run/nvidia/driver\n"
    fi

    if [ "${NVIDIA_DRIVER_ROOT}" == "/run/nvidia/driver" ]; then
        printf '%b' \
            "Hint: NVIDIA_DRIVER_ROOT is set to '/run/nvidia/driver' " \
            "which typically means that the NVIDIA GPU Operator " \
            "manages the GPU driver. Make sure that the GPU Operator " \
            "is deployed and healthy.\n"
    fi
    echo
}

# DS pods may get deleted (terminated with SIGTERM) and re-created when the GPU
# Operator driver container creates a mount at /run/nvidia. Make that explicit.
log_sigterm() {
    echo "$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ"): received SIGTERM"
    exit 0
}
trap 'log_sigterm' SIGTERM


# Design goal: long-running init container that retries at constant frequency,
# and leaves only upon success (with code 0).
_WAIT_S=10
_ATTEMPT=0

while true
do
    validate_and_exit_on_success
    sleep ${_WAIT_S}
    _ATTEMPT=$((_ATTEMPT+1))
done
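
To observe the check in a running cluster, the init container's log can be tailed. A minimal sketch, assuming the chart is installed into the namespace nvidia-dra-driver-gpu and the kubelet-plugin pods carry the standard app.kubernetes.io/name label (both are assumptions; adjust to the actual deployment):

# Hypothetical log check; namespace and label selector are assumptions.
kubectl logs -n nvidia-dra-driver-gpu \
    -l app.kubernetes.io/name=nvidia-dra-driver-gpu \
    -c init-container --tail=20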
