Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deployments/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,5 @@ COPY --from=build /artifacts/compute-domain-controller /usr/bin/compute-do
COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
COPY --from=build /artifacts/compute-domain-daemon /usr/bin/compute-domain-daemon
COPY --from=build /artifacts/gpu-kubelet-plugin /usr/bin/gpu-kubelet-plugin
COPY --from=build /build/hack/kubelet-plugin-prestart.sh /usr/bin/kubelet-plugin-prestart.sh
COPY --from=build /build/templates /templates
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,33 @@ spec:
serviceAccountName: {{ include "nvidia-dra-driver-gpu.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
initContainers:
- name: init-container
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
securityContext:
privileged: true
command: [bash, /usr/bin/kubelet-plugin-prestart.sh]
env:
- name: NVIDIA_DRIVER_ROOT
value: "{{ .Values.nvidiaDriverRoot }}"
# Use runc: explicit "void"; otherwise we inherit "all".
- name: NVIDIA_VISIBLE_DEVICES
value: void
volumeMounts:
- name: driver-root-parent
mountPath: /driver-root-parent
{{- if eq "/" .Values.nvidiaDriverRoot }}
readOnly: true
{{- else }}
# In case of the operator-provided driver, another container mounts
# the driver onto the host using `mountPropagation: Bidirectional`
# (out-of-band of the lifecycle of _this_ pod here). For us to see
# that mount, `mountPropagation: HostToContainer` is required (docs:
# "if any Pod with Bidirectional mount propagation to the same volume
# mounts anything there, the container with HostToContainer mount
# propagation will see it.").
mountPropagation: HostToContainer
{{- end }}
containers:
{{- if .Values.resources.computeDomains.enabled }}
- name: compute-domains
Expand Down Expand Up @@ -98,11 +125,13 @@ spec:
mountPropagation: Bidirectional
- name: cdi
mountPath: /var/run/cdi
# We always mount the driver root at /driver-root in the container.
- name: driver-root
mountPath: /driver-root
readOnly: true
# Pragmatic solution for host-managed drivers located not at /.
mountPropagation: HostToContainer
# For host-managed drivers located not at /.
# TODO: make this more surgical, see discussion in
# https://github.com/NVIDIA/k8s-dra-driver-gpu/pull/307.
- name: host-dev
mountPath: /dev
{{- end }}
Expand Down Expand Up @@ -156,10 +185,10 @@ spec:
mountPropagation: Bidirectional
- name: cdi
mountPath: /var/run/cdi
# We always mount the driver root at /driver-root in the container.
- name: driver-root
mountPath: /driver-root
readOnly: true
mountPropagation: HostToContainer
{{- end }}
volumes:
- name: plugins-registry
Expand All @@ -171,9 +200,22 @@ spec:
- name: cdi
hostPath:
path: /var/run/cdi
- name: driver-root-parent
hostPath:
# If nvidiaDriverRoot == "/" then its parent is itself. Otherwise, get
# its parent by removing any trailing slashes as well as the last path
# element with sprig template function `dir`. Examples: /a/b/ -> /a,
# /a/b/c -> /a/b.
{{- if eq "/" .Values.nvidiaDriverRoot }}
path: "/"
{{- else }}
path: {{ dir (trimSuffix "/" .Values.nvidiaDriverRoot) }}
{{- end }}
type: DirectoryOrCreate
- name: driver-root
hostPath:
path: {{ .Values.nvidiaDriverRoot }}
type: DirectoryOrCreate
- name: host-dev
hostPath:
path: /dev
Expand Down
165 changes: 165 additions & 0 deletions hack/kubelet-plugin-prestart.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env bash

# Main intent: help users to self-troubleshoot when the GPU driver is not set up
# properly before installing this DRA driver. In that case, the log of the init
# container running this script is meant to yield an actionable error message.
# For now, rely on k8s to implement a high-level retry with back-off.

if [ -z "$NVIDIA_DRIVER_ROOT" ]; then
    # Not set, or set to empty string (not distinguishable).
    # Normalize to "/" (treated as such elsewhere).
    export NVIDIA_DRIVER_ROOT="/"
fi

# Remove trailing slash (if existing) and get last path element.
_driver_root_path="/driver-root-parent/$(basename "${NVIDIA_DRIVER_ROOT%/}")"

# Create in-container path /driver-root as a symlink. Expectation: link may be
# broken initially (e.g., if the GPU operator isn't deployed yet). The link
# heals once the driver becomes mounted (e.g., once GPU operator provides the
# driver on the host at /run/nvidia/driver).
# Use -sfn so re-execution is idempotent: replace an existing (possibly stale)
# link instead of failing with 'File exists'.
echo "create symlink: /driver-root -> ${_driver_root_path}"
ln -sfn "${_driver_root_path}" /driver-root

# Print the generic failure message, interpolating the current value of
# NVIDIA_DRIVER_ROOT so the user can review/correct the Helm chart setting.
emit_common_err () {
    local _msg
    _msg="Check failed. Has the NVIDIA GPU driver been set up? "
    _msg+="It is expected to be installed under "
    _msg+="NVIDIA_DRIVER_ROOT (currently set to '${NVIDIA_DRIVER_ROOT}') "
    _msg+="in the host filesystem. If that path appears to be unexpected: "
    _msg+="review the DRA driver's 'nvidiaDriverRoot' Helm chart variable. "
    _msg+="Otherwise, review if the GPU driver has "
    _msg+="actually been installed under that path."
    printf '%s\n' "${_msg}"
}

# Probe the driver installation under /driver-root. On success (nvidia-smi
# found and returning code 0): exit the whole script with code 0, which lets
# the init container complete. On failure: log current state on a single line,
# and emit actionable hints (only every Nth attempt, to reduce log volume).
# Reads: NVIDIA_DRIVER_ROOT, _ATTEMPT (maintained by the caller's retry loop).
validate_and_exit_on_success () {
    echo -n "$(date -u +"%Y-%m-%dT%H:%M:%SZ") /driver-root (${NVIDIA_DRIVER_ROOT} on host): "

    # Search specific set of directories (not recursively: not required, and
    # /driver-root may be a big tree). Limit to first result (multiple results
    # are a bit of a pathological state, but continue with validation logic).
    # Suppress find stderr: some search directories are expected to be "not
    # found".
    # Fix: the original listed /driver-root/sbin twice and did not search
    # /driver-root/usr/sbin.
    NV_PATH=$( \
        find \
            /driver-root/bin \
            /driver-root/sbin \
            /driver-root/usr/bin \
            /driver-root/usr/sbin \
            -maxdepth 1 -type f -name "nvidia-smi" 2> /dev/null | head -n1
    )

    # Follow symlinks (-L), because `libnvidia-ml.so.1` is typically a link.
    # maxdepth 1 also protects against any potential symlink loop (we're
    # suppressing find's stderr, so we'd never see messages like 'Too many
    # levels of symbolic links').
    NV_LIB_PATH=$( \
        find -L \
            /driver-root/usr/lib64 \
            /driver-root/usr/lib/x86_64-linux-gnu \
            /driver-root/usr/lib/aarch64-linux-gnu \
            /driver-root/lib64 \
            /driver-root/lib/x86_64-linux-gnu \
            /driver-root/lib/aarch64-linux-gnu \
            -maxdepth 1 -type f -name "libnvidia-ml.so.1" 2> /dev/null | head -n1
    )

    if [ -z "${NV_PATH}" ]; then
        echo -n "nvidia-smi: not found, "
    else
        echo -n "nvidia-smi: '${NV_PATH}', "
    fi

    if [ -z "${NV_LIB_PATH}" ]; then
        echo -n "libnvidia-ml.so.1: not found, "
    else
        echo -n "libnvidia-ml.so.1: '${NV_LIB_PATH}', "
    fi

    # Log top-level entries in /driver-root (this may be valuable debug info).
    echo "current contents: [$(/bin/ls -1xAw0 /driver-root 2>/dev/null)]."

    if [ -n "${NV_PATH}" ] && [ -n "${NV_LIB_PATH}" ]; then
        # Run with clean environment (only LD_PRELOAD; nvidia-smi has only this
        # dependency). Emit message before invocation (nvidia-smi may be slow or
        # hang).
        echo "invoke: env -i LD_PRELOAD=${NV_LIB_PATH} ${NV_PATH}"

        # Always show stderr, maybe hide or filter stdout?
        env -i LD_PRELOAD="${NV_LIB_PATH}" "${NV_PATH}"
        RCODE="$?"

        # For checking GPU driver health: rely on nvidia-smi's exit code. Rely
        # on code 0 signaling that the driver is properly set up. See section
        # 'RETURN VALUE' in the nvidia-smi man page for meaning of error codes.
        if [ "${RCODE}" -eq 0 ]; then
            echo "nvidia-smi returned with code 0: success, leave"

            # Exit script indicating success (leave init container).
            exit 0
        fi
        echo "exit code: ${RCODE}"
    fi

    # Reduce log volume: log hints only every Nth attempt.
    if [ $((_ATTEMPT % 6)) -ne 0 ]; then
        return
    fi

    # nvidia-smi binaries not found, or execution failed. First, provide generic
    # error message. Then, try to provide actionable hints for common problems.
    echo
    emit_common_err

    # For host-provided driver not at / provide feedback for two special cases.
    if [ "${NVIDIA_DRIVER_ROOT}" != "/" ]; then
        if [ -z "$( ls -A /driver-root )" ]; then
            echo "Hint: Directory $NVIDIA_DRIVER_ROOT on the host is empty"
        else
            # Not empty, but at least one of the binaries not found: this is a
            # rather pathological state.
            if [ -z "${NV_PATH}" ] || [ -z "${NV_LIB_PATH}" ]; then
                echo "Hint: Directory $NVIDIA_DRIVER_ROOT is not empty but at least one of the binaries wasn't found."
            fi
        fi
    fi

    # Common mistake: driver container, but forgot `--set nvidiaDriverRoot`
    if [ "${NVIDIA_DRIVER_ROOT}" == "/" ] && [ -f /driver-root/run/nvidia/driver/usr/bin/nvidia-smi ]; then
        printf '%b' \
            "Hint: '/run/nvidia/driver/usr/bin/nvidia-smi' exists on the host, you " \
            "may want to re-install the DRA driver Helm chart with " \
            "--set nvidiaDriverRoot=/run/nvidia/driver\n"
    fi

    if [ "${NVIDIA_DRIVER_ROOT}" == "/run/nvidia/driver" ]; then
        printf '%b' \
            "Hint: NVIDIA_DRIVER_ROOT is set to '/run/nvidia/driver' " \
            "which typically means that the NVIDIA GPU Operator " \
            "manages the GPU driver. Make sure that the GPU Operator " \
            "is deployed and healthy.\n"
    fi
    echo
}

# DS pods may get deleted (terminated with SIGTERM) and re-created when the GPU
# Operator driver container creates a mount at /run/nvidia. Make that explicit
# in the log, then terminate cleanly (exit code 0).
log_sigterm() {
    printf '%s: received SIGTERM\n' "$(date -u +"%Y-%m-%dT%H:%M:%S.%3NZ")"
    exit 0
}
trap 'log_sigterm' SIGTERM


# Design goal: long-running init container that retries at constant frequency,
# and leaves only upon success (with code 0).
_WAIT_S=10
_ATTEMPT=0

# Loop forever; validate_and_exit_on_success terminates the script on success.
while :; do
    validate_and_exit_on_success
    sleep "${_WAIT_S}"
    _ATTEMPT=$((_ATTEMPT + 1))
done
Loading