Skip to content

Commit 0bb2e4c

Browse files
Store driver config state and compare on restart to enable config change detection
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent a8dbb15 commit 0bb2e4c

File tree

2 files changed

+49
-96
lines changed

2 files changed

+49
-96
lines changed

ubuntu22.04/nvidia-driver

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ _mount_rootfs() {
530530
mount --make-private /sys
531531
mkdir -p ${RUN_DIR}/driver
532532
mount --rbind / ${RUN_DIR}/driver
533+
echo "Driver container rootfs mounted at ${RUN_DIR}/driver"
533534
}
534535

535536
# Unmount the driver rootfs from the run directory.
@@ -665,6 +666,31 @@ _ensure_persistence_running() {
665666
fi
666667
}
667668

669+
# Print, on stdout, a textual snapshot of every input that determines how the
# NVIDIA kernel modules are built and loaded. Two snapshots compare equal only
# when the driver version, running kernel, RDMA/MOFED/module-type settings and
# all module parameter files are unchanged.
# Globals:   DRIVER_VERSION, GPU_DIRECT_RDMA_ENABLED, USE_HOST_MOFED,
#            KERNEL_MODULE_TYPE (all read)
# Arguments: $1 - directory holding the module .conf files (default: /drivers)
# Outputs:   the snapshot, one KEY=VALUE per line, followed by one
#            "[<file name>]" section per existing .conf file
# Returns:   0 on success, 1 if the kernel release or a .conf file cannot be read
_build_driver_config() {
    local conf_dir="${1:-/drivers}"
    local kernel_version conf_name conf_body config

    # Assign separately from 'local' so a failing command is not masked.
    kernel_version=$(uname -r) || return 1

    config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_version}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
USE_HOST_MOFED=${USE_HOST_MOFED}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}"

    # Append each module parameter file under a header naming the file, so that
    # moving a parameter from one module's file to another changes the snapshot.
    for conf_name in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "${conf_dir}/${conf_name}" ]; then
            conf_body=$(cat "${conf_dir}/${conf_name}") || return 1
            config="${config}
[${conf_name}]
${conf_body}"
        fi
    done

    printf '%s\n' "${config}"
}
686+
687+
# Persist the current driver configuration snapshot (as produced by
# _build_driver_config) to /run/nvidia/driver-config.state so that a later
# container start can compare against it.
# Outputs:   progress messages on stdout; a failure message on stderr
# Returns:   0 when the state file was written, 1 otherwise
_store_driver_config() {
    local config_file="/run/nvidia/driver-config.state"
    local tmp_file

    echo "Storing driver configuration state..."

    # Write to a sibling temp file and rename it into place, so a reader never
    # sees a partially written state file.
    if tmp_file=$(mktemp "${config_file}.XXXXXX") \
        && _build_driver_config > "${tmp_file}" \
        && mv -f "${tmp_file}" "${config_file}"; then
        echo "Driver configuration stored at $config_file"
        return 0
    fi

    # Do not leave an outdated snapshot behind: it would describe an earlier
    # installation and could wrongly match a future configuration.
    rm -f "${tmp_file:-}" "${config_file}"
    echo "Failed to store driver configuration at $config_file" >&2
    return 1
}
693+
668694
init() {
669695
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
670696
_find_vgpu_driver_version || exit 1
@@ -683,13 +709,15 @@ init() {
683709
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
684710
trap "_shutdown" EXIT
685711

686-
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
687-
# version, skip kernel module build/load but install userspace components.
712+
# Fast path: if the NVIDIA kernel modules are already loaded and driver config matches,
713+
# skip kernel module build/load but install userspace components.
688714
# This handles non-clean restarts where modules are in use and can't be unloaded.
689-
if [ -f /sys/module/nvidia/refcnt ]; then
690-
loaded_version=$(_read_loaded_version || true)
691-
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
692-
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
715+
if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
716+
current_config=$(_build_driver_config)
717+
stored_config=$(cat /run/nvidia/driver-config.state)
718+
719+
if [ "${current_config}" = "${stored_config}" ]; then
720+
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
693721

694722
# Skip kernel module unload since they're already loaded with correct version
695723
# Unmount any existing rootfs
@@ -715,16 +743,19 @@ init() {
715743
--no-drm \
716744
--no-peermem
717745

718-
# Mount the driver rootfs to make components available
719-
_mount_rootfs
720-
721-
# Ensure persistence daemon is running
722-
_ensure_persistence_running
723-
724-
# Write kernel update hook
725-
_write_kernel_update_hook
726-
727-
echo "Userspace-only install complete, now waiting for signal"
746+
# Mount the driver rootfs to make components available
747+
_mount_rootfs
748+
749+
# Ensure persistence daemon is running
750+
_ensure_persistence_running
751+
752+
# Write kernel update hook
753+
_write_kernel_update_hook
754+
755+
# Store driver configuration
756+
_store_driver_config
757+
758+
echo "Userspace-only install complete, now waiting for signal"
728759
sleep infinity &
729760
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
730761
trap - EXIT
@@ -776,6 +807,7 @@ init() {
776807
_load_driver || exit 1
777808
_mount_rootfs
778809
_write_kernel_update_hook
810+
_store_driver_config
779811

780812
echo "Done, now waiting for signal"
781813
sleep infinity &

ubuntu24.04/nvidia-driver

Lines changed: 1 addition & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -244,33 +244,6 @@ _get_module_params() {
244244
fi
245245
}
246246

247-
# Read the currently loaded NVIDIA driver version from sysfs.
248-
_read_loaded_version() {
249-
cat /sys/module/nvidia/version 2>/dev/null || return 1
250-
}
251-
252-
_is_rootfs_mounted() {
253-
findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1
254-
}
255-
256-
# Ensure the driver rootfs is mounted exactly once.
257-
_ensure_rootfs_mounted_idempotent() {
258-
_is_rootfs_mounted || _mount_rootfs
259-
}
260-
261-
_ensure_persistence_running() {
262-
local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
263-
if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
264-
return 0
265-
fi
266-
267-
if command -v nvidia-persistenced >/dev/null 2>&1; then
268-
nvidia-persistenced --persistence-mode || true
269-
else
270-
echo "nvidia-persistenced not found; continuing without persistence"
271-
fi
272-
}
273-
274247
# Load the kernel modules and start persistenced.
275248
_load_driver() {
276249
echo "Parsing kernel module parameters..."
@@ -611,59 +584,7 @@ init() {
611584
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
612585
trap "_shutdown" EXIT
613586

614-
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
615-
# version, skip kernel module build/load but install userspace components.
616-
# This handles non-clean restarts where modules are in use and can't be unloaded.
617-
if [ -f /sys/module/nvidia/refcnt ]; then
618-
loaded_version=$(_read_loaded_version || true)
619-
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
620-
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
621-
622-
# Skip kernel module unload since they're already loaded with correct version
623-
# Unmount any existing rootfs
624-
_unmount_rootfs
625-
626-
# Update package cache for userspace install
627-
_update_ca_certificates
628-
_update_package_cache
629-
_resolve_kernel_version || exit 1
630-
_install_prerequisites
631-
632-
# Install userspace components only (libraries, binaries)
633-
# The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
634-
echo "Installing userspace components (libraries and binaries)..."
635-
cd /drivers
636-
# Extract the driver first
637-
sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x
638-
cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}
639-
./nvidia-installer \
640-
--silent \
641-
--no-kernel-module \
642-
--no-nouveau-check \
643-
--no-nvidia-modprobe \
644-
--no-drm \
645-
--no-peermem
646-
647-
# Mount the driver rootfs to make components available
648-
_mount_rootfs
649-
650-
# Ensure persistence daemon is running
651-
_ensure_persistence_running
652-
653-
# Write kernel update hook
654-
_write_kernel_update_hook
655-
656-
echo "Userspace-only install complete, now waiting for signal"
657-
sleep infinity &
658-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
659-
trap - EXIT
660-
while true; do wait $! || continue; done
661-
exit 0
662-
fi
663-
fi
664-
665-
666-
_unload_driver || exit 1
587+
_unload_driver || exit 1
667588
_unmount_rootfs
668589

669590
_update_ca_certificates

0 commit comments

Comments
 (0)