Skip to content

Commit b107ac5

Browse files
Store driver config state and compare on restart to enable config change detection
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent a8dbb15 commit b107ac5

File tree

2 files changed

+49
-110
lines changed

2 files changed

+49
-110
lines changed

ubuntu22.04/nvidia-driver

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,7 @@ _mount_rootfs() {
530530
mount --make-private /sys
531531
mkdir -p ${RUN_DIR}/driver
532532
mount --rbind / ${RUN_DIR}/driver
533+
echo "Driver container rootfs mounted at ${RUN_DIR}/driver"
533534
}
534535

535536
# Unmount the driver rootfs from the run directory.
@@ -638,20 +639,6 @@ _start_vgpu_topology_daemon() {
638639
nvidia-topologyd
639640
}
640641

641-
# Read the currently loaded NVIDIA driver version from sysfs.
642-
_read_loaded_version() {
643-
cat /sys/module/nvidia/version 2>/dev/null || return 1
644-
}
645-
646-
_is_rootfs_mounted() {
647-
findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1
648-
}
649-
650-
# Ensure the driver rootfs is mounted exactly once.
651-
_ensure_rootfs_mounted_idempotent() {
652-
_is_rootfs_mounted || _mount_rootfs
653-
}
654-
655642
_ensure_persistence_running() {
656643
local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
657644
if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
@@ -665,6 +652,31 @@ _ensure_persistence_running() {
665652
fi
666653
}
667654

655+
# Print the effective driver configuration to stdout, one entry per line.
#
# The output is used as a fingerprint of the driver setup: it is written to a
# state file and compared against a freshly built copy on restart to decide
# whether anything changed.
#
# Globals (read): DRIVER_VERSION, GPU_DIRECT_RDMA_ENABLED, USE_HOST_MOFED,
#                 KERNEL_MODULE_TYPE
# Files (read):   /drivers/nvidia.conf, /drivers/nvidia-uvm.conf,
#                 /drivers/nvidia-modeset.conf, /drivers/nvidia-peermem.conf
#                 (each is optional; missing files are skipped)
# Outputs:        the configuration text on stdout
# Returns:        0 on success, 1 if the kernel release or an existing config
#                 file could not be read
_build_driver_config() {
    # Keep every working variable local so nothing (notably the loop variable)
    # leaks into the caller's global scope.
    local kernel_release config conf_file conf_contents

    # Assign separately from 'local' so a failing command is not masked by the
    # always-zero exit status of the 'local' builtin.
    kernel_release=$(uname -r) || return 1

    config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_release}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED}
USE_HOST_MOFED=${USE_HOST_MOFED}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}"

    # Append config file contents directly, in a fixed order so the result is
    # stable between runs.
    for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "/drivers/${conf_file}" ]; then
            # An existing but unreadable file is an error: silently dropping it
            # would produce a fingerprint that does not reflect the real config.
            conf_contents=$(cat "/drivers/${conf_file}") || return 1
            config="${config}
${conf_contents}"
        fi
    done

    # printf instead of echo: the text is arbitrary file content and must not
    # be interpreted as echo options or escape sequences.
    printf '%s\n' "${config}"
}
672+
673+
# Persist the current driver configuration to the state file so it can be
# compared on the next start.
#
# The state is written to a temporary file in the same directory and then
# renamed over the target, so an interrupted write cannot leave a truncated
# state file behind.
#
# Files (written): /run/nvidia/driver-config.state
# Outputs:         progress messages on stdout, errors on stderr
# Returns:         0 on success, 1 if the state could not be built or written
_store_driver_config() {
    local config_file="/run/nvidia/driver-config.state"
    local state_dir tmp_file

    echo "Storing driver configuration state..."

    # Make sure the parent directory exists before writing into it.
    state_dir=${config_file%/*}
    mkdir -p "${state_dir}" || return 1

    # The temp file lives next to the target so the final mv is a rename on the
    # same filesystem rather than a copy.
    tmp_file=$(mktemp "${config_file}.XXXXXX") || return 1

    if ! _build_driver_config > "${tmp_file}"; then
        echo "Failed to build driver configuration state" >&2
        rm -f -- "${tmp_file}"
        return 1
    fi

    if ! mv -f -- "${tmp_file}" "${config_file}"; then
        echo "Failed to write driver configuration state to ${config_file}" >&2
        rm -f -- "${tmp_file}"
        return 1
    fi

    echo "Driver configuration stored at $config_file"
}
679+
668680
init() {
669681
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
670682
_find_vgpu_driver_version || exit 1
@@ -683,13 +695,15 @@ init() {
683695
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
684696
trap "_shutdown" EXIT
685697

686-
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
687-
# version, skip kernel module build/load but install userspace components.
698+
# Fast path: if the NVIDIA kernel modules are already loaded and driver config matches,
699+
# skip kernel module build/load but install userspace components.
688700
# This handles non-clean restarts where modules are in use and can't be unloaded.
689-
if [ -f /sys/module/nvidia/refcnt ]; then
690-
loaded_version=$(_read_loaded_version || true)
691-
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
692-
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
701+
if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then
702+
current_config=$(_build_driver_config)
703+
stored_config=$(cat /run/nvidia/driver-config.state)
704+
705+
if [ "${current_config}" = "${stored_config}" ]; then
706+
echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"
693707

694708
# Skip kernel module unload since the modules are already loaded with a matching driver configuration
695709
# Unmount any existing rootfs
@@ -715,16 +729,19 @@ init() {
715729
--no-drm \
716730
--no-peermem
717731

718-
# Mount the driver rootfs to make components available
719-
_mount_rootfs
720-
721-
# Ensure persistence daemon is running
722-
_ensure_persistence_running
723-
724-
# Write kernel update hook
725-
_write_kernel_update_hook
726-
727-
echo "Userspace-only install complete, now waiting for signal"
732+
# Mount the driver rootfs to make components available
733+
_mount_rootfs
734+
735+
# Ensure persistence daemon is running
736+
_ensure_persistence_running
737+
738+
# Write kernel update hook
739+
_write_kernel_update_hook
740+
741+
# Store driver configuration
742+
_store_driver_config
743+
744+
echo "Userspace-only install complete, now waiting for signal"
728745
sleep infinity &
729746
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
730747
trap - EXIT
@@ -776,6 +793,7 @@ init() {
776793
_load_driver || exit 1
777794
_mount_rootfs
778795
_write_kernel_update_hook
796+
_store_driver_config
779797

780798
echo "Done, now waiting for signal"
781799
sleep infinity &

ubuntu24.04/nvidia-driver

Lines changed: 1 addition & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -244,33 +244,6 @@ _get_module_params() {
244244
fi
245245
}
246246

247-
# Read the currently loaded NVIDIA driver version from sysfs.
248-
_read_loaded_version() {
249-
cat /sys/module/nvidia/version 2>/dev/null || return 1
250-
}
251-
252-
_is_rootfs_mounted() {
253-
findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1
254-
}
255-
256-
# Ensure the driver rootfs is mounted exactly once.
257-
_ensure_rootfs_mounted_idempotent() {
258-
_is_rootfs_mounted || _mount_rootfs
259-
}
260-
261-
_ensure_persistence_running() {
262-
local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid
263-
if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
264-
return 0
265-
fi
266-
267-
if command -v nvidia-persistenced >/dev/null 2>&1; then
268-
nvidia-persistenced --persistence-mode || true
269-
else
270-
echo "nvidia-persistenced not found; continuing without persistence"
271-
fi
272-
}
273-
274247
# Load the kernel modules and start persistenced.
275248
_load_driver() {
276249
echo "Parsing kernel module parameters..."
@@ -611,59 +584,7 @@ init() {
611584
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
612585
trap "_shutdown" EXIT
613586

614-
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
615-
# version, skip kernel module build/load but install userspace components.
616-
# This handles non-clean restarts where modules are in use and can't be unloaded.
617-
if [ -f /sys/module/nvidia/refcnt ]; then
618-
loaded_version=$(_read_loaded_version || true)
619-
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
620-
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
621-
622-
# Skip kernel module unload since they're already loaded with correct version
623-
# Unmount any existing rootfs
624-
_unmount_rootfs
625-
626-
# Update package cache for userspace install
627-
_update_ca_certificates
628-
_update_package_cache
629-
_resolve_kernel_version || exit 1
630-
_install_prerequisites
631-
632-
# Install userspace components only (libraries, binaries)
633-
# The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
634-
echo "Installing userspace components (libraries and binaries)..."
635-
cd /drivers
636-
# Extract the driver first
637-
sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x
638-
cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}
639-
./nvidia-installer \
640-
--silent \
641-
--no-kernel-module \
642-
--no-nouveau-check \
643-
--no-nvidia-modprobe \
644-
--no-drm \
645-
--no-peermem
646-
647-
# Mount the driver rootfs to make components available
648-
_mount_rootfs
649-
650-
# Ensure persistence daemon is running
651-
_ensure_persistence_running
652-
653-
# Write kernel update hook
654-
_write_kernel_update_hook
655-
656-
echo "Userspace-only install complete, now waiting for signal"
657-
sleep infinity &
658-
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
659-
trap - EXIT
660-
while true; do wait $! || continue; done
661-
exit 0
662-
fi
663-
fi
664-
665-
666-
_unload_driver || exit 1
587+
_unload_driver || exit 1
667588
_unmount_rootfs
668589

669590
_update_ca_certificates

0 commit comments

Comments
 (0)