Skip to content

Commit a8dbb15

Browse files
feat: implement userspace-only reinstall for non-clean driver restarts and fix scenario handling
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 7c178f6 commit a8dbb15

File tree

2 files changed

+130
-21
lines changed

2 files changed

+130
-21
lines changed

ubuntu22.04/nvidia-driver

Lines changed: 93 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -638,11 +638,104 @@ _start_vgpu_topology_daemon() {
638638
nvidia-topologyd
639639
}
640640

641+
# Print the version of the currently loaded NVIDIA kernel module, as exposed
# by sysfs in /sys/module/nvidia/version.
# Outputs: the contents of the version file on stdout.
# Returns: 0 when the file could be read, 1 otherwise (cat errors are hidden).
_read_loaded_version() {
    local version_file=/sys/module/nvidia/version

    if ! cat "${version_file}" 2>/dev/null; then
        return 1
    fi
}
645+
646+
# Ask findmnt whether it knows a mount entry for ${RUN_DIR}/driver.
# Globals: RUN_DIR (read)
# Outputs: nothing; findmnt's stdout and stderr are discarded.
# Returns: findmnt's exit status (0 when an entry was found).
_is_rootfs_mounted() {
    local rootfs_dir="${RUN_DIR}/driver"

    findmnt --raw --noheadings --output TARGET "${rootfs_dir}" &>/dev/null
}
649+
650+
# Mount the driver rootfs unless _is_rootfs_mounted reports that it is already
# mounted; intended to make repeated calls safe.
# Returns: 0 when the rootfs was already mounted, otherwise the exit status
#          of _mount_rootfs.
_ensure_rootfs_mounted_idempotent() {
    if ! _is_rootfs_mounted; then
        _mount_rootfs
    fi
}
654+
655+
# Make sure the NVIDIA persistence daemon is running (best effort).
#
# If the PID file names a live process, nothing is done. Otherwise
# nvidia-persistenced is started in persistence mode when the binary is
# available; a missing binary or a failed start is reported but never fatal.
#
# Globals:
#   NVIDIA_PERSISTENCED_PID_FILE (read, optional) - PID file to inspect;
#     defaults to /var/run/nvidia-persistenced/nvidia-persistenced.pid
# Outputs:
#   stdout - notice when nvidia-persistenced is not installed
#   stderr - warning when nvidia-persistenced fails to start
# Returns:
#   0 always
_ensure_persistence_running() {
    local pid_file="${NVIDIA_PERSISTENCED_PID_FILE:-/var/run/nvidia-persistenced/nvidia-persistenced.pid}"
    local pid=""

    # Use `read` instead of $(<file 2>/dev/null): bash only applies the
    # $(<file) shortcut when that redirection is the sole content of the
    # substitution (at least in bash 5.1 and earlier), so the extra
    # 2>/dev/null made the PID come back empty and the check below never
    # succeeded.
    if [[ -r "${pid_file}" ]]; then
        # read returns non-zero when the file lacks a trailing newline but
        # still assigns the value, hence the `|| true`.
        { read -r pid <"${pid_file}"; } 2>/dev/null || true
    fi

    # Only probe strictly positive numeric PIDs: `kill -0 0` or a negative
    # value would target process groups and succeed spuriously.
    if [[ "${pid}" =~ ^[1-9][0-9]*$ ]] && kill -0 "${pid}" 2>/dev/null; then
        return 0
    fi

    if command -v nvidia-persistenced >/dev/null 2>&1; then
        # Best effort: report a failed start instead of silently ignoring it.
        if ! nvidia-persistenced --persistence-mode; then
            echo "Warning: failed to start nvidia-persistenced; continuing without persistence" >&2
        fi
    else
        echo "nvidia-persistenced not found; continuing without persistence"
    fi

    return 0
}
667+
641668
init() {
642669
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
643670
_find_vgpu_driver_version || exit 1
644671
fi
645672

673+
echo -e "\n========== NVIDIA Software Installer ==========\n"
674+
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
675+
676+
exec 3> ${PID_FILE}
677+
if ! flock -n 3; then
678+
echo "An instance of the NVIDIA driver is already running, aborting"
679+
exit 1
680+
fi
681+
echo $$ >&3
682+
683+
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
684+
trap "_shutdown" EXIT
685+
686+
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
687+
# version, skip kernel module build/load but install userspace components.
688+
# This handles non-clean restarts where modules are in use and can't be unloaded.
689+
if [ -f /sys/module/nvidia/refcnt ]; then
690+
loaded_version=$(_read_loaded_version || true)
691+
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
692+
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
693+
694+
# Skip kernel module unload since they're already loaded with correct version
695+
# Unmount any existing rootfs
696+
_unmount_rootfs
697+
698+
# Refresh the package cache, resolve the kernel version, and install prerequisites for the userspace install
699+
_update_package_cache
700+
_resolve_kernel_version || exit 1
701+
_install_prerequisites
702+
703+
# Install userspace components only (libraries, binaries)
704+
# The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
705+
echo "Installing userspace components (libraries and binaries)..."
706+
cd /drivers
707+
# Extract the driver first
708+
sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x
709+
cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}
710+
./nvidia-installer \
711+
--silent \
712+
--no-kernel-module \
713+
--no-nouveau-check \
714+
--no-nvidia-modprobe \
715+
--no-drm \
716+
--no-peermem
717+
718+
# Mount the driver rootfs to make components available
719+
_mount_rootfs
720+
721+
# Ensure persistence daemon is running
722+
_ensure_persistence_running
723+
724+
# Write kernel update hook
725+
_write_kernel_update_hook
726+
727+
echo "Userspace-only install complete, now waiting for signal"
728+
sleep infinity &
729+
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
730+
trap - EXIT
731+
while true; do wait $! || continue; done
732+
exit 0
733+
fi
734+
fi
735+
736+
_unload_driver || exit 1
737+
_unmount_rootfs
738+
646739
# Install the userspace components
647740
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
648741
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
@@ -668,22 +761,6 @@ init() {
668761
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \
669762
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest
670763

671-
echo -e "\n========== NVIDIA Software Installer ==========\n"
672-
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
673-
674-
exec 3> ${PID_FILE}
675-
if ! flock -n 3; then
676-
echo "An instance of the NVIDIA driver is already running, aborting"
677-
exit 1
678-
fi
679-
echo $$ >&3
680-
681-
trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
682-
trap "_shutdown" EXIT
683-
684-
_unload_driver || exit 1
685-
_unmount_rootfs
686-
687764
if _kernel_requires_package; then
688765
_update_ca_certificates
689766
_update_package_cache

ubuntu24.04/nvidia-driver

Lines changed: 37 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -612,16 +612,48 @@ init() {
612612
trap "_shutdown" EXIT
613613

614614
# Fast path: if the NVIDIA kernel modules are already loaded and match the desired
615-
# version, avoid any heavy reinstall/build. Ensure rootfs is mounted and
616-
# persistenced is running, then hold the container.
615+
# version, skip kernel module build/load but install userspace components.
616+
# This handles non-clean restarts where modules are in use and can't be unloaded.
617617
if [ -f /sys/module/nvidia/refcnt ]; then
618618
loaded_version=$(_read_loaded_version || true)
619619
if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then
620-
echo "Detected matching loaded driver (${loaded_version}); skipping reinstall"
621-
_ensure_rootfs_mounted_idempotent
620+
echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install"
621+
622+
# Skip kernel module unload since they're already loaded with correct version
623+
# Unmount any existing rootfs
624+
_unmount_rootfs
625+
626+
# Refresh CA certificates and the package cache, resolve the kernel version, and install prerequisites for the userspace install
627+
_update_ca_certificates
628+
_update_package_cache
629+
_resolve_kernel_version || exit 1
630+
_install_prerequisites
631+
632+
# Install userspace components only (libraries, binaries)
633+
# The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install
634+
echo "Installing userspace components (libraries and binaries)..."
635+
cd /drivers
636+
# Extract the driver first
637+
sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x
638+
cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}
639+
./nvidia-installer \
640+
--silent \
641+
--no-kernel-module \
642+
--no-nouveau-check \
643+
--no-nvidia-modprobe \
644+
--no-drm \
645+
--no-peermem
646+
647+
# Mount the driver rootfs to make components available
648+
_mount_rootfs
649+
650+
# Ensure persistence daemon is running
622651
_ensure_persistence_running
652+
653+
# Write kernel update hook
623654
_write_kernel_update_hook
624-
echo "Done, now waiting for signal"
655+
656+
echo "Userspace-only install complete, now waiting for signal"
625657
sleep infinity &
626658
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
627659
trap - EXIT

0 commit comments

Comments
 (0)