From 7c178f6001cd4eb1301ffede9c8ef60819747b09 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 2 Oct 2025 15:53:57 +0000 Subject: [PATCH 1/4] Add fast-track to skip uninstall/install if NVIDIA driver modules present Signed-off-by: Karthik Vetrivel --- ubuntu24.04/nvidia-driver | 49 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 2449628f..6662c34d 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -244,6 +244,33 @@ _get_module_params() { fi } +# Read the currently loaded NVIDIA driver version from sysfs. +_read_loaded_version() { + cat /sys/module/nvidia/version 2>/dev/null || return 1 +} + +_is_rootfs_mounted() { + findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 +} + +# Ensure the driver rootfs is mounted exactly once. +_ensure_rootfs_mounted_idempotent() { + _is_rootfs_mounted || _mount_rootfs +} + +_ensure_persistence_running() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + # Load the kernel modules and start persistenced. _load_driver() { echo "Parsing kernel module parameters..." @@ -584,7 +611,27 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - _unload_driver || exit 1 + # Fast path: if the NVIDIA kernel modules are already loaded and match the desired + # version, avoid any heavy reinstall/build. Ensure rootfs is mounted and + # persistenced is running, then hold the container. + if [ -f /sys/module/nvidia/refcnt ]; then + loaded_version=$(_read_loaded_version || true) + if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then + echo "Detected matching loaded driver (${loaded_version}); skipping reinstall" + _ensure_rootfs_mounted_idempotent + _ensure_persistence_running + _write_kernel_update_hook + echo "Done, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + fi + + + _unload_driver || exit 1 _unmount_rootfs _update_ca_certificates From a8dbb15864e5e969a44556c33da72d6504ebba07 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Thu, 16 Oct 2025 20:57:33 +0000 Subject: [PATCH 2/4] feat: implement userspace-only reinstall for non-clean driver restarts and fix scenario handling Signed-off-by: Karthik Vetrivel --- ubuntu22.04/nvidia-driver | 109 ++++++++++++++++++++++++++++++++------ ubuntu24.04/nvidia-driver | 42 +++++++++++++-- 2 files changed, 130 insertions(+), 21 deletions(-) diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index da3ec8e3..532f0b33 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -638,11 +638,104 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +# Read the currently loaded NVIDIA driver version from sysfs. +_read_loaded_version() { + cat /sys/module/nvidia/version 2>/dev/null || return 1 +} + +_is_rootfs_mounted() { + findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 +} + +# Ensure the driver rootfs is mounted exactly once. +_ensure_rootfs_mounted_idempotent() { + _is_rootfs_mounted || _mount_rootfs +} + +_ensure_persistence_running() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + init() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 fi + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + # Fast path: if the NVIDIA kernel modules are already loaded and match the desired + # version, skip kernel module build/load but install userspace components. + # This handles non-clean restarts where modules are in use and can't be unloaded. + if [ -f /sys/module/nvidia/refcnt ]; then + loaded_version=$(_read_loaded_version || true) + if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then + echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + + # Skip kernel module unload since they're already loaded with correct version + # Unmount any existing rootfs + _unmount_rootfs + + # Update package cache for userspace install + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + + # Install userspace components only (libraries, binaries) + # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + # Extract the driver first + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-drm \ + --no-peermem + + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running + _ensure_persistence_running + + # Write kernel update hook + _write_kernel_update_hook + + echo "Userspace-only install complete, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + fi + + _unload_driver || exit 1 + _unmount_rootfs + # Install the userspace components sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ @@ -668,22 +761,6 @@ init() { mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION} && \ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest - echo -e "\n========== NVIDIA Software Installer ==========\n" - echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" - - exec 3> ${PID_FILE} - if ! flock -n 3; then - echo "An instance of the NVIDIA driver is already running, aborting" - exit 1 - fi - echo $$ >&3 - - trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM - trap "_shutdown" EXIT - - _unload_driver || exit 1 - _unmount_rootfs - if _kernel_requires_package; then _update_ca_certificates _update_package_cache diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 6662c34d..9bc501c2 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -612,16 +612,48 @@ init() { trap "_shutdown" EXIT # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, avoid any heavy reinstall/build. Ensure rootfs is mounted and - # persistenced is running, then hold the container. + # version, skip kernel module build/load but install userspace components. + # This handles non-clean restarts where modules are in use and can't be unloaded. if [ -f /sys/module/nvidia/refcnt ]; then loaded_version=$(_read_loaded_version || true) if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); skipping reinstall" - _ensure_rootfs_mounted_idempotent + echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + + # Skip kernel module unload since they're already loaded with correct version + # Unmount any existing rootfs + _unmount_rootfs + + # Update package cache for userspace install + _update_ca_certificates + _update_package_cache + _resolve_kernel_version || exit 1 + _install_prerequisites + + # Install userspace components only (libraries, binaries) + # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install + echo "Installing userspace components (libraries and binaries)..." + cd /drivers + # Extract the driver first + sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + ./nvidia-installer \ + --silent \ + --no-kernel-module \ + --no-nouveau-check \ + --no-nvidia-modprobe \ + --no-drm \ + --no-peermem + + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running _ensure_persistence_running + + # Write kernel update hook _write_kernel_update_hook - echo "Done, now waiting for signal" + + echo "Userspace-only install complete, now waiting for signal" sleep infinity & trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM trap - EXIT From ba7e6de0007afc5bf536d39aaa2a39f45667b3aa Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Wed, 5 Nov 2025 21:06:31 +0000 Subject: [PATCH 3/4] Store driver config state and compare on restart to enable config change detection Signed-off-by: Karthik Vetrivel --- ubuntu22.04/nvidia-driver | 84 +++++++++++++++++++++++++-------------- ubuntu24.04/nvidia-driver | 81 +------------------------------------ 2 files changed, 56 insertions(+), 109 deletions(-) diff --git a/ubuntu22.04/nvidia-driver b/ubuntu22.04/nvidia-driver index 532f0b33..85360952 100755 --- a/ubuntu22.04/nvidia-driver +++ b/ubuntu22.04/nvidia-driver @@ -530,6 +530,7 @@ _mount_rootfs() { mount --make-private /sys mkdir -p ${RUN_DIR}/driver mount --rbind / ${RUN_DIR}/driver + echo "Driver container rootfs mounted at ${RUN_DIR}/driver" } # Unmount the driver rootfs from the run directory. @@ -638,20 +639,6 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } -# Read the currently loaded NVIDIA driver version from sysfs. -_read_loaded_version() { - cat /sys/module/nvidia/version 2>/dev/null || return 1 -} - -_is_rootfs_mounted() { - findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 -} - -# Ensure the driver rootfs is mounted exactly once. -_ensure_rootfs_mounted_idempotent() { - _is_rootfs_mounted || _mount_rootfs -} - _ensure_persistence_running() { local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then @@ -665,6 +652,31 @@ _ensure_persistence_running() { fi } +_build_driver_config() { + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED} +USE_HOST_MOFED=${USE_HOST_MOFED} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE}" + + # Append config file contents directly + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_store_driver_config() { + local config_file="/run/nvidia/driver-config.state" + echo "Storing driver configuration state..." + _build_driver_config > "$config_file" + echo "Driver configuration stored at $config_file" +} + init() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -683,13 +695,15 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, skip kernel module build/load but install userspace components. + # Fast path: if the NVIDIA kernel modules are already loaded and driver config matches, + # skip kernel module build/load but install userspace components. # This handles non-clean restarts where modules are in use and can't be unloaded. - if [ -f /sys/module/nvidia/refcnt ]; then - loaded_version=$(_read_loaded_version || true) - if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" + if [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ]; then + current_config=$(_build_driver_config) + stored_config=$(cat /run/nvidia/driver-config.state) + + if [ "${current_config}" = "${stored_config}" ]; then + echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" # Skip kernel module unload since they're already loaded with correct version # Unmount any existing rootfs @@ -715,16 +729,27 @@ init() { --no-drm \ --no-peermem - # Mount the driver rootfs to make components available - _mount_rootfs - - # Ensure persistence daemon is running - _ensure_persistence_running - - # Write kernel update hook - _write_kernel_update_hook + # Determine the kernel module type + _resolve_kernel_type || exit 1 + + # Copy the kernel module sources for sidecar containers (gdrcopy, nvidia-fs, etc.) + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} && \ + cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest - echo "Userspace-only install complete, now waiting for signal" + # Mount the driver rootfs to make components available + _mount_rootfs + + # Ensure persistence daemon is running + _ensure_persistence_running + + # Write kernel update hook + _write_kernel_update_hook + + # Store driver configuration + _store_driver_config + + echo "Userspace-only install complete, now waiting for signal" sleep infinity & trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM trap - EXIT @@ -776,6 +801,7 @@ init() { _load_driver || exit 1 _mount_rootfs _write_kernel_update_hook + _store_driver_config echo "Done, now waiting for signal" sleep infinity & diff --git a/ubuntu24.04/nvidia-driver b/ubuntu24.04/nvidia-driver index 9bc501c2..2449628f 100755 --- a/ubuntu24.04/nvidia-driver +++ b/ubuntu24.04/nvidia-driver @@ -244,33 +244,6 @@ _get_module_params() { fi } -# Read the currently loaded NVIDIA driver version from sysfs. -_read_loaded_version() { - cat /sys/module/nvidia/version 2>/dev/null || return 1 -} - -_is_rootfs_mounted() { - findmnt -rno TARGET "${RUN_DIR}/driver" >/dev/null 2>&1 -} - -# Ensure the driver rootfs is mounted exactly once. -_ensure_rootfs_mounted_idempotent() { - _is_rootfs_mounted || _mount_rootfs -} - -_ensure_persistence_running() { - local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid - if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then - return 0 - fi - - if command -v nvidia-persistenced >/dev/null 2>&1; then - nvidia-persistenced --persistence-mode || true - else - echo "nvidia-persistenced not found; continuing without persistence" - fi -} - # Load the kernel modules and start persistenced. _load_driver() { echo "Parsing kernel module parameters..." @@ -611,59 +584,7 @@ init() { trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM trap "_shutdown" EXIT - # Fast path: if the NVIDIA kernel modules are already loaded and match the desired - # version, skip kernel module build/load but install userspace components. - # This handles non-clean restarts where modules are in use and can't be unloaded. - if [ -f /sys/module/nvidia/refcnt ]; then - loaded_version=$(_read_loaded_version || true) - if [ -n "${loaded_version}" ] && [ "${loaded_version}" = "${DRIVER_VERSION}" ]; then - echo "Detected matching loaded driver (${loaded_version}); performing userspace-only install" - - # Skip kernel module unload since they're already loaded with correct version - # Unmount any existing rootfs - _unmount_rootfs - - # Update package cache for userspace install - _update_ca_certificates - _update_package_cache - _resolve_kernel_version || exit 1 - _install_prerequisites - - # Install userspace components only (libraries, binaries) - # The --no-kernel-module flag tells nvidia-installer to skip kernel module build/install - echo "Installing userspace components (libraries and binaries)..." - cd /drivers - # Extract the driver first - sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x - cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} - ./nvidia-installer \ - --silent \ - --no-kernel-module \ - --no-nouveau-check \ - --no-nvidia-modprobe \ - --no-drm \ - --no-peermem - - # Mount the driver rootfs to make components available - _mount_rootfs - - # Ensure persistence daemon is running - _ensure_persistence_running - - # Write kernel update hook - _write_kernel_update_hook - - echo "Userspace-only install complete, now waiting for signal" - sleep infinity & - trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM - trap - EXIT - while true; do wait $! || continue; done - exit 0 - fi - fi - - - _unload_driver || exit 1 + _unload_driver || exit 1 _unmount_rootfs _update_ca_certificates From 4650182219d931a449046f6847a817931f1f0bb4 Mon Sep 17 00:00:00 2001 From: Karthik Vetrivel Date: Fri, 14 Nov 2025 16:04:45 +0000 Subject: [PATCH 4/4] Add support for OpenShift 14.04 Signed-off-by: Karthik Vetrivel --- rhel9/nvidia-driver | 145 +++++++++++++++++++++++++++++++++++++-- rhel9/ocp_dtk_entrypoint | 31 +++++++++ 2 files changed, 172 insertions(+), 4 deletions(-) mode change 100755 => 100644 rhel9/nvidia-driver mode change 100755 => 100644 rhel9/ocp_dtk_entrypoint diff --git a/rhel9/nvidia-driver b/rhel9/nvidia-driver old mode 100755 new mode 100644 index 8ecd8b1e..bd79b017 --- a/rhel9/nvidia-driver +++ b/rhel9/nvidia-driver @@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"} KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver NUM_VGPU_DEVICES=0 +GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}" +USE_HOST_MOFED="${USE_HOST_MOFED:-false}" NVIDIA_MODULE_PARAMS=() NVIDIA_UVM_MODULE_PARAMS=() NVIDIA_MODESET_MODULE_PARAMS=() NVIDIA_PEERMEM_MODULE_PARAMS=() TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} -USE_HOST_MOFED="${USE_HOST_MOFED:-false}" DNF_RELEASEVER=${DNF_RELEASEVER:-""} RHEL_VERSION=${RHEL_VERSION:-""} RHEL_MAJOR_VERSION=9 @@ -211,7 +212,10 @@ _create_driver_package() ( local nvidia_modeset_sign_args="" local nvidia_uvm_sign_args="" - trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + # Skip cleanup trap for DTK builds - modules are copied after this function returns + if [ "${PACKAGE_TAG:-}" != "builtin" ]; then + trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT + fi echo "Compiling NVIDIA driver kernel modules..." cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} @@ -566,7 +570,9 @@ _install_driver() { install_args+=("--skip-module-load") fi - IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} + # Prevent prompts when modules are already loaded (common in DTK context). + # Pipe "1" to auto-answer "Continue installation" when prompted about loaded modules. + echo "1" | IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit @@ -701,6 +707,94 @@ _start_vgpu_topology_daemon() { nvidia-topologyd } +_ensure_persistence_running() { + local pid_file=/var/run/nvidia-persistenced/nvidia-persistenced.pid pid + if pid=$(<"${pid_file}" 2>/dev/null) && [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + return 0 + fi + + if command -v nvidia-persistenced >/dev/null 2>&1; then + nvidia-persistenced --persistence-mode || true + else + echo "nvidia-persistenced not found; continuing without persistence" + fi +} + +_build_driver_config() { + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} +USE_HOST_MOFED=${USE_HOST_MOFED:-false} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}" + + # Append config file contents directly + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_store_driver_config() { + local config_file="/run/nvidia/driver-config.state" + echo "Storing driver configuration state..." + _build_driver_config > "$config_file" + echo "Driver configuration stored at $config_file" +} + +_should_use_fast_path() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 + local current_config=$(_build_driver_config) + local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") + [ "${current_config}" = "${stored_config}" ] +} + +_userspace_only_install() { + echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install" + + _unmount_rootfs + _update_package_cache + + # Skip kernel-related steps for userspace-only install + # KERNEL_VERSION is already set from uname -r, no need to resolve from yum + # Kernel headers/devel/modules are not needed for userspace-only install + + cd /drivers + [ ! -d "NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}" ] && sh NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}.run -x + cd NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION} + + + echo "DEBUG: Current directory: $(pwd)" + echo "DEBUG: Checking for ./nvidia-installer:" + ls -la ./nvidia-installer 2>&1 || echo " ./nvidia-installer NOT FOUND" + echo "DEBUG: Checking PATH for nvidia-installer:" + which nvidia-installer 2>&1 || echo " nvidia-installer NOT in PATH" + + + echo "Installing userspace components (libraries and binaries)..." + local install_args="--silent --no-kernel-module --no-nouveau-check --no-nvidia-modprobe --no-drm --no-peermem --ui=none" + [ "${ACCEPT_LICENSE}" = "yes" ] && install_args="$install_args --accept-license" + IGNORE_CC_MISMATCH=1 ./nvidia-installer $install_args + + # Copy kernel module sources if not already present (needed for other containers) + if [ ! -d "/usr/src/nvidia-${DRIVER_VERSION}" ]; then + _resolve_kernel_type || exit 1 + mkdir -p /usr/src/nvidia-${DRIVER_VERSION} + cp -r LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-${DRIVER_VERSION}/ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-${DRIVER_VERSION}/.manifest + fi + + _mount_rootfs + _ensure_persistence_running + _write_kernel_update_hook + _store_driver_config + + echo "Userspace-only install complete" +} + _prepare() { if [ "${DRIVER_TYPE}" = "vgpu" ]; then _find_vgpu_driver_version || exit 1 @@ -758,6 +852,7 @@ _load() { _load_driver _mount_rootfs _write_kernel_update_hook + _store_driver_config echo "Done, now waiting for signal" sleep infinity & @@ -768,7 +863,49 @@ _load() { } init() { - _prepare_exclusive + if [ "${DRIVER_TYPE}" = "vgpu" ]; then + _find_vgpu_driver_version || exit 1 + fi + + echo -e "\n========== NVIDIA Software Installer ==========\n" + echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" + + exec 3> ${PID_FILE} + if ! flock -n 3; then + echo "An instance of the NVIDIA driver is already running, aborting" + exit 1 + fi + echo $$ >&3 + + trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM + trap "_shutdown" EXIT + + if _should_use_fast_path; then + _userspace_only_install + + echo "Userspace-only install complete, now waiting for signal" + sleep infinity & + trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM + trap - EXIT + while true; do wait $! || continue; done + exit 0 + fi + + _unload_driver || exit 1 + _unmount_rootfs + + # Install the userspace components and copy the kernel module sources. + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh /tmp/install.sh nvinstall + + # Determine the kernel module type + _resolve_kernel_type || exit 1 + + # Copy the kernel module sources + mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ + sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest _build diff --git a/rhel9/ocp_dtk_entrypoint b/rhel9/ocp_dtk_entrypoint old mode 100755 new mode 100644 index 0bd1496d..3973ba04 --- a/rhel9/ocp_dtk_entrypoint +++ b/rhel9/ocp_dtk_entrypoint @@ -10,6 +10,30 @@ echo "Running $*" SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) source $SCRIPT_DIR/common.sh +_build_driver_config() { + local config="DRIVER_VERSION=${DRIVER_VERSION} +KERNEL_VERSION=$(uname -r) +GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false} +USE_HOST_MOFED=${USE_HOST_MOFED:-false} +KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}" + + for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do + if [ -f "/drivers/$conf_file" ]; then + config="${config} +$(cat "/drivers/$conf_file")" + fi + done + + echo "$config" +} + +_should_use_fast_path() { + [ -f /sys/module/nvidia/refcnt ] && [ -f /run/nvidia/driver-config.state ] || return 1 + local current_config=$(_build_driver_config) + local stored_config=$(cat /run/nvidia/driver-config.state 2>/dev/null || echo "") + [ "${current_config}" = "${stored_config}" ] +} + nv-ctr-run-with-dtk() { set -x @@ -18,6 +42,13 @@ nv-ctr-run-with-dtk() { exec bash -x nvidia-driver init fi + if _should_use_fast_path; then + echo "Fast path detected: skipping DTK build and module copy, proceeding with userspace-only install" + exec bash -x nvidia-driver init + fi + + echo "Fast path not detected: building driver and modules" + if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then cp -r \ /tmp/install.sh \