Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 141 additions & 4 deletions rhel9/nvidia-driver
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ PID_FILE=${RUN_DIR}/${0##*/}.pid
# Driver version to install. Required: the ":?" expansion aborts the script
# with the given message when the variable is unset or empty.
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
# Location of the kernel update hook (presumably written by
# _write_kernel_update_hook — confirm against that function).
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
# vGPU device counter, initialised to zero here.
NUM_VGPU_DEVICES=0
# Feature toggles; each falls back to "false" when unset or empty.
GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
# Per-module parameter arrays, initialised empty.
NVIDIA_MODULE_PARAMS=()
NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
# Target architecture. Required, same ":?" abort semantics as DRIVER_VERSION.
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}
# NOTE(review): USE_HOST_MOFED is already assigned with the same default a few
# lines above; this repeat is redundant (it looks like the removed side of the
# diff hunk) — confirm and keep only one assignment.
USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
# Optional overrides; empty string when not provided by the environment.
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
# Hard-coded RHEL major version for this script variant.
RHEL_MAJOR_VERSION=9
Expand Down Expand Up @@ -211,7 +212,10 @@ _create_driver_package() (
local nvidia_modeset_sign_args=""
local nvidia_uvm_sign_args=""

trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
# Skip cleanup trap for DTK builds - modules are copied after this function returns
if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
fi

echo "Compiling NVIDIA driver kernel modules..."
cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}
Expand Down Expand Up @@ -566,7 +570,9 @@ _install_driver() {
install_args+=("--skip-module-load")
fi

IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
# Prevent prompts when modules are already loaded (common in DTK context).
# Pipe "1" to auto-answer "Continue installation" when prompted about loaded modules.
echo "1" | IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"}
# May need to add no-cc-check for RHEL; otherwise the installer complains about cc missing from PATH.
# /proc/version and lib/modules/KERNEL_VERSION/proc differ; by default the installer looks at /proc/, so the proc-mount-point option was added.
# TODO: remove the -a flag; it is not needed because license acceptance is implicit in newer driver versions.
Expand Down Expand Up @@ -701,6 +707,94 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Make sure nvidia-persistenced is running, starting it when no live instance
# can be found through its PID file.
# Arguments:
#   $1 - (optional) path to the daemon's PID file; defaults to
#        /var/run/nvidia-persistenced/nvidia-persistenced.pid
# Returns:
#   0 in all cases; starting the daemon is best effort.
_ensure_persistence_running() {
    local pid_file="${1:-/var/run/nvidia-persistenced/nvidia-persistenced.pid}"
    local pid=""

    # Read the PID with `read` instead of `$(<file 2>/dev/null)`. Bash only
    # applies its `$(<file)` shortcut when that redirection stands alone; with
    # the additional `2>/dev/null` the substitution can come back empty, which
    # meant a running daemon was never detected and was started again.
    if [ -r "${pid_file}" ]; then
        # `read` returns non-zero on a missing trailing newline or an empty
        # file; the value (if any) is still stored, so ignore the status.
        read -r pid < "${pid_file}" || true
    fi

    # kill -0 only probes for the process; no signal is delivered.
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
        return 0
    fi

    if command -v nvidia-persistenced >/dev/null 2>&1; then
        # A failed start must not abort the caller.
        nvidia-persistenced --persistence-mode || true
    else
        echo "nvidia-persistenced not found; continuing without persistence"
    fi
}

# Print a textual fingerprint of the current driver configuration.
# The fingerprint consists of KEY=VALUE lines for the driver version, the
# running kernel release and the RDMA / MOFED / kernel-module-type settings,
# followed by the contents of any module parameter files that exist.
# Globals:
#   DRIVER_VERSION (read), GPU_DIRECT_RDMA_ENABLED (read, default "false"),
#   USE_HOST_MOFED (read, default "false"), KERNEL_MODULE_TYPE (read, default "auto")
# Arguments:
#   $1 - (optional) directory holding the module *.conf files; defaults to /drivers
# Outputs:
#   The fingerprint on stdout.
# Returns:
#   0 on success, 1 if the kernel release cannot be determined.
_build_driver_config() {
    local conf_dir="${1:-/drivers}"
    # Declared separately from the assignments so a failing command
    # substitution is not hidden by the exit status of `local`, and so the
    # loop variable does not leak into the caller's scope.
    local kernel_release config conf_file

    kernel_release=$(uname -r) || return 1

    config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_release}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}"

    # Append the contents of each module parameter file that is present.
    for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "${conf_dir}/${conf_file}" ]; then
            config="${config}
$(cat "${conf_dir}/${conf_file}")"
        fi
    done

    printf '%s\n' "${config}"
}

# Persist the current driver configuration fingerprint.
# Writes the output of _build_driver_config to
# /run/nvidia/driver-config.state and reports progress on stdout.
_store_driver_config() {
    local state_path
    state_path="/run/nvidia/driver-config.state"

    printf '%s\n' "Storing driver configuration state..."
    _build_driver_config > "${state_path}"
    printf '%s\n' "Driver configuration stored at ${state_path}"
}

# Decide whether the userspace-only ("fast path") install can be used.
# The fast path applies only when the nvidia kernel module is loaded and the
# stored configuration fingerprint equals the one computed for this run.
# Arguments:
#   $1 - (optional) stored state file; defaults to /run/nvidia/driver-config.state
#   $2 - (optional) file whose presence indicates the nvidia module is loaded;
#        defaults to /sys/module/nvidia/refcnt
# Returns:
#   0 when the fast path may be used, 1 otherwise.
_should_use_fast_path() {
    local state_file="${1:-/run/nvidia/driver-config.state}"
    local module_marker="${2:-/sys/module/nvidia/refcnt}"
    # Declared separately so a failing substitution is not masked by `local`.
    local current_config stored_config

    [ -f "${module_marker}" ] || return 1
    [ -f "${state_file}" ] || return 1

    # Fail closed: if either side cannot be obtained, do not take the fast
    # path. Previously both could silently become empty and compare equal.
    current_config=$(_build_driver_config) || return 1
    stored_config=$(cat "${state_file}" 2>/dev/null) || return 1

    [ -n "${stored_config}" ] && [ "${current_config}" = "${stored_config}" ]
}

# Install only the userspace driver components (libraries and binaries),
# leaving the already-loaded kernel modules untouched.
# Steps: unmount the driver rootfs, refresh the package cache, extract the
# installer under /drivers if needed, run nvidia-installer without kernel
# modules, copy the kernel module sources to /usr/src if absent, then remount
# the rootfs, ensure persistence, write the kernel update hook and store the
# configuration state.
# Globals:
#   DRIVER_ARCH, DRIVER_VERSION (read); ACCEPT_LICENSE (read, optional);
#   KERNEL_TYPE (read after _resolve_kernel_type)
# Exits:
#   With status 1 when the installer cannot be located, extracted or run, or
#   when the kernel type cannot be resolved.
_userspace_only_install() {
    local installer_dir="NVIDIA-Linux-${DRIVER_ARCH}-${DRIVER_VERSION}"
    local src_dir="/usr/src/nvidia-${DRIVER_VERSION}"
    # Built as an array so every option reaches the installer as one word.
    local -a install_args=(
        --silent
        --no-kernel-module
        --no-nouveau-check
        --no-nvidia-modprobe
        --no-drm
        --no-peermem
        --ui=none
    )

    echo "Detected matching loaded driver & config (${DRIVER_VERSION}); performing userspace-only install"

    _unmount_rootfs
    _update_package_cache

    # Kernel-related steps are skipped for a userspace-only install:
    # kernel headers/devel/modules are not needed here.

    cd /drivers || exit 1
    if [ ! -d "${installer_dir}" ]; then
        sh "${installer_dir}.run" -x || exit 1
    fi
    cd "${installer_dir}" || exit 1

    # Fail early with a clear message instead of printing debug listings.
    if [ ! -x ./nvidia-installer ]; then
        echo "nvidia-installer not found or not executable in $(pwd)" >&2
        exit 1
    fi

    echo "Installing userspace components (libraries and binaries)..."
    if [ "${ACCEPT_LICENSE:-}" = "yes" ]; then
        install_args+=(--accept-license)
    fi
    IGNORE_CC_MISMATCH=1 ./nvidia-installer "${install_args[@]}" || exit 1

    # Copy kernel module sources if not already present (needed for other containers)
    if [ ! -d "${src_dir}" ]; then
        _resolve_kernel_type || exit 1
        mkdir -p "${src_dir}"
        cp -r LICENSE mkprecompiled "${KERNEL_TYPE}" "${src_dir}/"
        # From line 9 of the manifest onward, keep only the entries that start
        # with "kernel" or "LICENSE".
        sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > "${src_dir}/.manifest"
    fi

    _mount_rootfs
    _ensure_persistence_running
    _write_kernel_update_hook
    _store_driver_config

    echo "Userspace-only install complete"
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
Expand Down Expand Up @@ -758,6 +852,7 @@ _load() {
_load_driver
_mount_rootfs
_write_kernel_update_hook
_store_driver_config

echo "Done, now waiting for signal"
sleep infinity &
Expand All @@ -768,7 +863,49 @@ _load() {
}

init() {
_prepare_exclusive
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
fi

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"

exec 3> ${PID_FILE}
if ! flock -n 3; then
echo "An instance of the NVIDIA driver is already running, aborting"
exit 1
fi
echo $$ >&3

trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
trap "_shutdown" EXIT

if _should_use_fast_path; then
_userspace_only_install

echo "Userspace-only install complete, now waiting for signal"
sleep infinity &
trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
trap - EXIT
while true; do wait $! || continue; done
exit 0
fi

_unload_driver || exit 1
_unmount_rootfs

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
sh /tmp/install.sh nvinstall

# Determine the kernel module type
_resolve_kernel_type || exit 1

# Copy the kernel module sources
mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest

_build

Expand Down
31 changes: 31 additions & 0 deletions rhel9/ocp_dtk_entrypoint
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,30 @@ echo "Running $*"
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source $SCRIPT_DIR/common.sh

# Print a textual fingerprint of the current driver configuration.
# NOTE(review): the nvidia-driver script carries its own copy of this
# function; the two must produce identical output for the fast-path
# comparison to work — keep them in sync.
# Globals:
#   DRIVER_VERSION (read), GPU_DIRECT_RDMA_ENABLED (read, default "false"),
#   USE_HOST_MOFED (read, default "false"), KERNEL_MODULE_TYPE (read, default "auto")
# Arguments:
#   $1 - (optional) directory holding the module *.conf files; defaults to /drivers
# Outputs:
#   KEY=VALUE lines followed by the contents of any existing module
#   parameter files, on stdout.
# Returns:
#   0 on success, 1 if the kernel release cannot be determined.
_build_driver_config() {
    local conf_dir="${1:-/drivers}"
    # Declared separately from the assignments so a failing command
    # substitution is not hidden by the exit status of `local`, and so the
    # loop variable does not leak into the caller's scope.
    local kernel_release config conf_file

    kernel_release=$(uname -r) || return 1

    config="DRIVER_VERSION=${DRIVER_VERSION}
KERNEL_VERSION=${kernel_release}
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-false}
USE_HOST_MOFED=${USE_HOST_MOFED:-false}
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}"

    # Append the contents of each module parameter file that is present.
    for conf_file in nvidia.conf nvidia-uvm.conf nvidia-modeset.conf nvidia-peermem.conf; do
        if [ -f "${conf_dir}/${conf_file}" ]; then
            config="${config}
$(cat "${conf_dir}/${conf_file}")"
        fi
    done

    printf '%s\n' "${config}"
}

# Decide whether the DTK build can be skipped in favour of the
# userspace-only ("fast path") install.
# The fast path applies only when the nvidia kernel module is loaded and the
# stored configuration fingerprint equals the one computed for this run.
# Arguments:
#   $1 - (optional) stored state file; defaults to /run/nvidia/driver-config.state
#   $2 - (optional) file whose presence indicates the nvidia module is loaded;
#        defaults to /sys/module/nvidia/refcnt
# Returns:
#   0 when the fast path may be used, 1 otherwise.
_should_use_fast_path() {
    local state_file="${1:-/run/nvidia/driver-config.state}"
    local module_marker="${2:-/sys/module/nvidia/refcnt}"
    # Declared separately so a failing substitution is not masked by `local`.
    local current_config stored_config

    [ -f "${module_marker}" ] || return 1
    [ -f "${state_file}" ] || return 1

    # Fail closed: if either side cannot be obtained, do not take the fast
    # path. Previously both could silently become empty and compare equal.
    current_config=$(_build_driver_config) || return 1
    stored_config=$(cat "${state_file}" 2>/dev/null) || return 1

    [ -n "${stored_config}" ] && [ "${current_config}" = "${stored_config}" ]
}

nv-ctr-run-with-dtk() {
set -x

Expand All @@ -18,6 +42,13 @@ nv-ctr-run-with-dtk() {
exec bash -x nvidia-driver init
fi

if _should_use_fast_path; then
echo "Fast path detected: skipping DTK build and module copy, proceeding with userspace-only install"
exec bash -x nvidia-driver init
fi

echo "Fast path not detected: building driver and modules"

if [[ ! -f "$DRIVER_TOOLKIT_SHARED_DIR/dir_prepared" ]]; then
cp -r \
/tmp/install.sh \
Expand Down
Loading