Skip to content

Commit 0de3f91

Browse files
authored
Merge pull request #281 from NVIDIA/ubuntu2404-nvl5
2 parents 8e5c7bf + c38b50a commit 0de3f91

File tree

4 files changed

+176
-4
lines changed

4 files changed

+176
-4
lines changed

rhel8/nvidia-driver

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,37 @@ _assert_nvswitch_system() {
268268
return 0
269269
}
270270

271+
# Report whether this looks like an NVLink5+ system.
#
# Walks every /sys/class/infiniband/*/device directory and looks for the
# string 'SW_MNG' in that device's 'vpd' file.
#
# Outputs: "Detected NVLink5+ system" on stdout when a match is found.
# Returns: 0 as soon as one VPD file contains 'SW_MNG', 1 otherwise.
#
# The body is a subshell, so the variables used here never reach the caller.
_assert_nvlink5_system() (
    for ib_device in /sys/class/infiniband/*/device; do
        vpd_path="${ib_device}/vpd"

        # Move on when the device has no VPD file or the marker is absent.
        [ -f "${vpd_path}" ] || continue
        grep -q "SW_MNG" "${vpd_path}" || continue

        echo "Detected NVLink5+ system"
        return 0
    done

    return 1
)
287+
288+
# Wait for the required kernel modules, then install the NVLink5+ userspace
# prerequisites.
#
# Polls lsmod every 10 seconds (there is no timeout) until both mlx5_core and
# ib_umad appear in its output, then uses dnf to install the Infiniband
# utilities followed by the NVLink subnet manager (nvlsm).
#
# Globals: DNF_RELEASEVER (read) - handed to dnf as --releasever
# Returns: 0 when both installs succeed; otherwise the exit status of the
#          dnf invocation that failed.
#
# The body is a subshell, so nothing set here leaks into the caller.
_ensure_nvlink5_prerequisites() (
    until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1; do
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done

    echo "Installing Infiniband packages"
    # Stop on failure: without this, a later successful install would hide the
    # error from callers that test this function's exit status.
    dnf -q -y --releasever="${DNF_RELEASEVER}" install libibverbs-utils infiniband-diags > /dev/null || return

    echo "Installing the NVLink subnet manager..."
    dnf -q -y --releasever="${DNF_RELEASEVER}" install nvlsm > /dev/null || return
)
301+
271302
# For each kernel module configuration file mounted into the container,
272303
# parse the file contents and extract the custom module parameters that
273304
# are to be passed as input to 'modprobe'.
@@ -370,7 +401,18 @@ _load_driver() {
370401
_start_vgpu_topology_daemon
371402
fi
372403

373-
if _assert_nvswitch_system; then
404+
if _assert_nvlink5_system; then
405+
_ensure_nvlink5_prerequisites || return 1
406+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
407+
408+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
409+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
410+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
411+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
412+
/usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
413+
414+
# If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
415+
elif _assert_nvswitch_system; then
374416
echo "Starting NVIDIA fabric manager daemon..."
375417
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
376418
fi
@@ -430,6 +472,21 @@ _unload_driver() {
430472
fi
431473
fi
432474

475+
if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
476+
echo "Stopping NVLink Subnet Manager daemon..."
477+
local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
478+
479+
kill -SIGTERM "${pid}"
480+
for i in $(seq 1 50); do
481+
kill -0 "${pid}" 2> /dev/null || break
482+
sleep 0.1
483+
done
484+
if [ $i -eq 50 ]; then
485+
echo "Could not stop NVLink Subnet Manager daemon" >&2
486+
return 1
487+
fi
488+
fi
489+
433490
echo "Unloading NVIDIA driver kernel modules..."
434491
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
435492
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)

rhel9/nvidia-driver

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,37 @@ _assert_nvswitch_system() {
268268
return 0
269269
}
270270

271+
# Report whether this looks like an NVLink5+ system.
#
# Walks every /sys/class/infiniband/*/device directory and looks for the
# string 'SW_MNG' in that device's 'vpd' file.
#
# Outputs: "Detected NVLink5+ system" on stdout when a match is found.
# Returns: 0 as soon as one VPD file contains 'SW_MNG', 1 otherwise.
#
# The body is a subshell, so the variables used here never reach the caller.
_assert_nvlink5_system() (
    for ib_device in /sys/class/infiniband/*/device; do
        vpd_path="${ib_device}/vpd"

        # Move on when the device has no VPD file or the marker is absent.
        [ -f "${vpd_path}" ] || continue
        grep -q "SW_MNG" "${vpd_path}" || continue

        echo "Detected NVLink5+ system"
        return 0
    done

    return 1
)
287+
288+
# Wait for the required kernel modules, then install the NVLink5+ userspace
# prerequisites.
#
# Polls lsmod every 10 seconds (there is no timeout) until both mlx5_core and
# ib_umad appear in its output, then uses dnf to install the Infiniband
# utilities followed by the NVLink subnet manager (nvlsm).
#
# Globals: DNF_RELEASEVER (read) - handed to dnf as --releasever
# Returns: 0 when both installs succeed; otherwise the exit status of the
#          dnf invocation that failed.
#
# The body is a subshell, so nothing set here leaks into the caller.
_ensure_nvlink5_prerequisites() (
    until lsmod | grep mlx5_core > /dev/null 2>&1 && lsmod | grep ib_umad > /dev/null 2>&1; do
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done

    echo "Installing Infiniband packages..."
    # Stop on failure: without this, a later successful install would hide the
    # error from callers that test this function's exit status.
    dnf -q -y --releasever="${DNF_RELEASEVER}" install libibverbs-utils infiniband-diags > /dev/null || return

    echo "Installing the NVLink subnet manager..."
    dnf -q -y --releasever="${DNF_RELEASEVER}" install nvlsm > /dev/null || return
)
301+
271302
# For each kernel module configuration file mounted into the container,
272303
# parse the file contents and extract the custom module parameters that
273304
# are to be passed as input to 'modprobe'.
@@ -370,7 +401,18 @@ _load_driver() {
370401
_start_vgpu_topology_daemon
371402
fi
372403

373-
if _assert_nvswitch_system; then
404+
if _assert_nvlink5_system; then
405+
_ensure_nvlink5_prerequisites || return 1
406+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
407+
408+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
409+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
410+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
411+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
412+
/usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
413+
414+
# If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
415+
elif _assert_nvswitch_system; then
374416
echo "Starting NVIDIA fabric manager daemon..."
375417
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
376418
fi
@@ -430,6 +472,21 @@ _unload_driver() {
430472
fi
431473
fi
432474

475+
if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
476+
echo "Stopping NVLink Subnet Manager daemon..."
477+
local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
478+
479+
kill -SIGTERM "${pid}"
480+
for i in $(seq 1 50); do
481+
kill -0 "${pid}" 2> /dev/null || break
482+
sleep 0.1
483+
done
484+
if [ $i -eq 50 ]; then
485+
echo "Could not stop NVLink Subnet Manager daemon" >&2
486+
return 1
487+
fi
488+
fi
489+
433490
echo "Unloading NVIDIA driver kernel modules..."
434491
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
435492
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)

ubuntu24.04/Dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,14 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
8383

8484
RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$DRIVER_BRANCH" -ge "550" ]; then \
8585
apt-get update && \
86-
apt-get install -y --no-install-recommends nvidia-imex-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
86+
apt-get install -y --no-install-recommends nvlsm infiniband-diags \
87+
nvidia-imex-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
88+
rm -rf /var/lib/apt/lists/*; fi
89+
90+
# libnvsdm packages are not available for arm64
91+
RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$DRIVER_BRANCH" -ge "560" ] && [ "$TARGETARCH" != "arm64" ]; then \
92+
apt-get update && \
93+
apt-get install -y --no-install-recommends libnvsdm-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
8794
rm -rf /var/lib/apt/lists/*; fi
8895

8996
WORKDIR /drivers

ubuntu24.04/nvidia-driver

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,31 @@ _assert_nvswitch_system() {
124124
return 0
125125
}
126126

127+
# Report whether this looks like an NVLink5+ system.
#
# Walks every /sys/class/infiniband/*/device directory and looks for the
# string 'SW_MNG' in that device's 'vpd' file.
#
# Outputs: "Detected NVLink5+ system" on stdout when a match is found.
# Returns: 0 as soon as one VPD file contains 'SW_MNG', 1 otherwise.
#
# The body is a subshell, so the variables used here never reach the caller.
_assert_nvlink5_system() (
    for ib_device in /sys/class/infiniband/*/device; do
        vpd_path="${ib_device}/vpd"

        # Move on when the device has no VPD file or the marker is absent.
        [ -f "${vpd_path}" ] || continue
        grep -q "SW_MNG" "${vpd_path}" || continue

        echo "Detected NVLink5+ system"
        return 0
    done

    return 1
)
143+
144+
# Block until the kernel modules needed for NVLink5+ are loaded.
#
# Polls lsmod every 10 seconds (there is no timeout) until both mlx5_core and
# ib_umad appear in its output, printing a progress message on each retry.
#
# The body is a subshell, so the helper and variables defined here never
# reach the caller.
_ensure_nvlink5_prerequisites() (
    # Succeeds while at least one required module is absent from lsmod output.
    _nvlink5_modules_missing() {
        for required_module in mlx5_core ib_umad; do
            lsmod | grep "${required_module}" > /dev/null 2>&1 || return 0
        done
        return 1
    }

    while _nvlink5_modules_missing; do
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done
)
151+
127152
# Check if mellanox devices are present
128153
_mellanox_devices_present() {
129154
devices_found=0
@@ -250,7 +275,18 @@ _load_driver() {
250275
_start_vgpu_topology_daemon
251276
fi
252277

253-
if _assert_nvswitch_system; then
278+
if _assert_nvlink5_system; then
279+
_ensure_nvlink5_prerequisites || return 1
280+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
281+
282+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
283+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
284+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
285+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
286+
/usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
287+
288+
# If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
289+
elif _assert_nvswitch_system; then
254290
echo "Starting NVIDIA fabric manager daemon..."
255291
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
256292
fi
@@ -312,6 +348,21 @@ _unload_driver() {
312348
fi
313349
fi
314350

351+
if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
352+
echo "Stopping NVLink Subnet Manager daemon..."
353+
local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
354+
355+
kill -SIGTERM "${pid}"
356+
for i in $(seq 1 50); do
357+
kill -0 "${pid}" 2> /dev/null || break
358+
sleep 0.1
359+
done
360+
if [ $i -eq 50 ]; then
361+
echo "Could not stop NVLink Subnet Manager daemon" >&2
362+
return 1
363+
fi
364+
fi
365+
315366
echo "Unloading NVIDIA driver kernel modules..."
316367
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
317368
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)

0 commit comments

Comments
 (0)