Skip to content

Commit be996d0

Browse files
authored
Merge pull request #246 from NVIDIA/nvlink5-fm
[ubuntu22.04] add support for nvlink5+ systems
2 parents 3552903 + 21abd7c commit be996d0

File tree

2 files changed

+60
-2
lines changed

2 files changed

+60
-2
lines changed

ubuntu22.04/Dockerfile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,14 @@ RUN if [ "$DRIVER_TYPE" != "vgpu" ]; then \
8686

8787
RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$DRIVER_BRANCH" -ge "550" ]; then \
8888
apt-get update && \
89-
apt-get install -y --no-install-recommends nvidia-imex-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
89+
apt-get install -y --no-install-recommends nvlsm infiniband-diags \
90+
nvidia-imex-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
91+
rm -rf /var/lib/apt/lists/*; fi
92+
93+
# libnvsdm packages are not available for arm64
94+
RUN if [ "$DRIVER_TYPE" != "vgpu" ] && [ "$DRIVER_BRANCH" -ge "560" ] && [ "$TARGETARCH" != "arm64" ]; then \
95+
apt-get update && \
96+
apt-get install -y --no-install-recommends libnvsdm-${DRIVER_BRANCH}=${DRIVER_VERSION}-1 && \
9097
rm -rf /var/lib/apt/lists/*; fi
9198

9299
WORKDIR /drivers

ubuntu22.04/nvidia-driver

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,31 @@ _assert_nvswitch_system() {
195195
return 0
196196
}
197197

198+
# Detect an NVLink5+ system by inspecting InfiniBand device VPD data.
#
# Scans <class-dir>/*/device/vpd and succeeds as soon as one of those VPD
# files contains the string "SW_MNG".
#
# Arguments: $1 - (optional) InfiniBand sysfs class directory to scan.
#                 Defaults to /sys/class/infiniband; overriding it is mainly
#                 useful for exercising the check without real hardware.
# Outputs:   "Detected NVLink5+ system" on stdout when a match is found.
# Returns:   0 if a VPD file containing "SW_MNG" exists, 1 otherwise.
#
# The body is a subshell ( ... ), so the variables assigned below never leak
# into the caller's environment.
_assert_nvlink5_system() (
    ib_class_dir="${1:-/sys/class/infiniband}"

    for dir in "${ib_class_dir}"/*/device; do
        vpd_file="${dir}/vpd"

        # An unmatched glob stays literal; the -f test filters that case out
        # together with devices that expose no VPD file.
        [ -f "${vpd_file}" ] || continue

        if grep -q "SW_MNG" "${vpd_file}"; then
            echo "Detected NVLink5+ system"
            return 0
        fi
    done
    return 1
)
214+
215+
# Block until the kernel modules required for NVLink5+ are loaded.
#
# Polls `lsmod` every 10 seconds until both mlx5_core and ib_umad are listed.
# Module names are matched exactly against the first column of the lsmod
# output, so a lookalike name (e.g. "mlx5_core_foo") or a mention in another
# module's "Used by" column does not end the wait.
#
# Outputs: a progress message on stdout before each sleep.
# Returns: 0 once both modules are listed. There is no timeout: the function
#          waits indefinitely while either module is missing.
#
# The body is a subshell ( ... ), so the helper function and variables
# defined below never leak into the caller's environment.
_ensure_nvlink5_prerequisites() (
    # Succeeds only if every module named in "$@" is listed by lsmod.
    _nvlink5_modules_loaded() {
        # Take one snapshot per poll; grepping a captured string also avoids
        # grep -q closing a pipe early on a still-running lsmod.
        loaded_modules=$(lsmod 2> /dev/null) || return 1
        for module in "$@"; do
            grep -Eq "^${module}([[:space:]]|\$)" <<< "${loaded_modules}" || return 1
        done
        return 0
    }

    until _nvlink5_modules_loaded mlx5_core ib_umad; do
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done
)
222+
198223
# Check if mellanox devices are present
199224
_mellanox_devices_present() {
200225
devices_found=0
@@ -321,7 +346,18 @@ _load_driver() {
321346
_start_vgpu_topology_daemon
322347
fi
323348

324-
if _assert_nvswitch_system; then
349+
if _assert_nvlink5_system; then
350+
_ensure_nvlink5_prerequisites || return 1
351+
echo "Starting NVIDIA fabric manager daemon for NVLink5+..."
352+
353+
fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
354+
fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
355+
nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
356+
nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
357+
/usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
358+
359+
# If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
360+
elif _assert_nvswitch_system; then
325361
echo "Starting NVIDIA fabric manager daemon..."
326362
nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
327363
fi
@@ -383,6 +419,21 @@ _unload_driver() {
383419
fi
384420
fi
385421

422+
if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
423+
echo "Stopping NVLink Subnet Manager daemon..."
424+
local pid=$(< /var/run/nvidia-fabricmanager/nvlsm.pid)
425+
426+
kill -SIGTERM "${pid}"
427+
for i in $(seq 1 50); do
428+
kill -0 "${pid}" 2> /dev/null || break
429+
sleep 0.1
430+
done
431+
if [ $i -eq 50 ]; then
432+
echo "Could not stop NVLink Subnet Manager daemon" >&2
433+
return 1
434+
fi
435+
fi
436+
386437
echo "Unloading NVIDIA driver kernel modules..."
387438
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
388439
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)

0 commit comments

Comments
 (0)