@@ -124,6 +124,31 @@ _assert_nvswitch_system() {
124124 return 0
125125}
126126
#######################################
# Report whether this host looks like an NVLink5+ system.
#
# Looks at the "vpd" file inside every <root>/*/device directory and
# succeeds as soon as one of them contains the string SW_MNG.
#
# Arguments: $1 - optional directory to scan
#                 (default: /sys/class/infiniband)
# Outputs:   prints a detection message to stdout on a match
# Returns:   0 if any vpd file contains SW_MNG, 1 otherwise
#
# The body is a subshell ( ... ), so the variables assigned here do not
# leak into the caller's environment.
#######################################
_assert_nvlink5_system() (
    ib_root="${1:-/sys/class/infiniband}"

    for dir in "${ib_root}"/*/device; do
        vpd_file="${dir}/vpd"

        # An unmatched glob is left as the literal pattern; the -f test
        # filters that case out along with devices that expose no vpd file.
        [ -f "${vpd_file}" ] || continue

        # Fixed-string search: SW_MNG is a literal marker, not a regex.
        if grep -qF -- "SW_MNG" "${vpd_file}"; then
            echo " Detected NVLink5+ system"
            return 0
        fi
    done

    return 1
)
143+
#######################################
# Wait until both the mlx5_core and ib_umad kernel modules show up in
# the output of lsmod.
#
# Polls every 10 seconds and logs a message before each wait. There is
# no timeout: the function only returns once both names are listed.
#
# Outputs:   one progress line on stdout per unsuccessful poll
# Returns:   0 once both modules are listed
#
# The body is a subshell ( ... ), so the helper defined inside it is not
# visible to the caller.
#######################################
_ensure_nvlink5_prerequisites() (
    # Succeeds when lsmod output contains the given module name.
    _module_listed() {
        lsmod | grep "$1" > /dev/null 2>&1
    }

    while ! { _module_listed mlx5_core && _module_listed ib_umad; }; do
        echo " waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done
)
151+
127152# Check if mellanox devices are present
128153_mellanox_devices_present () {
129154 devices_found=0
@@ -250,7 +275,18 @@ _load_driver() {
250275 _start_vgpu_topology_daemon
251276 fi
252277
253- if _assert_nvswitch_system; then
278+ if _assert_nvlink5_system; then
279+ _ensure_nvlink5_prerequisites || return 1
280+ echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
281+
282+ fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
283+ fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
284+ nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
285+ nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
286+ /usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
287+
288+ # If not an NVLink5+ switch, check for the presence of NVLink4 (or below) switches
289+ elif _assert_nvswitch_system; then
254290 echo " Starting NVIDIA fabric manager daemon..."
255291 nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
256292 fi
@@ -312,6 +348,21 @@ _unload_driver() {
312348 fi
313349 fi
314350
351+ if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
352+ echo " Stopping NVLink Subnet Manager daemon..."
353+ local pid=$( < /var/run/nvidia-fabricmanager/nvlsm.pid)
354+
355+ kill -SIGTERM " ${pid} "
356+ for i in $( seq 1 50) ; do
357+ kill -0 " ${pid} " 2> /dev/null || break
358+ sleep 0.1
359+ done
360+ if [ $i -eq 50 ]; then
361+ echo " Could not stop NVLink Subnet Manager daemon" >&2
362+ return 1
363+ fi
364+ fi
365+
315366 echo " Unloading NVIDIA driver kernel modules..."
316367 if [ -f /sys/module/nvidia_modeset/refcnt ]; then
317368 nvidia_modeset_refs=$( < /sys/module/nvidia_modeset/refcnt)
0 commit comments