@@ -195,6 +195,31 @@ _assert_nvswitch_system() {
195195 return 0
196196}
197197
# Detect an NVLink5+ system by inspecting the VPD of each InfiniBand device.
#
# Every <ib_root>/*/device directory is visited; if the device's "vpd" file
# exists and contains the string 'SW_MNG', the system is reported as NVLink5+.
#
# Arguments: $1 - optional InfiniBand class directory to scan
#                 (default: /sys/class/infiniband)
# Outputs:   prints a detection message to stdout when a match is found
# Returns:   0 if any VPD file contains 'SW_MNG', 1 otherwise
#
# The body runs in a subshell, so the variables set here never leak into the
# caller's environment.
_assert_nvlink5_system() (
  ib_root=${1:-/sys/class/infiniband}

  for dir in "${ib_root}"/*/device; do
    vpd_file="${dir}/vpd"

    # Skip devices without a VPD file. This also covers the case where the
    # glob matched nothing and was left as a literal pattern.
    [ -f "${vpd_file}" ] || continue

    # -q: only the exit status matters, nothing is printed by grep.
    if grep -q -- "SW_MNG" "${vpd_file}"; then
      echo " Detected NVLink5+ system"
      return 0
    fi
  done

  return 1
)
214+
# Wait until the mlx5_core and ib_umad kernel modules are listed by lsmod.
#
# lsmod is polled every 10 seconds. A module counts as loaded only when its
# name appears in the first (module name) column, so a mention in another
# module's "Used by" column or a longer name sharing the prefix is not enough.
#
# Arguments: $1 - optional maximum number of seconds to wait; 0 or omitted
#                 means wait indefinitely
# Outputs:   prints a progress message to stdout on every poll that finds a
#            module missing; prints an error to stderr on timeout or on an
#            invalid argument
# Returns:   0 once both modules are loaded, 1 on timeout or invalid argument
#
# The body runs in a subshell, so the variables set here never leak into the
# caller's environment.
_ensure_nvlink5_prerequisites() (
  timeout=${1:-0}
  poll_interval=10
  waited=0

  case "${timeout}" in
    '' | *[!0-9]*)
      echo "Invalid timeout for NVLink5 prerequisites: ${timeout}" >&2
      return 1
      ;;
  esac

  while :; do
    # Take one snapshot per poll so both checks see the same module list.
    loaded_modules=$(lsmod 2> /dev/null)

    if grep -q '^mlx5_core[[:space:]]' <<< "${loaded_modules}" \
      && grep -q '^ib_umad[[:space:]]' <<< "${loaded_modules}"; then
      return 0
    fi

    if [ "${timeout}" -gt 0 ] && [ "${waited}" -ge "${timeout}" ]; then
      echo "Timed out waiting for the mlx5_core and ib_umad kernel modules" >&2
      return 1
    fi

    echo " waiting for the mlx5_core and ib_umad kernel modules to be loaded"
    sleep "${poll_interval}"
    waited=$((waited + poll_interval))
  done
)
222+
198223# Check if mellanox devices are present
199224_mellanox_devices_present () {
200225 devices_found=0
@@ -321,7 +346,18 @@ _load_driver() {
321346 _start_vgpu_topology_daemon
322347 fi
323348
324- if _assert_nvswitch_system; then
349+ if _assert_nvlink5_system; then
350+ _ensure_nvlink5_prerequisites || return 1
351+ echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
352+
353+ fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
354+ fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
355+ nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
356+ nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
357+ /usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
358+
359+ # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
360+ elif _assert_nvswitch_system; then
325361 echo " Starting NVIDIA fabric manager daemon..."
326362 nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
327363 fi
@@ -383,6 +419,21 @@ _unload_driver() {
383419 fi
384420 fi
385421
422+ if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
423+ echo " Stopping NVLink Subnet Manager daemon..."
424+ local pid=$( < /var/run/nvidia-fabricmanager/nvlsm.pid)
425+
426+ kill -SIGTERM " ${pid} "
427+ for i in $( seq 1 50) ; do
428+ kill -0 " ${pid} " 2> /dev/null || break
429+ sleep 0.1
430+ done
431+ if [ $i -eq 50 ]; then
432+ echo " Could not stop NVLink Subnet Manager daemon" >&2
433+ return 1
434+ fi
435+ fi
436+
386437 echo " Unloading NVIDIA driver kernel modules..."
387438 if [ -f /sys/module/nvidia_modeset/refcnt ]; then
388439 nvidia_modeset_refs=$( < /sys/module/nvidia_modeset/refcnt)
0 commit comments