@@ -268,6 +268,37 @@ _assert_nvswitch_system() {
268268 return 0
269269}
270270
# Detect whether this host is an NVLink5+ system.
#
# Scans the VPD (vital product data) file of every InfiniBand device exposed
# under sysfs and looks for the 'SW_MNG' marker.
#
# Arguments:
#   $1 - (optional) root directory holding the InfiniBand class devices;
#        defaults to /sys/class/infiniband.
# Outputs:
#   Prints "Detected NVLink5+ system" to stdout on a match.
# Returns:
#   0 if any device's VPD contains 'SW_MNG', 1 otherwise.
#
# The body is a subshell '( ... )' so the loop variables never leak into the
# caller's environment.
_assert_nvlink5_system() (
    ib_root=${1:-/sys/class/infiniband}

    # If the glob matches nothing it stays literal, the -f test below fails,
    # and the function falls through to 'return 1'.
    for dir in "${ib_root}"/*/device; do
        # Path to the VPD file of this device
        vpd_file="${dir}/vpd"

        # Only regular, existing VPD files are inspected
        if [ -f "${vpd_file}" ]; then
            # Search for 'SW_MNG' in the VPD file
            if grep -q -- "SW_MNG" "${vpd_file}"; then
                echo "Detected NVLink5+ system"
                return 0
            fi
        fi
    done
    return 1
)
287+
# Install the user-space prerequisites needed on NVLink5+ systems.
#
# Blocks until both the mlx5_core and ib_umad kernel modules show up in
# 'lsmod' (polling every 10 seconds), then installs the InfiniBand utilities
# and the NVLink subnet manager (nvlsm) with dnf.
#
# Globals:
#   DNF_RELEASEVER (read) - passed to dnf as --releasever.
# Returns:
#   0 when both installs succeed, 1 as soon as either dnf invocation fails.
#
# The body is a subshell '( ... )'. Each dnf call is checked explicitly
# because callers invoke this function on the left of '||', which disables
# 'set -e' inside it; an unchecked first install would fail silently.
_ensure_nvlink5_prerequisites() (
    until lsmod | grep mlx5_core > /dev/null 2>&1 \
        && lsmod | grep ib_umad > /dev/null 2>&1; do
        echo "waiting for the mlx5_core and ib_umad kernel modules to be loaded"
        sleep 10
    done

    echo "Installing Infiniband packages..."
    dnf -q -y --releasever="${DNF_RELEASEVER}" install libibverbs-utils infiniband-diags > /dev/null \
        || return 1

    echo "Installing the NVLink subnet manager..."
    dnf -q -y --releasever="${DNF_RELEASEVER}" install nvlsm > /dev/null \
        || return 1
)
301+
271302# For each kernel module configuration file mounted into the container,
272303# parse the file contents and extract the custom module parameters that
273304# are to be passed as input to 'modprobe'.
@@ -370,7 +401,18 @@ _load_driver() {
370401 _start_vgpu_topology_daemon
371402 fi
372403
373- if _assert_nvswitch_system; then
404+ if _assert_nvlink5_system; then
405+ _ensure_nvlink5_prerequisites || return 1
406+ echo " Starting NVIDIA fabric manager daemon for NVLink5+..."
407+
408+ fm_config_file=/usr/share/nvidia/nvswitch/fabricmanager.cfg
409+ fm_pid_file=/var/run/nvidia-fabricmanager/nv-fabricmanager.pid
410+ nvlsm_config_file=/usr/share/nvidia/nvlsm/nvlsm.conf
411+ nvlsm_pid_file=/var/run/nvidia-fabricmanager/nvlsm.pid
412+ /usr/bin/nvidia-fabricmanager-start.sh $fm_config_file $fm_pid_file $nvlsm_config_file $nvlsm_pid_file
413+
414+ # If not a NVLink5+ switch, check for the presence of NVLink4 (or below) switches
415+ elif _assert_nvswitch_system; then
374416 echo " Starting NVIDIA fabric manager daemon..."
375417 nv-fabricmanager -c /usr/share/nvidia/nvswitch/fabricmanager.cfg
376418 fi
@@ -430,6 +472,21 @@ _unload_driver() {
430472 fi
431473 fi
432474
475+ if [ -f /var/run/nvidia-fabricmanager/nvlsm.pid ]; then
476+ echo " Stopping NVLink Subnet Manager daemon..."
477+ local pid=$( < /var/run/nvidia-fabricmanager/nvlsm.pid)
478+
479+ kill -SIGTERM " ${pid} "
480+ for i in $( seq 1 50) ; do
481+ kill -0 " ${pid} " 2> /dev/null || break
482+ sleep 0.1
483+ done
484+ if [ $i -eq 50 ]; then
485+ echo " Could not stop NVLink Subnet Manager daemon" >&2
486+ return 1
487+ fi
488+ fi
489+
433490 echo " Unloading NVIDIA driver kernel modules..."
434491 if [ -f /sys/module/nvidia_modeset/refcnt ]; then
435492 nvidia_modeset_refs=$( < /sys/module/nvidia_modeset/refcnt)
0 commit comments