@@ -72,8 +72,9 @@ type Interface interface {
7272}
7373
7474type driverMgr struct {
75- cfg config.Config
76- containerMode string
75+ cfg config.Config
76+ containerMode string
77+ newDriverLoaded bool
7778
7879 cmd cmd.Interface
7980 host host.Interface
@@ -209,16 +210,93 @@ func (d *driverMgr) Load(ctx context.Context) (bool, error) {
209210 if err := d .generateOfedModulesBlacklist (ctx ); err != nil {
210211 return false , err
211212 }
212- if err := d .removeOfedModulesBlacklist (ctx ); err != nil {
213- return false , err
213+ defer func () {
214+ if err := d .removeOfedModulesBlacklist (ctx ); err != nil {
215+ log := logr .FromContextOrDiscard (ctx )
216+ log .Error (err , "Failed to remove OFED modules blacklist during cleanup" )
217+ }
218+ }()
219+
220+ log := logr .FromContextOrDiscard (ctx )
221+ log .V (1 ).Info ("Loading driver modules" )
222+
223+ // Define modules to check
224+ modulesToCheck := []string {"mlx5_core" , "mlx5_ib" , "ib_core" }
225+
226+ // Add NFS RDMA modules if enabled
227+ if d .cfg .EnableNfsRdma {
228+ modulesToCheck = append (modulesToCheck , "nvme_rdma" , "rpcrdma" )
214229 }
230+
231+ // Check if loaded kernel modules match expected versions
232+ modulesMatch , err := d .checkLoadedKmodSrcverVsModinfo (ctx , modulesToCheck )
233+ if err != nil {
234+ return false , fmt .Errorf ("failed to check module versions: %w" , err )
235+ }
236+
237+ if ! modulesMatch {
238+ log .V (1 ).Info ("Module versions don't match, restarting driver" )
239+
240+ // Restart driver
241+ if err := d .restartDriver (ctx ); err != nil {
242+ return false , fmt .Errorf ("failed to restart driver: %w" , err )
243+ }
244+
245+ // Mark that a new driver was loaded
246+ d .newDriverLoaded = true
247+
248+ // Load NFS RDMA modules if enabled
249+ if d .cfg .EnableNfsRdma {
250+ if err := d .loadNfsRdma (ctx ); err != nil {
251+ log .V (1 ).Info ("Failed to load NFS RDMA modules" , "error" , err )
252+ // Non-fatal error, continue
253+ }
254+ }
255+ } else {
256+ log .V (1 ).Info ("Loaded and candidate drivers are identical, skipping reload" )
257+ }
258+
259+ // Print loaded driver version
260+ if err := d .printLoadedDriverVersion (ctx ); err != nil {
261+ log .V (1 ).Info ("Failed to print driver version" , "error" , err )
262+ // Non-fatal error, continue
263+ }
264+
265+ log .Info ("Driver loaded successfully" )
215266 return true , nil
216267}
217268
218269// Unload is the default implementation of the driver.Interface.
219270func (d * driverMgr ) Unload (ctx context.Context ) (bool , error ) {
220- // TODO: Implement
221- return true , nil
271+ log := logr .FromContextOrDiscard (ctx )
272+
273+ if d .newDriverLoaded {
274+ // Check if mlnxofedctl exists
275+ if _ , err := d .os .Stat ("/usr/sbin/mlnxofedctl" ); err == nil {
276+ log .Info ("Restoring Mellanox OFED Driver from host..." )
277+
278+ // Execute mlnxofedctl --alt-mods force-restart
279+ _ , _ , err := d .cmd .RunCommand (ctx , "/usr/sbin/mlnxofedctl" , "--alt-mods" , "force-restart" )
280+ if err != nil {
281+ return false , fmt .Errorf ("failed to restore driver with mlnxofedctl: %w" , err )
282+ }
283+
284+ // Print loaded driver version
285+ if err := d .printLoadedDriverVersion (ctx ); err != nil {
286+ log .V (1 ).Info ("Failed to print driver version after restore" , "error" , err )
287+ // Non-fatal error, continue
288+ }
289+
290+ log .Info ("Driver restored successfully" )
291+ return true , nil
292+ } else {
293+ log .V (1 ).Info ("mlnxofedctl not found, cannot restore driver" )
294+ }
295+ } else {
296+ log .Info ("Keeping currently loaded Mellanox OFED Driver..." )
297+ }
298+
299+ return false , nil
222300}
223301
224302// Clear is the default implementation of the driver.Interface.
@@ -1174,6 +1252,269 @@ func (d *driverMgr) analyzeKernelType(
11741252 return kernelTypeStandard , kVer , rtHpSubstr , releaseverStr
11751253}
11761254
1255+ // checkLoadedKmodSrcverVsModinfo checks if loaded kernel module srcversion matches modinfo
1256+ func (d * driverMgr ) checkLoadedKmodSrcverVsModinfo (ctx context.Context , modules []string ) (bool , error ) {
1257+ log := logr .FromContextOrDiscard (ctx )
1258+
1259+ // Get list of loaded modules using host interface
1260+ loadedModules , err := d .host .LsMod (ctx )
1261+ if err != nil {
1262+ return false , fmt .Errorf ("failed to get loaded modules: %w" , err )
1263+ }
1264+
1265+ for _ , module := range modules {
1266+ log .V (1 ).Info ("Checking module" , "module" , module )
1267+
1268+ // Check if module is loaded
1269+ if _ , exists := loadedModules [module ]; ! exists {
1270+ log .V (1 ).Info ("Module not loaded" , "module" , module )
1271+ return false , nil // Module not loaded, need to reload
1272+ }
1273+
1274+ // Get srcversion from modinfo
1275+ srcverFromModinfo , _ , err := d .cmd .RunCommand (ctx , "modinfo" , module )
1276+ if err != nil {
1277+ log .V (1 ).Info ("Failed to get modinfo for module" , "module" , module , "error" , err )
1278+ return false , nil // Module not found, need to reload
1279+ }
1280+
1281+ // Extract srcversion from modinfo output
1282+ srcverFromModinfo = strings .TrimSpace (srcverFromModinfo )
1283+ lines := strings .Split (srcverFromModinfo , "\n " )
1284+ var modinfoSrcver string
1285+ for _ , line := range lines {
1286+ if strings .Contains (line , "srcversion" ) {
1287+ parts := strings .Fields (line )
1288+ if len (parts ) > 0 {
1289+ modinfoSrcver = parts [len (parts )- 1 ]
1290+ break
1291+ }
1292+ }
1293+ }
1294+
1295+ // Get srcversion from sysfs
1296+ sysfsPath := fmt .Sprintf ("/sys/module/%s/srcversion" , module )
1297+ srcverFromSysfs , _ , err := d .cmd .RunCommand (ctx , "cat" , sysfsPath )
1298+ if err != nil {
1299+ log .V (1 ).Info ("Failed to read sysfs srcversion for module" , "module" , module , "error" , err )
1300+ return false , nil // Module not loaded, need to reload
1301+ }
1302+
1303+ srcverFromSysfs = strings .TrimSpace (srcverFromSysfs )
1304+
1305+ log .V (1 ).Info ("Module version check" , "module" , module , "modinfo" , modinfoSrcver , "sysfs" , srcverFromSysfs )
1306+
1307+ if modinfoSrcver != srcverFromSysfs {
1308+ log .V (1 ).Info ("Module srcversion differs" , "module" , module )
1309+ return false , nil
1310+ }
1311+ }
1312+
1313+ return true , nil
1314+ }
1315+
1316+ // restartDriver restarts the driver modules
1317+ func (d * driverMgr ) restartDriver (ctx context.Context ) error {
1318+ log := logr .FromContextOrDiscard (ctx )
1319+
1320+ log .V (1 ).Info ("Restarting driver modules" )
1321+
1322+ // Load dependencies
1323+ _ , _ , err := d .cmd .RunCommand (ctx , "modprobe" , "-d" , "/host" , "tls" )
1324+ if err != nil {
1325+ log .V (1 ).Info ("Failed to load tls module" , "error" , err )
1326+ // Non-fatal, continue
1327+ }
1328+
1329+ _ , _ , err = d .cmd .RunCommand (ctx , "modprobe" , "-d" , "/host" , "psample" )
1330+ if err != nil {
1331+ log .V (1 ).Info ("Failed to load psample module" , "error" , err )
1332+ // Non-fatal, continue
1333+ }
1334+
1335+ // Check if mlx5_ib depends on macsec and load it if needed
1336+ depends , _ , err := d .cmd .RunCommand (ctx , "modinfo" , "-F" , "depends" , "mlx5_ib" )
1337+ if err == nil && strings .Contains (depends , "macsec" ) {
1338+ _ , _ , err = d .cmd .RunCommand (ctx , "modprobe" , "-d" , "/host" , "macsec" )
1339+ if err != nil {
1340+ log .V (1 ).Info ("Failed to load macsec module" , "error" , err )
1341+ // Non-fatal, continue
1342+ }
1343+ }
1344+
1345+ // Load pci-hyperv-intf if needed (simplified logic)
1346+ arch := d .getArchitecture (ctx )
1347+ if arch != "aarch64" {
1348+ _ , _ , err = d .cmd .RunCommand (ctx , "modprobe" , "-d" , "/host" , "pci-hyperv-intf" )
1349+ if err != nil {
1350+ log .V (1 ).Info ("Failed to load pci-hyperv-intf module" , "error" , err )
1351+ // Non-fatal, continue
1352+ }
1353+ }
1354+
1355+ // Unload storage modules if enabled
1356+ if d .cfg .UnloadStorageModules {
1357+ if err := d .unloadStorageModules (ctx ); err != nil {
1358+ log .V (1 ).Info ("Failed to unload storage modules" , "error" , err )
1359+ // Non-fatal, continue
1360+ }
1361+ }
1362+
1363+ // Restart openibd service
1364+ _ , _ , err = d .cmd .RunCommand (ctx , "/etc/init.d/openibd" , "restart" )
1365+ if err != nil {
1366+ return fmt .Errorf ("failed to restart openibd service: %w" , err )
1367+ }
1368+
1369+ // Load mlx5_vdpa if available
1370+ _ , _ , err = d .cmd .RunCommand (ctx , "modinfo" , "mlx5_vdpa" )
1371+ if err == nil {
1372+ // Module exists, try to load it
1373+ _ , _ , err = d .cmd .RunCommand (ctx , "modprobe" , "mlx5_vdpa" )
1374+ if err != nil {
1375+ log .V (1 ).Info ("Failed to load mlx5_vdpa module" , "error" , err )
1376+ // Non-fatal, continue
1377+ }
1378+ } else {
1379+ log .V (1 ).Info ("mlx5_vdpa module not found, skipping" )
1380+ }
1381+
1382+ return nil
1383+ }
1384+
1385+ // loadNfsRdma loads NFS RDMA modules if enabled
1386+ func (d * driverMgr ) loadNfsRdma (ctx context.Context ) error {
1387+ log := logr .FromContextOrDiscard (ctx )
1388+
1389+ if ! d .cfg .EnableNfsRdma {
1390+ return nil
1391+ }
1392+
1393+ log .V (1 ).Info ("Loading NFS RDMA modules" )
1394+
1395+ _ , _ , err := d .cmd .RunCommand (ctx , "modprobe" , "rpcrdma" )
1396+ if err != nil {
1397+ return fmt .Errorf ("failed to load rpcrdma module: %w" , err )
1398+ }
1399+
1400+ return nil
1401+ }
1402+
1403+ // printLoadedDriverVersion prints the currently loaded driver version
1404+ func (d * driverMgr ) printLoadedDriverVersion (ctx context.Context ) error {
1405+ log := logr .FromContextOrDiscard (ctx )
1406+
1407+ // Check if mlx5_core is loaded using host interface
1408+ loadedModules , err := d .host .LsMod (ctx )
1409+ if err != nil {
1410+ return fmt .Errorf ("failed to check loaded modules: %w" , err )
1411+ }
1412+
1413+ // Check if mlx5_core is loaded
1414+ if _ , exists := loadedModules ["mlx5_core" ]; ! exists {
1415+ log .V (1 ).Info ("mlx5_core module not loaded" )
1416+ return nil
1417+ }
1418+
1419+ // Get first Mellanox network device name
1420+ netdevName , err := d .getFirstMlxNetdevName (ctx )
1421+ if err != nil {
1422+ log .V (1 ).Info ("No Mellanox network device found" , "error" , err )
1423+ return nil
1424+ }
1425+
1426+ // Get driver version via ethtool
1427+ ethtoolOutput , _ , err := d .cmd .RunCommand (ctx , "ethtool" , "--driver" , netdevName )
1428+ if err != nil {
1429+ log .V (1 ).Info ("Failed to get driver version via ethtool" , "error" , err )
1430+ return nil
1431+ }
1432+
1433+ // Extract version from ethtool output
1434+ lines := strings .Split (ethtoolOutput , "\n " )
1435+ for _ , line := range lines {
1436+ if strings .HasPrefix (line , "version:" ) {
1437+ version := strings .TrimSpace (strings .TrimPrefix (line , "version:" ))
1438+ log .Info ("Current mlx5_core driver version" , "version" , version )
1439+ break
1440+ }
1441+ }
1442+
1443+ return nil
1444+ }
1445+
1446+ // getFirstMlxNetdevName gets the first Mellanox network device name
1447+ func (d * driverMgr ) getFirstMlxNetdevName (ctx context.Context ) (string , error ) {
1448+ // List network devices
1449+ netdevOutput , _ , err := d .cmd .RunCommand (ctx , "ls" , "/sys/class/net/" )
1450+ if err != nil {
1451+ return "" , fmt .Errorf ("failed to list network devices: %w" , err )
1452+ }
1453+
1454+ devices := strings .Fields (netdevOutput )
1455+ for _ , device := range devices {
1456+ // Check if this is a Mellanox device by looking at driver
1457+ driverPath := fmt .Sprintf ("/sys/class/net/%s/device/driver" , device )
1458+ driverLink , _ , err := d .cmd .RunCommand (ctx , "readlink" , driverPath )
1459+ if err != nil {
1460+ continue
1461+ }
1462+
1463+ if strings .Contains (driverLink , "mlx5" ) {
1464+ return device , nil
1465+ }
1466+ }
1467+
1468+ return "" , fmt .Errorf ("no Mellanox network device found" )
1469+ }
1470+
1471+ // unloadStorageModules modifies the openibd script to include storage modules in the unload list
1472+ func (d * driverMgr ) unloadStorageModules (ctx context.Context ) error {
1473+ log := logr .FromContextOrDiscard (ctx )
1474+
1475+ log .V (1 ).Info ("Unloading storage modules" )
1476+
1477+ // Determine the unload storage script path
1478+ unloadStorageScript := "/etc/init.d/openibd"
1479+ if _ , err := d .os .Stat ("/usr/share/mlnx_ofed/mod_load_funcs" ); err == nil {
1480+ unloadStorageScript = "/usr/share/mlnx_ofed/mod_load_funcs"
1481+ }
1482+
1483+ log .V (1 ).Info ("Using unload storage script" , "script" , unloadStorageScript )
1484+
1485+ // Create the sed command to add storage modules to UNLOAD_MODULES
1486+ // This matches the bash script:
1487+ // sed -i -e '/^[[:space:]]*UNLOAD_MODULES="[a-z]/a\ UNLOAD_MODULES="$UNLOAD_MODULES \
1488+ // ib_isert nvme_rdma nvmet_rdma rpcrdma xprtrdma ib_srpt"'
1489+ storageModulesStr := strings .Join (d .cfg .StorageModules , " " )
1490+ sedCommand := fmt .Sprintf (`/^[[:space:]]*UNLOAD_MODULES="[a-z]/a\ UNLOAD_MODULES="$UNLOAD_MODULES %s"` , storageModulesStr )
1491+ log .V (1 ).Info ("Executing sed command" , "sedCommand" , sedCommand , "storageModules" , d .cfg .StorageModules )
1492+
1493+ // Execute sed command to modify the script
1494+ _ , _ , err := d .cmd .RunCommand (ctx , "sed" , "-i" , "-e" , sedCommand , unloadStorageScript )
1495+ if err != nil {
1496+ return fmt .Errorf ("failed to modify unload storage script: %w" , err )
1497+ }
1498+
1499+ // Verify the modification was successful by checking if storage modules are now in the script
1500+ // This matches the bash script: if [ `grep ib_isert ${unload_storage_script} -c` -lt 1 ]; then
1501+ grepCmd := fmt .Sprintf ("grep %s %s -c" , d .cfg .StorageModules [0 ], unloadStorageScript )
1502+ _ , stdout , err := d .cmd .RunCommand (ctx , "sh" , "-c" , grepCmd )
1503+ if err != nil {
1504+ return fmt .Errorf ("failed to verify storage modules injection: %w" , err )
1505+ }
1506+
1507+ count := strings .TrimSpace (stdout )
1508+ log .V (1 ).Info ("Verification result" , "grepCmd" , grepCmd , "count" , count )
1509+
1510+ if count == "0" {
1511+ return fmt .Errorf ("failed to inject storage modules for unload" )
1512+ }
1513+
1514+ log .V (1 ).Info ("Successfully added storage modules to unload script" , "modules" , d .cfg .StorageModules )
1515+ return nil
1516+ }
1517+
11771518// setupSpecialKernelRepos sets up repositories for RT and 64k kernels
11781519func (d * driverMgr ) setupSpecialKernelRepos (ctx context.Context ) error {
11791520 log := logr .FromContextOrDiscard (ctx )
0 commit comments