Skip to content

Commit cce19a7

Browse files
committed
feat: Go implement Load
Signed-off-by: Fred Rolland <[email protected]>
1 parent ea4379e commit cce19a7

File tree

4 files changed

+936
-9
lines changed

4 files changed

+936
-9
lines changed

Golang_Ubuntu_Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ ARG D_OFED_VERSION
7272
ARG D_OFED_SRC_DOWNLOAD_PATH
7373

7474
# Stage args
75-
ARG D_OFED_BASE_URL="https://linux.mellanox.com/public/repo/doca/${D_DOCA_VERSION}/SOURCES/MLNX_OFED"
75+
ARG D_OFED_BASE_URL="https://linux.mellanox.com/public/repo/doca/${D_DOCA_VERSION}/SOURCES/mlnx_ofed"
7676
ARG D_OFED_SRC_TYPE="debian-"
7777

7878
ARG D_OFED_SRC_ARCHIVE="MLNX_OFED_SRC-${D_OFED_SRC_TYPE}${D_OFED_VERSION}.tgz"

entrypoint/internal/config/config.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ type Config struct {
2727
UnloadStorageModules bool `env:"UNLOAD_STORAGE_MODULES"`
2828
CreateIfnamesUdev bool `env:"CREATE_IFNAMES_UDEV"`
2929
EnableNfsRdma bool `env:"ENABLE_NFSRDMA"`
30-
RestoreDriverOnPodTermination bool `env:"RESTORE_DRIVER_ON_POD_TERMINATION" envDefault:"true"`
30+
RestoreDriverOnPodTermination bool `env:"RESTORE_DRIVER_ON_POD_TERMINATION" envDefault:"false"`
3131

3232
// driver manager advanced settings
3333
DriverReadyPath string `env:"DRIVER_READY_PATH" envDefault:"/run/mellanox/drivers/.driver-ready"`

entrypoint/internal/driver/driver.go

Lines changed: 347 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,9 @@ type Interface interface {
7272
}
7373

7474
type driverMgr struct {
75-
cfg config.Config
76-
containerMode string
75+
cfg config.Config
76+
containerMode string
77+
newDriverLoaded bool
7778

7879
cmd cmd.Interface
7980
host host.Interface
@@ -209,16 +210,93 @@ func (d *driverMgr) Load(ctx context.Context) (bool, error) {
209210
if err := d.generateOfedModulesBlacklist(ctx); err != nil {
210211
return false, err
211212
}
212-
if err := d.removeOfedModulesBlacklist(ctx); err != nil {
213-
return false, err
213+
defer func() {
214+
if err := d.removeOfedModulesBlacklist(ctx); err != nil {
215+
log := logr.FromContextOrDiscard(ctx)
216+
log.Error(err, "Failed to remove OFED modules blacklist during cleanup")
217+
}
218+
}()
219+
220+
log := logr.FromContextOrDiscard(ctx)
221+
log.V(1).Info("Loading driver modules")
222+
223+
// Define modules to check
224+
modulesToCheck := []string{"mlx5_core", "mlx5_ib", "ib_core"}
225+
226+
// Add NFS RDMA modules if enabled
227+
if d.cfg.EnableNfsRdma {
228+
modulesToCheck = append(modulesToCheck, "nvme_rdma", "rpcrdma")
214229
}
230+
231+
// Check if loaded kernel modules match expected versions
232+
modulesMatch, err := d.checkLoadedKmodSrcverVsModinfo(ctx, modulesToCheck)
233+
if err != nil {
234+
return false, fmt.Errorf("failed to check module versions: %w", err)
235+
}
236+
237+
if !modulesMatch {
238+
log.V(1).Info("Module versions don't match, restarting driver")
239+
240+
// Restart driver
241+
if err := d.restartDriver(ctx); err != nil {
242+
return false, fmt.Errorf("failed to restart driver: %w", err)
243+
}
244+
245+
// Mark that a new driver was loaded
246+
d.newDriverLoaded = true
247+
248+
// Load NFS RDMA modules if enabled
249+
if d.cfg.EnableNfsRdma {
250+
if err := d.loadNfsRdma(ctx); err != nil {
251+
log.V(1).Info("Failed to load NFS RDMA modules", "error", err)
252+
// Non-fatal error, continue
253+
}
254+
}
255+
} else {
256+
log.V(1).Info("Loaded and candidate drivers are identical, skipping reload")
257+
}
258+
259+
// Print loaded driver version
260+
if err := d.printLoadedDriverVersion(ctx); err != nil {
261+
log.V(1).Info("Failed to print driver version", "error", err)
262+
// Non-fatal error, continue
263+
}
264+
265+
log.Info("Driver loaded successfully")
215266
return true, nil
216267
}
217268

218269
// Unload is the default implementation of the driver.Interface.
219270
func (d *driverMgr) Unload(ctx context.Context) (bool, error) {
220-
// TODO: Implement
221-
return true, nil
271+
log := logr.FromContextOrDiscard(ctx)
272+
273+
if d.newDriverLoaded {
274+
// Check if mlnxofedctl exists
275+
if _, err := d.os.Stat("/usr/sbin/mlnxofedctl"); err == nil {
276+
log.Info("Restoring Mellanox OFED Driver from host...")
277+
278+
// Execute mlnxofedctl --alt-mods force-restart
279+
_, _, err := d.cmd.RunCommand(ctx, "/usr/sbin/mlnxofedctl", "--alt-mods", "force-restart")
280+
if err != nil {
281+
return false, fmt.Errorf("failed to restore driver with mlnxofedctl: %w", err)
282+
}
283+
284+
// Print loaded driver version
285+
if err := d.printLoadedDriverVersion(ctx); err != nil {
286+
log.V(1).Info("Failed to print driver version after restore", "error", err)
287+
// Non-fatal error, continue
288+
}
289+
290+
log.Info("Driver restored successfully")
291+
return true, nil
292+
} else {
293+
log.V(1).Info("mlnxofedctl not found, cannot restore driver")
294+
}
295+
} else {
296+
log.Info("Keeping currently loaded Mellanox OFED Driver...")
297+
}
298+
299+
return false, nil
222300
}
223301

224302
// Clear is the default implementation of the driver.Interface.
@@ -1174,6 +1252,269 @@ func (d *driverMgr) analyzeKernelType(
11741252
return kernelTypeStandard, kVer, rtHpSubstr, releaseverStr
11751253
}
11761254

1255+
// checkLoadedKmodSrcverVsModinfo checks if loaded kernel module srcversion matches modinfo
1256+
func (d *driverMgr) checkLoadedKmodSrcverVsModinfo(ctx context.Context, modules []string) (bool, error) {
1257+
log := logr.FromContextOrDiscard(ctx)
1258+
1259+
// Get list of loaded modules using host interface
1260+
loadedModules, err := d.host.LsMod(ctx)
1261+
if err != nil {
1262+
return false, fmt.Errorf("failed to get loaded modules: %w", err)
1263+
}
1264+
1265+
for _, module := range modules {
1266+
log.V(1).Info("Checking module", "module", module)
1267+
1268+
// Check if module is loaded
1269+
if _, exists := loadedModules[module]; !exists {
1270+
log.V(1).Info("Module not loaded", "module", module)
1271+
return false, nil // Module not loaded, need to reload
1272+
}
1273+
1274+
// Get srcversion from modinfo
1275+
srcverFromModinfo, _, err := d.cmd.RunCommand(ctx, "modinfo", module)
1276+
if err != nil {
1277+
log.V(1).Info("Failed to get modinfo for module", "module", module, "error", err)
1278+
return false, nil // Module not found, need to reload
1279+
}
1280+
1281+
// Extract srcversion from modinfo output
1282+
srcverFromModinfo = strings.TrimSpace(srcverFromModinfo)
1283+
lines := strings.Split(srcverFromModinfo, "\n")
1284+
var modinfoSrcver string
1285+
for _, line := range lines {
1286+
if strings.Contains(line, "srcversion") {
1287+
parts := strings.Fields(line)
1288+
if len(parts) > 0 {
1289+
modinfoSrcver = parts[len(parts)-1]
1290+
break
1291+
}
1292+
}
1293+
}
1294+
1295+
// Get srcversion from sysfs
1296+
sysfsPath := fmt.Sprintf("/sys/module/%s/srcversion", module)
1297+
srcverFromSysfs, _, err := d.cmd.RunCommand(ctx, "cat", sysfsPath)
1298+
if err != nil {
1299+
log.V(1).Info("Failed to read sysfs srcversion for module", "module", module, "error", err)
1300+
return false, nil // Module not loaded, need to reload
1301+
}
1302+
1303+
srcverFromSysfs = strings.TrimSpace(srcverFromSysfs)
1304+
1305+
log.V(1).Info("Module version check", "module", module, "modinfo", modinfoSrcver, "sysfs", srcverFromSysfs)
1306+
1307+
if modinfoSrcver != srcverFromSysfs {
1308+
log.V(1).Info("Module srcversion differs", "module", module)
1309+
return false, nil
1310+
}
1311+
}
1312+
1313+
return true, nil
1314+
}
1315+
1316+
// restartDriver restarts the driver modules
1317+
func (d *driverMgr) restartDriver(ctx context.Context) error {
1318+
log := logr.FromContextOrDiscard(ctx)
1319+
1320+
log.V(1).Info("Restarting driver modules")
1321+
1322+
// Load dependencies
1323+
_, _, err := d.cmd.RunCommand(ctx, "modprobe", "-d", "/host", "tls")
1324+
if err != nil {
1325+
log.V(1).Info("Failed to load tls module", "error", err)
1326+
// Non-fatal, continue
1327+
}
1328+
1329+
_, _, err = d.cmd.RunCommand(ctx, "modprobe", "-d", "/host", "psample")
1330+
if err != nil {
1331+
log.V(1).Info("Failed to load psample module", "error", err)
1332+
// Non-fatal, continue
1333+
}
1334+
1335+
// Check if mlx5_ib depends on macsec and load it if needed
1336+
depends, _, err := d.cmd.RunCommand(ctx, "modinfo", "-F", "depends", "mlx5_ib")
1337+
if err == nil && strings.Contains(depends, "macsec") {
1338+
_, _, err = d.cmd.RunCommand(ctx, "modprobe", "-d", "/host", "macsec")
1339+
if err != nil {
1340+
log.V(1).Info("Failed to load macsec module", "error", err)
1341+
// Non-fatal, continue
1342+
}
1343+
}
1344+
1345+
// Load pci-hyperv-intf if needed (simplified logic)
1346+
arch := d.getArchitecture(ctx)
1347+
if arch != "aarch64" {
1348+
_, _, err = d.cmd.RunCommand(ctx, "modprobe", "-d", "/host", "pci-hyperv-intf")
1349+
if err != nil {
1350+
log.V(1).Info("Failed to load pci-hyperv-intf module", "error", err)
1351+
// Non-fatal, continue
1352+
}
1353+
}
1354+
1355+
// Unload storage modules if enabled
1356+
if d.cfg.UnloadStorageModules {
1357+
if err := d.unloadStorageModules(ctx); err != nil {
1358+
log.V(1).Info("Failed to unload storage modules", "error", err)
1359+
// Non-fatal, continue
1360+
}
1361+
}
1362+
1363+
// Restart openibd service
1364+
_, _, err = d.cmd.RunCommand(ctx, "/etc/init.d/openibd", "restart")
1365+
if err != nil {
1366+
return fmt.Errorf("failed to restart openibd service: %w", err)
1367+
}
1368+
1369+
// Load mlx5_vdpa if available
1370+
_, _, err = d.cmd.RunCommand(ctx, "modinfo", "mlx5_vdpa")
1371+
if err == nil {
1372+
// Module exists, try to load it
1373+
_, _, err = d.cmd.RunCommand(ctx, "modprobe", "mlx5_vdpa")
1374+
if err != nil {
1375+
log.V(1).Info("Failed to load mlx5_vdpa module", "error", err)
1376+
// Non-fatal, continue
1377+
}
1378+
} else {
1379+
log.V(1).Info("mlx5_vdpa module not found, skipping")
1380+
}
1381+
1382+
return nil
1383+
}
1384+
1385+
// loadNfsRdma loads NFS RDMA modules if enabled
1386+
func (d *driverMgr) loadNfsRdma(ctx context.Context) error {
1387+
log := logr.FromContextOrDiscard(ctx)
1388+
1389+
if !d.cfg.EnableNfsRdma {
1390+
return nil
1391+
}
1392+
1393+
log.V(1).Info("Loading NFS RDMA modules")
1394+
1395+
_, _, err := d.cmd.RunCommand(ctx, "modprobe", "rpcrdma")
1396+
if err != nil {
1397+
return fmt.Errorf("failed to load rpcrdma module: %w", err)
1398+
}
1399+
1400+
return nil
1401+
}
1402+
1403+
// printLoadedDriverVersion prints the currently loaded driver version
1404+
func (d *driverMgr) printLoadedDriverVersion(ctx context.Context) error {
1405+
log := logr.FromContextOrDiscard(ctx)
1406+
1407+
// Check if mlx5_core is loaded using host interface
1408+
loadedModules, err := d.host.LsMod(ctx)
1409+
if err != nil {
1410+
return fmt.Errorf("failed to check loaded modules: %w", err)
1411+
}
1412+
1413+
// Check if mlx5_core is loaded
1414+
if _, exists := loadedModules["mlx5_core"]; !exists {
1415+
log.V(1).Info("mlx5_core module not loaded")
1416+
return nil
1417+
}
1418+
1419+
// Get first Mellanox network device name
1420+
netdevName, err := d.getFirstMlxNetdevName(ctx)
1421+
if err != nil {
1422+
log.V(1).Info("No Mellanox network device found", "error", err)
1423+
return nil
1424+
}
1425+
1426+
// Get driver version via ethtool
1427+
ethtoolOutput, _, err := d.cmd.RunCommand(ctx, "ethtool", "--driver", netdevName)
1428+
if err != nil {
1429+
log.V(1).Info("Failed to get driver version via ethtool", "error", err)
1430+
return nil
1431+
}
1432+
1433+
// Extract version from ethtool output
1434+
lines := strings.Split(ethtoolOutput, "\n")
1435+
for _, line := range lines {
1436+
if strings.HasPrefix(line, "version:") {
1437+
version := strings.TrimSpace(strings.TrimPrefix(line, "version:"))
1438+
log.Info("Current mlx5_core driver version", "version", version)
1439+
break
1440+
}
1441+
}
1442+
1443+
return nil
1444+
}
1445+
1446+
// getFirstMlxNetdevName gets the first Mellanox network device name
1447+
func (d *driverMgr) getFirstMlxNetdevName(ctx context.Context) (string, error) {
1448+
// List network devices
1449+
netdevOutput, _, err := d.cmd.RunCommand(ctx, "ls", "/sys/class/net/")
1450+
if err != nil {
1451+
return "", fmt.Errorf("failed to list network devices: %w", err)
1452+
}
1453+
1454+
devices := strings.Fields(netdevOutput)
1455+
for _, device := range devices {
1456+
// Check if this is a Mellanox device by looking at driver
1457+
driverPath := fmt.Sprintf("/sys/class/net/%s/device/driver", device)
1458+
driverLink, _, err := d.cmd.RunCommand(ctx, "readlink", driverPath)
1459+
if err != nil {
1460+
continue
1461+
}
1462+
1463+
if strings.Contains(driverLink, "mlx5") {
1464+
return device, nil
1465+
}
1466+
}
1467+
1468+
return "", fmt.Errorf("no Mellanox network device found")
1469+
}
1470+
1471+
// unloadStorageModules modifies the openibd script to include storage modules in the unload list
1472+
func (d *driverMgr) unloadStorageModules(ctx context.Context) error {
1473+
log := logr.FromContextOrDiscard(ctx)
1474+
1475+
log.V(1).Info("Unloading storage modules")
1476+
1477+
// Determine the unload storage script path
1478+
unloadStorageScript := "/etc/init.d/openibd"
1479+
if _, err := d.os.Stat("/usr/share/mlnx_ofed/mod_load_funcs"); err == nil {
1480+
unloadStorageScript = "/usr/share/mlnx_ofed/mod_load_funcs"
1481+
}
1482+
1483+
log.V(1).Info("Using unload storage script", "script", unloadStorageScript)
1484+
1485+
// Create the sed command to add storage modules to UNLOAD_MODULES
1486+
// This matches the bash script:
1487+
// sed -i -e '/^[[:space:]]*UNLOAD_MODULES="[a-z]/a\ UNLOAD_MODULES="$UNLOAD_MODULES \
1488+
// ib_isert nvme_rdma nvmet_rdma rpcrdma xprtrdma ib_srpt"'
1489+
storageModulesStr := strings.Join(d.cfg.StorageModules, " ")
1490+
sedCommand := fmt.Sprintf(`/^[[:space:]]*UNLOAD_MODULES="[a-z]/a\ UNLOAD_MODULES="$UNLOAD_MODULES %s"`, storageModulesStr)
1491+
log.V(1).Info("Executing sed command", "sedCommand", sedCommand, "storageModules", d.cfg.StorageModules)
1492+
1493+
// Execute sed command to modify the script
1494+
_, _, err := d.cmd.RunCommand(ctx, "sed", "-i", "-e", sedCommand, unloadStorageScript)
1495+
if err != nil {
1496+
return fmt.Errorf("failed to modify unload storage script: %w", err)
1497+
}
1498+
1499+
// Verify the modification was successful by checking if storage modules are now in the script
1500+
// This matches the bash script: if [ `grep ib_isert ${unload_storage_script} -c` -lt 1 ]; then
1501+
grepCmd := fmt.Sprintf("grep %s %s -c", d.cfg.StorageModules[0], unloadStorageScript)
1502+
_, stdout, err := d.cmd.RunCommand(ctx, "sh", "-c", grepCmd)
1503+
if err != nil {
1504+
return fmt.Errorf("failed to verify storage modules injection: %w", err)
1505+
}
1506+
1507+
count := strings.TrimSpace(stdout)
1508+
log.V(1).Info("Verification result", "grepCmd", grepCmd, "count", count)
1509+
1510+
if count == "0" {
1511+
return fmt.Errorf("failed to inject storage modules for unload")
1512+
}
1513+
1514+
log.V(1).Info("Successfully added storage modules to unload script", "modules", d.cfg.StorageModules)
1515+
return nil
1516+
}
1517+
11771518
// setupSpecialKernelRepos sets up repositories for RT and 64k kernels
11781519
func (d *driverMgr) setupSpecialKernelRepos(ctx context.Context) error {
11791520
log := logr.FromContextOrDiscard(ctx)

0 commit comments

Comments
 (0)