Skip to content

Commit b0b38ba

Browse files
feat: detect config changes (version, kernel, module params, RDMA) to trigger driver reinstall
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 37ae9a0 commit b0b38ba

File tree

1 file changed

+121
-49
lines changed

1 file changed

+121
-49
lines changed

cmd/driver-manager/main.go

Lines changed: 121 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package main
2020

2121
import (
22+
"bytes"
2223
"context"
2324
"errors"
2425
"fmt"
@@ -40,14 +41,25 @@ import (
4041
)
4142

4243
// Fixed paths, identifiers and default durations used by the driver manager.
const (
43-
driverRoot = "/run/nvidia/driver"
44-
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45-
operatorNamespace = "gpu-operator"
46-
pausedStr = "paused-for-driver-upgrade"
47-
defaultDrainTimeout = time.Second * 0
48-
defaultGracePeriod = 5 * time.Minute
44+
driverRoot = "/run/nvidia/driver"
45+
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
46+
// driverConfigStateFile is the file hasDriverConfigChanged reads back as the
// previously stored driver configuration.
driverConfigStateFile = "/run/nvidia/driver-config.state"
47+
operatorNamespace = "gpu-operator"
48+
pausedStr = "paused-for-driver-upgrade"
49+
defaultDrainTimeout = time.Second * 0
50+
defaultGracePeriod = 5 * time.Minute
4951

5052
nvidiaDomainPrefix = "nvidia.com"
53+
)
54+
55+
var (
56+
// Driver module config files
57+
driverConfigFiles = []string{
58+
"/drivers/nvidia.conf",
59+
"/drivers/nvidia-uvm.conf",
60+
"/drivers/nvidia-modeset.conf",
61+
"/drivers/nvidia-peermem.conf",
62+
}
5163

5264
nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -305,7 +317,24 @@ func (dm *DriverManager) uninstallDriver() error {
305317
}
306318

307319
if skip, reason := dm.shouldSkipUninstall(); skip {
308-
dm.log.Infof("Skipping driver uninstall: %s", reason)
320+
dm.log.Infof("Fast path activated: %s", reason)
321+
322+
// Clean up stale artifacts from previous container before rescheduling operands
323+
dm.log.Info("Cleaning up stale mounts and state files...")
324+
325+
// Unmount stale rootfs from previous container
326+
if err := dm.unmountRootfs(); err != nil {
327+
return fmt.Errorf("failed to unmount stale rootfs: %w", err)
328+
}
329+
330+
// Remove stale PID file from previous container
331+
if _, err := os.Stat(driverPIDFile); err == nil {
332+
if err := os.Remove(driverPIDFile); err != nil {
333+
dm.log.Warnf("Failed to remove PID file: %v", err)
334+
}
335+
}
336+
337+
// Now safe to reschedule operands
309338
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
310339
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
311340
}
@@ -653,68 +682,111 @@ func (dm *DriverManager) isDriverLoaded() bool {
653682
return err == nil
654683
}
655684

656-
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
657-
if dm.config.forceReinstall {
658-
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
659-
return false, ""
685+
// getConfigValueOrDefault resolves a single configuration value.
//
// Despite the name, defaultVal takes precedence: when defaultVal is non-empty
// it is returned as-is and config is not consulted. Only when defaultVal is
// empty is config searched, line by line, for the first line that starts with
// "key=", and the remainder of that line is returned.
//
// Parameters:
//   - config: newline-separated "KEY=value" text (for example a previously
//     stored configuration).
//   - key: the exact key to look up; it must be followed immediately by "=".
//   - defaultVal: the preferred value; empty means "look it up in config".
//
// Returns "" when defaultVal is empty and no line in config matches key.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal != "" {
		return defaultVal
	}
	// Build the prefix once rather than on every line.
	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimPrefix(line, prefix)
		}
	}
	return ""
}
661697

662-
if !dm.isDriverLoaded() {
663-
return false, ""
698+
// getKernelVersion returns the current kernel version
699+
func getKernelVersion() string {
700+
var utsname unix.Utsname
701+
if err := unix.Uname(&utsname); err != nil {
702+
return ""
664703
}
704+
return string(utsname.Release[:bytes.IndexByte(utsname.Release[:], 0)])
705+
}
665706

666-
if dm.config.driverVersion == "" {
667-
return false, "Driver version environment variable is not set"
707+
// buildCurrentConfig renders the current driver configuration as text: one
// "KEY=value" line per setting, followed by the raw contents of every readable,
// non-empty file listed in driverConfigFiles.
//
// storedConfig is the previously stored configuration text. It is used only as
// a fallback for DRIVER_VERSION, KERNEL_VERSION, KERNEL_MODULE_TYPE and
// DRIVER_TYPE: getConfigValueOrDefault returns the live value (its third
// argument) when that is non-empty and looks the key up in storedConfig
// otherwise.
//
// The returned string is compared byte-for-byte against the stored state by
// hasDriverConfigChanged, so the order and format of the lines matter.
708+
func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
709+
driverVersion := getConfigValueOrDefault(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)
710+
kernelVersion := getConfigValueOrDefault(storedConfig, "KERNEL_VERSION", getKernelVersion())
711+
kernelModuleType := getConfigValueOrDefault(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))
712+
driverType := getConfigValueOrDefault(storedConfig, "DRIVER_TYPE", os.Getenv("DRIVER_TYPE"))
713+
714+
// Read module parameters from conf files; each is flattened to a single line
// by readModuleParams, and is "" when the file cannot be read.
715+
nvidiaParams := readModuleParams("/drivers/nvidia.conf")
716+
nvidiaUvmParams := readModuleParams("/drivers/nvidia-uvm.conf")
717+
nvidiaModeset := readModuleParams("/drivers/nvidia-modeset.conf")
718+
nvidiaPeermem := readModuleParams("/drivers/nvidia-peermem.conf")
719+
720+
var config strings.Builder
721+
config.WriteString(fmt.Sprintf("DRIVER_VERSION=%s\n", driverVersion))
722+
config.WriteString(fmt.Sprintf("DRIVER_TYPE=%s\n", driverType))
723+
config.WriteString(fmt.Sprintf("KERNEL_VERSION=%s\n", kernelVersion))
724+
config.WriteString(fmt.Sprintf("GPU_DIRECT_RDMA_ENABLED=%v\n", dm.config.gpuDirectRDMAEnabled))
725+
config.WriteString(fmt.Sprintf("USE_HOST_MOFED=%v\n", dm.config.useHostMofed))
726+
config.WriteString(fmt.Sprintf("KERNEL_MODULE_TYPE=%s\n", kernelModuleType))
727+
config.WriteString(fmt.Sprintf("NVIDIA_MODULE_PARAMS=%s\n", nvidiaParams))
728+
config.WriteString(fmt.Sprintf("NVIDIA_UVM_MODULE_PARAMS=%s\n", nvidiaUvmParams))
729+
config.WriteString(fmt.Sprintf("NVIDIA_MODESET_MODULE_PARAMS=%s\n", nvidiaModeset))
730+
config.WriteString(fmt.Sprintf("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n", nvidiaPeermem))
731+
732+
// Append config file contents directly: the bytes are written unmodified and
// no separator is added between files. Unreadable or empty files are skipped
// without an error.
733+
for _, file := range driverConfigFiles {
734+
if data, err := os.ReadFile(file); err == nil && len(data) > 0 {
735+
config.Write(data)
736+
}
668737
}
669738

670-
version, err := dm.detectCurrentDriverVersion()
739+
return config.String()
740+
}
741+
742+
// readModuleParams returns the contents of the file at filepath as a single
// line: surrounding whitespace is trimmed and every remaining newline is
// replaced with one space. It returns "" when the file cannot be read, whatever
// the error.
743+
func readModuleParams(filepath string) string {
744+
data, err := os.ReadFile(filepath)
671745
if err != nil {
672-
dm.log.Warnf("Unable to determine installed driver version: %v", err)
673-
// If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674-
dm.log.Info("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed")
675-
return false, ""
746+
return ""
676747
}
748+
// Convert newlines to spaces to match bash implementation. Consecutive
// newlines therefore become consecutive spaces; they are not collapsed.
749+
return strings.ReplaceAll(strings.TrimSpace(string(data)), "\n", " ")
750+
}
677751

678-
if version != dm.config.driverVersion {
679-
dm.log.Infof("Installed driver version %s does not match desired %s, proceeding with uninstall", version, dm.config.driverVersion)
680-
return false, ""
752+
// hasDriverConfigChanged reports whether the configuration produced by
// buildCurrentConfig differs from the text stored in driverConfigStateFile.
//
// It returns (true, reason) when the state file does not exist, when it cannot
// be read (a warning is logged in that case), or when the stored text is not
// exactly equal to the current configuration. It returns (false, "") only when
// the two strings are identical.
753+
func (dm *DriverManager) hasDriverConfigChanged() (bool, string) {
754+
storedData, err := os.ReadFile(driverConfigStateFile)
755+
if err != nil {
756+
if os.IsNotExist(err) {
757+
return true, "no previous driver configuration found"
758+
}
759+
dm.log.Warnf("Failed to read driver config state file: %v", err)
760+
return true, "unable to read previous driver configuration"
681761
}
682762

683-
dm.log.Infof("Installed driver version %s matches desired version, skipping uninstall", version)
684-
return true, "desired version already present"
685-
}
763+
storedConfig := string(storedData)
764+
currentConfig := dm.buildCurrentConfig(storedConfig)
686765

687-
func (dm *DriverManager) detectCurrentDriverVersion() (string, error) {
688-
baseCtx := dm.ctx
689-
if baseCtx == nil {
690-
baseCtx = context.Background()
766+
if currentConfig == storedConfig {
767+
return false, ""
691768
}
692769

693-
ctx, cancel := context.WithTimeout(baseCtx, 10*time.Second)
694-
defer cancel()
770+
return true, "driver configuration changed"
771+
}
695772

696-
// Try chroot to /run/nvidia/driver for containerized driver
697-
cmd := exec.CommandContext(ctx, "chroot", "/run/nvidia/driver", "modinfo", "-F", "version", "nvidia")
698-
cmd.Env = append(os.Environ(), "LC_ALL=C")
699-
cmdOutput, chrootErr := cmd.Output()
700-
if chrootErr == nil {
701-
version := strings.TrimSpace(string(cmdOutput))
702-
if version != "" {
703-
dm.log.Infof("Driver version detected via chroot: %s", version)
704-
return version, nil
705-
}
773+
// shouldSkipUninstall decides whether the driver uninstall can be skipped.
//
// It returns (true, reason) only when force reinstall is disabled,
// isDriverLoaded reports true and hasDriverConfigChanged reports no change. In
// every other case it returns (false, "").
//
// NOTE(review): the reason string returned by hasDriverConfigChanged is
// discarded below, so the log does not say why the fast path was rejected;
// consider logging it.
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
774+
if dm.config.forceReinstall {
775+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
776+
return false, ""
706777
}
707778

708-
// Second try to read from /sys/module/nvidia/version if available
709-
if versionData, err := os.ReadFile("/sys/module/nvidia/version"); err == nil {
710-
version := strings.TrimSpace(string(versionData))
711-
if version != "" {
712-
dm.log.Infof("Driver version detected from /sys/module/nvidia/version: %s", version)
713-
return version, nil
779+
// Only skip uninstall if driver IS loaded AND config matches (fast path optimization)
780+
if dm.isDriverLoaded() {
781+
if configChanged, _ := dm.hasDriverConfigChanged(); !configChanged {
782+
dm.log.Info("Driver is loaded with matching config, enabling fast path")
783+
return true, "desired version and configuration already present"
714784
}
715785
}
716786

717-
return "", fmt.Errorf("all version detection methods failed: chroot: %v", chrootErr)
787+
// Driver not loaded or config changed - proceed with cleanup
788+
dm.log.Info("Proceeding with cleanup operations")
789+
return false, ""
718790
}
719791

720792
func (dm *DriverManager) isNouveauLoaded() bool {

0 commit comments

Comments
 (0)