Skip to content

Commit 5fba425

Browse files
feat: detect config changes (version, kernel, module params, RDMA) to trigger driver reinstall
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 37ae9a0 commit 5fba425

File tree

1 file changed

+83
-48
lines changed

1 file changed

+83
-48
lines changed

cmd/driver-manager/main.go

Lines changed: 83 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
package main
2020

2121
import (
22+
"bytes"
2223
"context"
2324
"errors"
2425
"fmt"
@@ -40,14 +41,25 @@ import (
4041
)
4142

4243
const (
43-
driverRoot = "/run/nvidia/driver"
44-
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
45-
operatorNamespace = "gpu-operator"
46-
pausedStr = "paused-for-driver-upgrade"
47-
defaultDrainTimeout = time.Second * 0
48-
defaultGracePeriod = 5 * time.Minute
44+
driverRoot = "/run/nvidia/driver"
45+
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
46+
driverConfigStateFile = "/run/nvidia/driver-config.state"
47+
operatorNamespace = "gpu-operator"
48+
pausedStr = "paused-for-driver-upgrade"
49+
defaultDrainTimeout = time.Second * 0
50+
defaultGracePeriod = 5 * time.Minute
4951

5052
nvidiaDomainPrefix = "nvidia.com"
53+
)
54+
55+
var (
56+
// Driver module config files
57+
driverConfigFiles = []string{
58+
"/drivers/nvidia.conf",
59+
"/drivers/nvidia-uvm.conf",
60+
"/drivers/nvidia-modeset.conf",
61+
"/drivers/nvidia-peermem.conf",
62+
}
5163

5264
nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -653,68 +665,91 @@ func (dm *DriverManager) isDriverLoaded() bool {
653665
return err == nil
654666
}
655667

656-
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
657-
if dm.config.forceReinstall {
658-
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
659-
return false, ""
668+
// getConfigValueOrDefault resolves a single configuration value.
//
// Despite its name, defaultVal takes precedence: when defaultVal is non-empty
// it is returned unchanged and config is never consulted. Only when defaultVal
// is empty is config scanned, as newline-separated KEY=VALUE lines, and the
// value of the first line beginning with key followed by "=" is returned.
// The result is "" when defaultVal is empty and no such line exists.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal != "" {
		return defaultVal
	}

	// Build the "KEY=" prefix once instead of on every iteration.
	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		if strings.HasPrefix(line, prefix) {
			// Everything after the first "KEY=" is the value, including any
			// further "=" characters.
			return line[len(prefix):]
		}
	}
	return ""
}
661680

662-
if !dm.isDriverLoaded() {
663-
return false, ""
681+
// getKernelVersion returns the current kernel version
682+
func getKernelVersion() string {
683+
var utsname unix.Utsname
684+
if err := unix.Uname(&utsname); err != nil {
685+
return ""
664686
}
687+
return string(utsname.Release[:bytes.IndexByte(utsname.Release[:], 0)])
688+
}
689+
690+
// buildCurrentConfig constructs the current driver configuration string
691+
func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
692+
driverVersion := getConfigValueOrDefault(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)
693+
kernelVersion := getConfigValueOrDefault(storedConfig, "KERNEL_VERSION", getKernelVersion())
694+
kernelModuleType := getConfigValueOrDefault(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))
695+
696+
var config strings.Builder
697+
config.WriteString(fmt.Sprintf("DRIVER_VERSION=%s\n", driverVersion))
698+
config.WriteString(fmt.Sprintf("KERNEL_VERSION=%s\n", kernelVersion))
699+
config.WriteString(fmt.Sprintf("GPU_DIRECT_RDMA_ENABLED=%v\n", dm.config.gpuDirectRDMAEnabled))
700+
config.WriteString(fmt.Sprintf("USE_HOST_MOFED=%v\n", dm.config.useHostMofed))
701+
config.WriteString(fmt.Sprintf("KERNEL_MODULE_TYPE=%s\n", kernelModuleType))
665702

666-
if dm.config.driverVersion == "" {
667-
return false, "Driver version environment variable is not set"
703+
// Append config file contents directly
704+
for _, file := range driverConfigFiles {
705+
if data, err := os.ReadFile(file); err == nil && len(data) > 0 {
706+
config.Write(data)
707+
}
668708
}
669709

670-
version, err := dm.detectCurrentDriverVersion()
710+
return config.String()
711+
}
712+
713+
// hasDriverConfigChanged checks if the current driver configuration differs from stored state
714+
func (dm *DriverManager) hasDriverConfigChanged() (bool, string) {
715+
storedData, err := os.ReadFile(driverConfigStateFile)
671716
if err != nil {
672-
dm.log.Warnf("Unable to determine installed driver version: %v", err)
673-
// If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674-
dm.log.Info("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed")
675-
return false, ""
717+
if os.IsNotExist(err) {
718+
return true, "no previous driver configuration found"
719+
}
720+
dm.log.Warnf("Failed to read driver config state file: %v", err)
721+
return true, "unable to read previous driver configuration"
676722
}
677723

678-
if version != dm.config.driverVersion {
679-
dm.log.Infof("Installed driver version %s does not match desired %s, proceeding with uninstall", version, dm.config.driverVersion)
724+
storedConfig := string(storedData)
725+
currentConfig := dm.buildCurrentConfig(storedConfig)
726+
727+
if currentConfig == storedConfig {
680728
return false, ""
681729
}
682730

683-
dm.log.Infof("Installed driver version %s matches desired version, skipping uninstall", version)
684-
return true, "desired version already present"
731+
return true, "driver configuration changed"
685732
}
686733

687-
func (dm *DriverManager) detectCurrentDriverVersion() (string, error) {
688-
baseCtx := dm.ctx
689-
if baseCtx == nil {
690-
baseCtx = context.Background()
734+
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
735+
if dm.config.forceReinstall {
736+
dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
737+
return false, ""
691738
}
692739

693-
ctx, cancel := context.WithTimeout(baseCtx, 10*time.Second)
694-
defer cancel()
695-
696-
// Try chroot to /run/nvidia/driver for containerized driver
697-
cmd := exec.CommandContext(ctx, "chroot", "/run/nvidia/driver", "modinfo", "-F", "version", "nvidia")
698-
cmd.Env = append(os.Environ(), "LC_ALL=C")
699-
cmdOutput, chrootErr := cmd.Output()
700-
if chrootErr == nil {
701-
version := strings.TrimSpace(string(cmdOutput))
702-
if version != "" {
703-
dm.log.Infof("Driver version detected via chroot: %s", version)
704-
return version, nil
705-
}
740+
if !dm.isDriverLoaded() {
741+
dm.log.Info("Driver not currently loaded, proceeding with installation")
742+
return false, ""
706743
}
707744

708-
// Second try to read from /sys/module/nvidia/version if available
709-
if versionData, err := os.ReadFile("/sys/module/nvidia/version"); err == nil {
710-
version := strings.TrimSpace(string(versionData))
711-
if version != "" {
712-
dm.log.Infof("Driver version detected from /sys/module/nvidia/version: %s", version)
713-
return version, nil
714-
}
745+
// Check if driver configuration (including version) has changed
746+
if configChanged, reason := dm.hasDriverConfigChanged(); configChanged {
747+
dm.log.Infof("Driver configuration has changed: %s", reason)
748+
return false, reason
715749
}
716750

717-
return "", fmt.Errorf("all version detection methods failed: chroot: %v", chrootErr)
751+
dm.log.Info("Installed driver version and configuration match desired state, skipping uninstall")
752+
return true, "desired version and configuration already present"
718753
}
719754

720755
func (dm *DriverManager) isNouveauLoaded() bool {

0 commit comments

Comments
 (0)