diff --git a/cmd/driver-manager/main.go b/cmd/driver-manager/main.go index 2a1857fd..72cecbb8 100644 --- a/cmd/driver-manager/main.go +++ b/cmd/driver-manager/main.go @@ -19,6 +19,7 @@ package main import ( + "bytes" "context" "errors" "fmt" @@ -40,14 +41,25 @@ import ( ) const ( - driverRoot = "/run/nvidia/driver" - driverPIDFile = "/run/nvidia/nvidia-driver.pid" - operatorNamespace = "gpu-operator" - pausedStr = "paused-for-driver-upgrade" - defaultDrainTimeout = time.Second * 0 - defaultGracePeriod = 5 * time.Minute + driverRoot = "/run/nvidia/driver" + driverPIDFile = "/run/nvidia/nvidia-driver.pid" + driverConfigStateFile = "/run/nvidia/driver-config.state" + operatorNamespace = "gpu-operator" + pausedStr = "paused-for-driver-upgrade" + defaultDrainTimeout = time.Second * 0 + defaultGracePeriod = 5 * time.Minute nvidiaDomainPrefix = "nvidia.com" +) + +var ( + // Driver module config files + driverConfigFiles = []string{ + "/drivers/nvidia.conf", + "/drivers/nvidia-uvm.conf", + "/drivers/nvidia-modeset.conf", + "/drivers/nvidia-peermem.conf", + } nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver" nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator" @@ -77,6 +89,8 @@ type config struct { gpuDirectRDMAEnabled bool useHostMofed bool kubeconfig string + driverVersion string + forceReinstall bool } // ComponentState tracks the deployment state of GPU operator components @@ -208,6 +222,20 @@ func main() { EnvVars: []string{"KUBECONFIG"}, Value: "", }, + &cli.StringFlag{ + Name: "driver-version", + Usage: "Desired NVIDIA driver version", + Destination: &cfg.driverVersion, + EnvVars: []string{"DRIVER_VERSION"}, + Value: "", + }, + &cli.BoolFlag{ + Name: "force-reinstall", + Usage: "Force driver reinstall regardless of current state", + Destination: &cfg.forceReinstall, + EnvVars: []string{"FORCE_REINSTALL"}, + Value: false, + }, } app.Commands = []*cli.Command{ @@ -288,6 +316,14 @@ func (dm 
*DriverManager) uninstallDriver() error {
 		return fmt.Errorf("failed to evict GPU operator components: %w", err)
 	}
 
+	if skip, reason := dm.shouldSkipUninstall(); skip {
+		dm.log.Infof("Skipping driver uninstall: %s", reason)
+		if err := dm.rescheduleGPUOperatorComponents(); err != nil {
+			dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
+		}
+		return nil
+	}
+
 	drainOpts := kube.DrainOptions{
 		Force:              dm.config.drainUseForce,
 		DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData,
@@ -629,6 +665,126 @@ func (dm *DriverManager) isDriverLoaded() bool {
 	return err == nil
 }
 
+// getConfigValueOrDefault returns defaultVal when it is non-empty. Only when
+// defaultVal is empty does it look up key in config, which is parsed as
+// newline-separated KEY=VALUE lines; the first matching line wins. It returns
+// "" when defaultVal is empty and key is not present.
+//
+// Note the precedence: despite its name, defaultVal overrides config.
+// buildCurrentConfig relies on this by passing the currently desired value as
+// defaultVal and the previously stored state as config.
+func getConfigValueOrDefault(config, key, defaultVal string) string {
+	if defaultVal != "" {
+		return defaultVal
+	}
+	prefix := key + "="
+	for _, line := range strings.Split(config, "\n") {
+		if strings.HasPrefix(line, prefix) {
+			return strings.TrimPrefix(line, prefix)
+		}
+	}
+	return ""
+}
+
+// getKernelVersion returns the kernel release reported by unix.Uname, or ""
+// if the call fails.
+func getKernelVersion() string {
+	var utsname unix.Utsname
+	if err := unix.Uname(&utsname); err != nil {
+		return ""
+	}
+	release := utsname.Release[:]
+	// Cut at the first NUL. bytes.IndexByte returns -1 when there is none, and
+	// slicing with -1 would panic, so in that case the whole buffer is used.
+	if end := bytes.IndexByte(release, 0); end >= 0 {
+		release = release[:end]
+	}
+	return string(release)
+}
+
+// buildCurrentConfig renders the current driver configuration as KEY=VALUE
+// lines followed by the raw contents of each readable file in
+// driverConfigFiles.
+//
+// DRIVER_VERSION, KERNEL_VERSION and KERNEL_MODULE_TYPE use the current value
+// when it is non-empty and otherwise carry over the value from storedConfig.
+func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
+	driverVersion := getConfigValueOrDefault(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)
+	kernelVersion := getConfigValueOrDefault(storedConfig, "KERNEL_VERSION", getKernelVersion())
+	kernelModuleType := getConfigValueOrDefault(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))
+
+	var config strings.Builder
+	fmt.Fprintf(&config, "DRIVER_VERSION=%s\n", driverVersion)
+	fmt.Fprintf(&config, "KERNEL_VERSION=%s\n", kernelVersion)
+	fmt.Fprintf(&config, "GPU_DIRECT_RDMA_ENABLED=%v\n", dm.config.gpuDirectRDMAEnabled)
+	fmt.Fprintf(&config, "USE_HOST_MOFED=%v\n", dm.config.useHostMofed)
+	fmt.Fprintf(&config, "KERNEL_MODULE_TYPE=%s\n", kernelModuleType)
+
+	// Append config file contents verbatim. Files that do not exist are
+	// skipped silently; any other read error is logged instead of being
+	// swallowed, since it also leaves that file out of the result.
+	for _, file := range driverConfigFiles {
+		data, err := os.ReadFile(file)
+		if err != nil {
+			if !os.IsNotExist(err) {
+				dm.log.Warnf("Failed to read driver config file %s: %v", file, err)
+			}
+			continue
+		}
+		config.Write(data)
+	}
+
+	return config.String()
+}
+
+// hasDriverConfigChanged compares the configuration built by
+// buildCurrentConfig with the contents of driverConfigStateFile. It returns
+// false and "" when they are identical; otherwise it returns true and a short
+// reason. A state file that is missing or unreadable counts as changed.
+func (dm *DriverManager) hasDriverConfigChanged() (bool, string) {
+	storedData, err := os.ReadFile(driverConfigStateFile)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return true, "no previous driver configuration found"
+		}
+		dm.log.Warnf("Failed to read driver config state file: %v", err)
+		return true, "unable to read previous driver configuration"
+	}
+
+	storedConfig := string(storedData)
+	currentConfig := dm.buildCurrentConfig(storedConfig)
+
+	if currentConfig == storedConfig {
+		return false, ""
+	}
+
+	return true, "driver configuration changed"
+}
+
+// shouldSkipUninstall reports whether the driver uninstall can be skipped,
+// together with a reason. It returns true only when forceReinstall is unset,
+// the driver is loaded, and hasDriverConfigChanged reports no change.
+func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
+	if dm.config.forceReinstall {
+		dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
+		return false, ""
+	}
+
+	if !dm.isDriverLoaded() {
+		dm.log.Info("Driver not currently loaded, proceeding with installation")
+		return false, ""
+	}
+
+	// Check if driver configuration (including version) has changed
+	if configChanged, reason := dm.hasDriverConfigChanged(); configChanged {
+		dm.log.Infof("Driver configuration has changed: %s", reason)
+		return false, reason
+	}
+
+	dm.log.Info("Installed driver version and configuration match desired state, skipping uninstall")
+	return true, "desired version and configuration already present"
+}
+
 func (dm *DriverManager) isNouveauLoaded() bool {
 	_, err := os.Stat("/sys/module/nouveau/refcnt")
 	return err == nil