Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 129 additions & 6 deletions cmd/driver-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package main

import (
"bytes"
"context"
"errors"
"fmt"
Expand All @@ -40,14 +41,25 @@ import (
)

const (
driverRoot = "/run/nvidia/driver"
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
operatorNamespace = "gpu-operator"
pausedStr = "paused-for-driver-upgrade"
defaultDrainTimeout = time.Second * 0
defaultGracePeriod = 5 * time.Minute
driverRoot = "/run/nvidia/driver"
driverPIDFile = "/run/nvidia/nvidia-driver.pid"
driverConfigStateFile = "/run/nvidia/driver-config.state"
operatorNamespace = "gpu-operator"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Read this from the OPERATOR_NAMESPACE environment variable instead of hard-coding it.

Copy link
Member Author

@karthikvetrivel karthikvetrivel Nov 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This definition was already in the original version; I'm not sure why it shows up in the diff. If OPERATOR_NAMESPACE is set, that value is used. Otherwise, the flag falls back to its Value field as the default, which is what this constant is used for.

pausedStr = "paused-for-driver-upgrade"
defaultDrainTimeout = time.Second * 0
defaultGracePeriod = 5 * time.Minute

nvidiaDomainPrefix = "nvidia.com"
)

var (
// Driver module config files
driverConfigFiles = []string{
"/drivers/nvidia.conf",
"/drivers/nvidia-uvm.conf",
"/drivers/nvidia-modeset.conf",
"/drivers/nvidia-peermem.conf",
}

nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
Expand Down Expand Up @@ -77,6 +89,8 @@ type config struct {
gpuDirectRDMAEnabled bool
useHostMofed bool
kubeconfig string
driverVersion string
forceReinstall bool
}

// ComponentState tracks the deployment state of GPU operator components
Expand Down Expand Up @@ -208,6 +222,20 @@ func main() {
EnvVars: []string{"KUBECONFIG"},
Value: "",
},
&cli.StringFlag{
Name: "driver-version",
Usage: "Desired NVIDIA driver version",
Destination: &cfg.driverVersion,
EnvVars: []string{"DRIVER_VERSION"},
Value: "",
},
&cli.BoolFlag{
Name: "force-reinstall",
Usage: "Force driver reinstall regardless of current state",
Destination: &cfg.forceReinstall,
EnvVars: []string{"FORCE_REINSTALL"},
Value: false,
},
}

app.Commands = []*cli.Command{
Expand Down Expand Up @@ -288,6 +316,14 @@ func (dm *DriverManager) uninstallDriver() error {
return fmt.Errorf("failed to evict GPU operator components: %w", err)
}

if skip, reason := dm.shouldSkipUninstall(); skip {
dm.log.Infof("Skipping driver uninstall: %s", reason)
if err := dm.rescheduleGPUOperatorComponents(); err != nil {
dm.log.Warnf("Failed to reschedule GPU operator components: %v", err)
}
return nil
}

drainOpts := kube.DrainOptions{
Force: dm.config.drainUseForce,
DeleteEmptyDirData: dm.config.drainDeleteEmptyDirData,
Expand Down Expand Up @@ -629,6 +665,93 @@ func (dm *DriverManager) isDriverLoaded() bool {
return err == nil
}

// getConfigValueOrDefault resolves the value to use for key.
//
// A non-empty defaultVal always takes precedence and is returned unchanged.
// Only when defaultVal is empty is config scanned, line by line, for the first
// line starting with "key="; the remainder of that line is returned. If
// defaultVal is empty and no line matches, the result is "".
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal == "" {
		prefix := key + "="
		for _, entry := range strings.Split(config, "\n") {
			if strings.HasPrefix(entry, prefix) {
				// Everything after "key=" is the value, including any further '='.
				return entry[len(prefix):]
			}
		}
	}
	return defaultVal
}

// getKernelVersion returns the kernel release string reported by unix.Uname,
// or "" if the Uname call fails.
func getKernelVersion() string {
	var utsname unix.Utsname
	if err := unix.Uname(&utsname); err != nil {
		return ""
	}

	release := utsname.Release[:]
	// Cut the buffer at the first NUL byte. If no NUL is present, IndexByte
	// returns -1 and slicing with it would panic, so keep the whole buffer.
	if end := bytes.IndexByte(release, 0); end >= 0 {
		release = release[:end]
	}
	return string(release)
}

// buildCurrentConfig renders the driver configuration as a newline-separated
// list of KEY=value entries followed by the raw contents of the driver module
// config files.
//
// DRIVER_VERSION, KERNEL_VERSION and KERNEL_MODULE_TYPE are taken from the
// driver manager config, getKernelVersion and the KERNEL_MODULE_TYPE environment
// variable respectively; when one of those is empty, the matching entry from
// storedConfig is used instead (see getConfigValueOrDefault). The two boolean
// settings always come from the driver manager config.
func (dm *DriverManager) buildCurrentConfig(storedConfig string) string {
	entries := []struct{ key, value string }{
		{"DRIVER_VERSION", getConfigValueOrDefault(storedConfig, "DRIVER_VERSION", dm.config.driverVersion)},
		{"KERNEL_VERSION", getConfigValueOrDefault(storedConfig, "KERNEL_VERSION", getKernelVersion())},
		{"GPU_DIRECT_RDMA_ENABLED", fmt.Sprint(dm.config.gpuDirectRDMAEnabled)},
		{"USE_HOST_MOFED", fmt.Sprint(dm.config.useHostMofed)},
		{"KERNEL_MODULE_TYPE", getConfigValueOrDefault(storedConfig, "KERNEL_MODULE_TYPE", os.Getenv("KERNEL_MODULE_TYPE"))},
	}

	var sb strings.Builder
	for _, e := range entries {
		sb.WriteString(e.key + "=" + e.value + "\n")
	}

	// Module config files are appended verbatim, in driverConfigFiles order.
	// Files that cannot be read or are empty contribute nothing.
	for _, path := range driverConfigFiles {
		contents, err := os.ReadFile(path)
		if err != nil || len(contents) == 0 {
			continue
		}
		sb.Write(contents)
	}

	return sb.String()
}

// hasDriverConfigChanged reports whether the current driver configuration
// differs from the one recorded in driverConfigStateFile.
//
// The first result is true when the state file is missing, cannot be read, or
// its contents differ from the configuration produced by buildCurrentConfig.
// The second result is a short human-readable reason; it is "" when nothing
// changed.
func (dm *DriverManager) hasDriverConfigChanged() (bool, string) {
	storedData, err := os.ReadFile(driverConfigStateFile)
	switch {
	case errors.Is(err, os.ErrNotExist):
		// A missing state file is reported as changed, without a warning.
		return true, "no previous driver configuration found"
	case err != nil:
		// Any other read failure is also treated as a change, but is logged.
		dm.log.Warnf("Failed to read driver config state file: %v", err)
		return true, "unable to read previous driver configuration"
	}

	storedConfig := string(storedData)
	// Exact string comparison: any byte difference counts as a change.
	if dm.buildCurrentConfig(storedConfig) != storedConfig {
		return true, "driver configuration changed"
	}
	return false, ""
}

// shouldSkipUninstall decides whether the driver uninstall can be skipped.
//
// It returns true, with a reason, only when force reinstall is disabled, the
// driver is currently loaded, and hasDriverConfigChanged reports no change.
// In every other case it returns false; the reason string is non-empty only
// when the configuration changed.
func (dm *DriverManager) shouldSkipUninstall() (bool, string) {
	// The checks run in this order; the first one that applies ends the decision.
	switch {
	case dm.config.forceReinstall:
		dm.log.Info("Force reinstall is enabled, proceeding with driver uninstall")
		return false, ""
	case !dm.isDriverLoaded():
		dm.log.Info("Driver not currently loaded, proceeding with installation")
		return false, ""
	}

	// Compare the desired configuration (including version) with the stored state.
	changed, why := dm.hasDriverConfigChanged()
	if changed {
		dm.log.Infof("Driver configuration has changed: %s", why)
		return false, why
	}

	dm.log.Info("Installed driver version and configuration match desired state, skipping uninstall")
	return true, "desired version and configuration already present"
}

func (dm *DriverManager) isNouveauLoaded() bool {
_, err := os.Stat("/sys/module/nouveau/refcnt")
return err == nil
Expand Down