1919package main
2020
2121import (
22+ "bytes"
2223 "context"
2324 "errors"
2425 "fmt"
@@ -40,14 +41,25 @@ import (
4041)
4142
const (
	driverRoot    = "/run/nvidia/driver"
	driverPIDFile = "/run/nvidia/nvidia-driver.pid"
	// driverConfigStateFile is the file hasDriverConfigChanged reads to obtain
	// the previously recorded driver configuration.
	driverConfigStateFile = "/run/nvidia/driver-config.state"
	operatorNamespace     = "gpu-operator"
	pausedStr             = "paused-for-driver-upgrade"
	// defaultDrainTimeout is a zero duration.
	defaultDrainTimeout = time.Second * 0
	defaultGracePeriod  = 5 * time.Minute

	// nvidiaDomainPrefix is the prefix used to build the nvidia.com/... label keys.
	nvidiaDomainPrefix = "nvidia.com"
)
54+
55+ var (
56+ // Driver module config files
57+ driverConfigFiles = []string {
58+ "/drivers/nvidia.conf" ,
59+ "/drivers/nvidia-uvm.conf" ,
60+ "/drivers/nvidia-modeset.conf" ,
61+ "/drivers/nvidia-peermem.conf" ,
62+ }
5163
5264 nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365 nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -653,68 +665,91 @@ func (dm *DriverManager) isDriverLoaded() bool {
653665 return err == nil
654666}
655667
656- func (dm * DriverManager ) shouldSkipUninstall () (bool , string ) {
657- if dm .config .forceReinstall {
658- dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
659- return false , ""
// getConfigValueOrDefault resolves a single configuration value.
//
// A non-empty defaultVal always wins and is returned unchanged. Only when
// defaultVal is empty is config searched, line by line, for the first
// "key=value" entry; everything after that first "=" is returned verbatim.
// The result is "" when defaultVal is empty and key does not occur in config.
//
// Despite the name, config is therefore the fallback rather than the primary
// source: buildCurrentConfig passes the currently desired value as defaultVal
// and the stored state as config.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal != "" {
		return defaultVal
	}

	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		// Tolerate indentation in front of the key; the value itself is
		// left untouched.
		line = strings.TrimLeft(line, " \t")
		if strings.HasPrefix(line, prefix) {
			return strings.TrimPrefix(line, prefix)
		}
	}
	return ""
}
661680
662- if ! dm .isDriverLoaded () {
663- return false , ""
681+ // getKernelVersion returns the current kernel version
682+ func getKernelVersion () string {
683+ var utsname unix.Utsname
684+ if err := unix .Uname (& utsname ); err != nil {
685+ return ""
664686 }
687+ return string (utsname .Release [:bytes .IndexByte (utsname .Release [:], 0 )])
688+ }
689+
690+ // buildCurrentConfig constructs the current driver configuration string
691+ func (dm * DriverManager ) buildCurrentConfig (storedConfig string ) string {
692+ driverVersion := getConfigValueOrDefault (storedConfig , "DRIVER_VERSION" , dm .config .driverVersion )
693+ kernelVersion := getConfigValueOrDefault (storedConfig , "KERNEL_VERSION" , getKernelVersion ())
694+ kernelModuleType := getConfigValueOrDefault (storedConfig , "KERNEL_MODULE_TYPE" , os .Getenv ("KERNEL_MODULE_TYPE" ))
695+
696+ var config strings.Builder
697+ config .WriteString (fmt .Sprintf ("DRIVER_VERSION=%s\n " , driverVersion ))
698+ config .WriteString (fmt .Sprintf ("KERNEL_VERSION=%s\n " , kernelVersion ))
699+ config .WriteString (fmt .Sprintf ("GPU_DIRECT_RDMA_ENABLED=%v\n " , dm .config .gpuDirectRDMAEnabled ))
700+ config .WriteString (fmt .Sprintf ("USE_HOST_MOFED=%v\n " , dm .config .useHostMofed ))
701+ config .WriteString (fmt .Sprintf ("KERNEL_MODULE_TYPE=%s\n " , kernelModuleType ))
665702
666- if dm .config .driverVersion == "" {
667- return false , "Driver version environment variable is not set"
703+ // Append config file contents directly
704+ for _ , file := range driverConfigFiles {
705+ if data , err := os .ReadFile (file ); err == nil && len (data ) > 0 {
706+ config .Write (data )
707+ }
668708 }
669709
670- version , err := dm .detectCurrentDriverVersion ()
710+ return config .String ()
711+ }
712+
713+ // hasDriverConfigChanged checks if the current driver configuration differs from stored state
714+ func (dm * DriverManager ) hasDriverConfigChanged () (bool , string ) {
715+ storedData , err := os .ReadFile (driverConfigStateFile )
671716 if err != nil {
672- dm .log .Warnf ("Unable to determine installed driver version: %v" , err )
673- // If driver is loaded but we can't detect version, proceed with reinstall to ensure correct version
674- dm .log .Info ("Cannot verify driver version, proceeding with reinstall to ensure correct version is installed" )
675- return false , ""
717+ if os .IsNotExist (err ) {
718+ return true , "no previous driver configuration found"
719+ }
720+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
721+ return true , "unable to read previous driver configuration"
676722 }
677723
678- if version != dm .config .driverVersion {
679- dm .log .Infof ("Installed driver version %s does not match desired %s, proceeding with uninstall" , version , dm .config .driverVersion )
724+ storedConfig := string (storedData )
725+ currentConfig := dm .buildCurrentConfig (storedConfig )
726+
727+ if currentConfig == storedConfig {
680728 return false , ""
681729 }
682730
683- dm .log .Infof ("Installed driver version %s matches desired version, skipping uninstall" , version )
684- return true , "desired version already present"
731+ return true , "driver configuration changed"
685732}
686733
687- func (dm * DriverManager ) detectCurrentDriverVersion () (string , error ) {
688- baseCtx := dm .ctx
689- if baseCtx == nil {
690- baseCtx = context . Background ()
734+ func (dm * DriverManager ) shouldSkipUninstall () (bool , string ) {
735+ if dm .config . forceReinstall {
736+ dm . log . Info ( "Force reinstall is enabled, proceeding with driver uninstall" )
737+ return false , ""
691738 }
692739
693- ctx , cancel := context .WithTimeout (baseCtx , 10 * time .Second )
694- defer cancel ()
695-
696- // Try chroot to /run/nvidia/driver for containerized driver
697- cmd := exec .CommandContext (ctx , "chroot" , "/run/nvidia/driver" , "modinfo" , "-F" , "version" , "nvidia" )
698- cmd .Env = append (os .Environ (), "LC_ALL=C" )
699- cmdOutput , chrootErr := cmd .Output ()
700- if chrootErr == nil {
701- version := strings .TrimSpace (string (cmdOutput ))
702- if version != "" {
703- dm .log .Infof ("Driver version detected via chroot: %s" , version )
704- return version , nil
705- }
740+ if ! dm .isDriverLoaded () {
741+ dm .log .Info ("Driver not currently loaded, proceeding with installation" )
742+ return false , ""
706743 }
707744
708- // Second try to read from /sys/module/nvidia/version if available
709- if versionData , err := os .ReadFile ("/sys/module/nvidia/version" ); err == nil {
710- version := strings .TrimSpace (string (versionData ))
711- if version != "" {
712- dm .log .Infof ("Driver version detected from /sys/module/nvidia/version: %s" , version )
713- return version , nil
714- }
745+ // Check if driver configuration (including version) has changed
746+ if configChanged , reason := dm .hasDriverConfigChanged (); configChanged {
747+ dm .log .Infof ("Driver configuration has changed: %s" , reason )
748+ return false , reason
715749 }
716750
717- return "" , fmt .Errorf ("all version detection methods failed: chroot: %v" , chrootErr )
751+ dm .log .Info ("Installed driver version and configuration match desired state, skipping uninstall" )
752+ return true , "desired version and configuration already present"
718753}
719754
720755func (dm * DriverManager ) isNouveauLoaded () bool {
0 commit comments