1919package main
2020
2121import (
22+ "bytes"
2223 "context"
2324 "errors"
2425 "fmt"
@@ -40,14 +41,25 @@ import (
4041)
4142
const (
	// driverRoot is the path under /run/nvidia where the driver root lives.
	driverRoot = "/run/nvidia/driver"
	// driverPIDFile is the PID file left by the driver container; a stale
	// copy is removed before operands are rescheduled on the fast path.
	driverPIDFile = "/run/nvidia/nvidia-driver.pid"
	// driverConfigStateFile holds the previously recorded driver
	// configuration that hasDriverConfigChanged compares against.
	driverConfigStateFile = "/run/nvidia/driver-config.state"
	operatorNamespace     = "gpu-operator"
	pausedStr             = "paused-for-driver-upgrade"
	// defaultDrainTimeout is a zero duration.
	defaultDrainTimeout = 0 * time.Second
	defaultGracePeriod  = 5 * time.Minute

	// nvidiaDomainPrefix is the prefix used to build the nvidia.com/... label keys.
	nvidiaDomainPrefix = "nvidia.com"
)
54+
55+ var (
56+ // Driver module config files
57+ driverConfigFiles = []string {
58+ "/drivers/nvidia.conf" ,
59+ "/drivers/nvidia-uvm.conf" ,
60+ "/drivers/nvidia-modeset.conf" ,
61+ "/drivers/nvidia-peermem.conf" ,
62+ }
5163
5264 nvidiaDriverDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.driver"
5365 nvidiaOperatorValidatorDeployLabel = nvidiaDomainPrefix + "/" + "gpu.deploy.operator-validator"
@@ -305,7 +317,24 @@ func (dm *DriverManager) uninstallDriver() error {
305317 }
306318
307319 if skip , reason := dm .shouldSkipUninstall (); skip {
308- dm .log .Infof ("Skipping driver uninstall: %s" , reason )
320+ dm .log .Infof ("Fast path activated: %s" , reason )
321+
322+ // Clean up stale artifacts from previous container before rescheduling operands
323+ dm .log .Info ("Cleaning up stale mounts and state files..." )
324+
325+ // Unmount stale rootfs from previous container
326+ if err := dm .unmountRootfs (); err != nil {
327+ return fmt .Errorf ("failed to unmount stale rootfs: %w" , err )
328+ }
329+
330+ // Remove stale PID file from previous container
331+ if _ , err := os .Stat (driverPIDFile ); err == nil {
332+ if err := os .Remove (driverPIDFile ); err != nil {
333+ dm .log .Warnf ("Failed to remove PID file: %v" , err )
334+ }
335+ }
336+
337+ // Now safe to reschedule operands
309338 if err := dm .rescheduleGPUOperatorComponents (); err != nil {
310339 dm .log .Warnf ("Failed to reschedule GPU operator components: %v" , err )
311340 }
@@ -653,68 +682,111 @@ func (dm *DriverManager) isDriverLoaded() bool {
653682 return err == nil
654683}
655684
656- func ( dm * DriverManager ) shouldSkipUninstall () ( bool , string ) {
657- if dm . config . forceReinstall {
658- dm . log . Info ( "Force reinstall is enabled, proceeding with driver uninstall" )
659- return false , ""
// getConfigValueOrDefault resolves a single setting of the driver
// configuration.
//
// Despite its name, defaultVal takes precedence: when it is non-empty it is
// returned unchanged and config is not consulted. Only when defaultVal is
// empty is config scanned. config is treated as newline-separated KEY=VALUE
// entries, and the value of the first line that starts with "key=" is
// returned.
//
// It returns "" when defaultVal is empty and no line in config carries key.
func getConfigValueOrDefault(config, key, defaultVal string) string {
	if defaultVal != "" {
		return defaultVal
	}
	// Match on "key=" rather than key so that one key being a prefix of
	// another (e.g. KERNEL vs KERNEL_VERSION) cannot produce a false hit.
	prefix := key + "="
	for _, line := range strings.Split(config, "\n") {
		if strings.HasPrefix(line, prefix) {
			return strings.TrimPrefix(line, prefix)
		}
	}
	return ""
}
661697
662- if ! dm .isDriverLoaded () {
663- return false , ""
698+ // getKernelVersion returns the current kernel version
699+ func getKernelVersion () string {
700+ var utsname unix.Utsname
701+ if err := unix .Uname (& utsname ); err != nil {
702+ return ""
664703 }
704+ return string (utsname .Release [:bytes .IndexByte (utsname .Release [:], 0 )])
705+ }
665706
666- if dm .config .driverVersion == "" {
667- return false , "Driver version environment variable is not set"
707+ // buildCurrentConfig constructs the current driver configuration string
708+ func (dm * DriverManager ) buildCurrentConfig (storedConfig string ) string {
709+ driverVersion := getConfigValueOrDefault (storedConfig , "DRIVER_VERSION" , dm .config .driverVersion )
710+ kernelVersion := getConfigValueOrDefault (storedConfig , "KERNEL_VERSION" , getKernelVersion ())
711+ kernelModuleType := getConfigValueOrDefault (storedConfig , "KERNEL_MODULE_TYPE" , os .Getenv ("KERNEL_MODULE_TYPE" ))
712+ driverType := getConfigValueOrDefault (storedConfig , "DRIVER_TYPE" , os .Getenv ("DRIVER_TYPE" ))
713+
714+ // Read module parameters from conf files
715+ nvidiaParams := readModuleParams ("/drivers/nvidia.conf" )
716+ nvidiaUvmParams := readModuleParams ("/drivers/nvidia-uvm.conf" )
717+ nvidiaModeset := readModuleParams ("/drivers/nvidia-modeset.conf" )
718+ nvidiaPeermem := readModuleParams ("/drivers/nvidia-peermem.conf" )
719+
720+ var config strings.Builder
721+ config .WriteString (fmt .Sprintf ("DRIVER_VERSION=%s\n " , driverVersion ))
722+ config .WriteString (fmt .Sprintf ("DRIVER_TYPE=%s\n " , driverType ))
723+ config .WriteString (fmt .Sprintf ("KERNEL_VERSION=%s\n " , kernelVersion ))
724+ config .WriteString (fmt .Sprintf ("GPU_DIRECT_RDMA_ENABLED=%v\n " , dm .config .gpuDirectRDMAEnabled ))
725+ config .WriteString (fmt .Sprintf ("USE_HOST_MOFED=%v\n " , dm .config .useHostMofed ))
726+ config .WriteString (fmt .Sprintf ("KERNEL_MODULE_TYPE=%s\n " , kernelModuleType ))
727+ config .WriteString (fmt .Sprintf ("NVIDIA_MODULE_PARAMS=%s\n " , nvidiaParams ))
728+ config .WriteString (fmt .Sprintf ("NVIDIA_UVM_MODULE_PARAMS=%s\n " , nvidiaUvmParams ))
729+ config .WriteString (fmt .Sprintf ("NVIDIA_MODESET_MODULE_PARAMS=%s\n " , nvidiaModeset ))
730+ config .WriteString (fmt .Sprintf ("NVIDIA_PEERMEM_MODULE_PARAMS=%s\n " , nvidiaPeermem ))
731+
732+ // Append config file contents directly
733+ for _ , file := range driverConfigFiles {
734+ if data , err := os .ReadFile (file ); err == nil && len (data ) > 0 {
735+ config .Write (data )
736+ }
668737 }
669738
670- version , err := dm .detectCurrentDriverVersion ()
739+ return config .String ()
740+ }
741+
// readModuleParams reads the module parameter config file at path and returns
// its contents on a single line: leading and trailing whitespace is trimmed
// and every remaining newline is replaced by one space.
//
// It returns "" if the file cannot be read.
func readModuleParams(path string) string {
	data, err := os.ReadFile(path)
	if err != nil {
		return ""
	}
	// Convert newlines to spaces to match bash implementation
	return strings.ReplaceAll(strings.TrimSpace(string(data)), "\n", " ")
}
677751
678- if version != dm .config .driverVersion {
679- dm .log .Infof ("Installed driver version %s does not match desired %s, proceeding with uninstall" , version , dm .config .driverVersion )
680- return false , ""
752+ // hasDriverConfigChanged checks if the current driver configuration differs from stored state
753+ func (dm * DriverManager ) hasDriverConfigChanged () (bool , string ) {
754+ storedData , err := os .ReadFile (driverConfigStateFile )
755+ if err != nil {
756+ if os .IsNotExist (err ) {
757+ return true , "no previous driver configuration found"
758+ }
759+ dm .log .Warnf ("Failed to read driver config state file: %v" , err )
760+ return true , "unable to read previous driver configuration"
681761 }
682762
683- dm .log .Infof ("Installed driver version %s matches desired version, skipping uninstall" , version )
684- return true , "desired version already present"
685- }
763+ storedConfig := string (storedData )
764+ currentConfig := dm .buildCurrentConfig (storedConfig )
686765
687- func (dm * DriverManager ) detectCurrentDriverVersion () (string , error ) {
688- baseCtx := dm .ctx
689- if baseCtx == nil {
690- baseCtx = context .Background ()
766+ if currentConfig == storedConfig {
767+ return false , ""
691768 }
692769
693- ctx , cancel := context . WithTimeout ( baseCtx , 10 * time . Second )
694- defer cancel ()
770+ return true , "driver configuration changed"
771+ }
695772
696- // Try chroot to /run/nvidia/driver for containerized driver
697- cmd := exec .CommandContext (ctx , "chroot" , "/run/nvidia/driver" , "modinfo" , "-F" , "version" , "nvidia" )
698- cmd .Env = append (os .Environ (), "LC_ALL=C" )
699- cmdOutput , chrootErr := cmd .Output ()
700- if chrootErr == nil {
701- version := strings .TrimSpace (string (cmdOutput ))
702- if version != "" {
703- dm .log .Infof ("Driver version detected via chroot: %s" , version )
704- return version , nil
705- }
773+ func (dm * DriverManager ) shouldSkipUninstall () (bool , string ) {
774+ if dm .config .forceReinstall {
775+ dm .log .Info ("Force reinstall is enabled, proceeding with driver uninstall" )
776+ return false , ""
706777 }
707778
708- // Second try to read from /sys/module/nvidia/version if available
709- if versionData , err := os .ReadFile ("/sys/module/nvidia/version" ); err == nil {
710- version := strings .TrimSpace (string (versionData ))
711- if version != "" {
712- dm .log .Infof ("Driver version detected from /sys/module/nvidia/version: %s" , version )
713- return version , nil
779+ // Only skip uninstall if driver IS loaded AND config matches (fast path optimization)
780+ if dm .isDriverLoaded () {
781+ if configChanged , _ := dm .hasDriverConfigChanged (); ! configChanged {
782+ dm .log .Info ("Driver is loaded with matching config, enabling fast path" )
783+ return true , "desired version and configuration already present"
714784 }
715785 }
716786
717- return "" , fmt .Errorf ("all version detection methods failed: chroot: %v" , chrootErr )
787+ // Driver not loaded or config changed - proceed with cleanup
788+ dm .log .Info ("Proceeding with cleanup operations" )
789+ return false , ""
718790}
719791
720792func (dm * DriverManager ) isNouveauLoaded () bool {
0 commit comments