Skip to content

Commit eae084f

Browse files
check if NVIDIA kernel modules are loaded to avoid modprobe
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent f82e3bd commit eae084f

File tree

1 file changed

+25
-3
lines changed

1 file changed

+25
-3
lines changed

cmd/nvidia-validator/main.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ func main() {
355355
Value: defaultDriverInstallDirCtrPath,
356356
Usage: "the path where the NVIDIA driver install dir is mounted in the container",
357357
Destination: &driverInstallDirCtrPathFlag,
358-
Sources: cli.EnvVars("DISABLE_DEV_CHAR_SYMLINK_CREATION"),
358+
Sources: cli.EnvVars("DRIVER_INSTALL_DIR_CTR_PATH"),
359359
},
360360
}
361361

@@ -843,6 +843,20 @@ func (d *Driver) createStatusFile(driverInfo driverInfo) error {
843843
return createStatusFileWithContent(outputDirFlag+"/"+driverStatusFile, statusFileContent)
844844
}
845845

846+
// areNvidiaModulesLoaded checks if NVIDIA kernel modules are already loaded in kernel memory.
847+
func areNvidiaModulesLoaded() bool {
848+
// Check if the nvidia module is loaded by checking if /sys/module/nvidia/refcnt exists
849+
if _, err := os.Stat("/sys/module/nvidia/refcnt"); err == nil {
850+
refcntData, err := os.ReadFile("/sys/module/nvidia/refcnt")
851+
if err == nil {
852+
refcnt := strings.TrimSpace(string(refcntData))
853+
log.Infof("NVIDIA kernel modules already loaded in kernel memory (refcnt=%s)", refcnt)
854+
return true
855+
}
856+
}
857+
return false
858+
}
859+
846860
// createDevCharSymlinks creates symlinks in /host-dev-char that point to all possible NVIDIA devices nodes.
847861
func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation bool) error {
848862
if disableDevCharSymlinkCreation {
@@ -853,8 +867,16 @@ func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation
853867

854868
log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices")
855869

856-
// Only attempt to load NVIDIA kernel modules when we can chroot into driverRoot
857-
loadKernelModules := driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot)
870+
// Check if NVIDIA modules are already loaded in kernel memory.
871+
// If they are, we don't need to run modprobe (which would fail if modules aren't in /lib/modules/).
872+
// This handles the case where the driver container performed a userspace-only install
873+
// after detecting that modules were already loaded from a previous boot.
874+
modulesAlreadyLoaded := areNvidiaModulesLoaded()
875+
876+
// Only attempt to load NVIDIA kernel modules when:
877+
// 1. Modules are not already loaded in kernel memory, AND
878+
// 2. We can chroot into driverRoot to run modprobe
879+
loadKernelModules := !modulesAlreadyLoaded && (driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot))
858880

859881
// driverRootCtrPath is the path of the driver install dir in the container. This will either be
860882
// driverInstallDirCtrPathFlag or '/host'.

0 commit comments

Comments
 (0)