Skip to content

Commit 0f2aa79

Browse files
committed
Conditionally mount the host's /dev over the container's /dev
Currently, in the compute domain driver, we unconditionally mount the host's /dev over the container's /dev regardless of what the devRoot of the GPU driver is. This change was introduced in NVIDIA#307. Unfortunately, this causes some problems on certain systems as described in NVIDIA#477. This patch makes this mount contingent on devRoot == "/" (which is the only case where we actually ever wanted / needed to have this mount in the first place). Signed-off-by: Kevin Klues <[email protected]>
1 parent fe736f1 commit 0f2aa79

File tree

2 files changed

+28
-1
lines changed

2 files changed

+28
-1
lines changed

cmd/compute-domain-kubelet-plugin/nvlib.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ const (
4343
nvidiaCapsDeviceName = "nvidia-caps"
4444
nvidiaCapsImexChannelsDeviceName = "nvidia-caps-imex-channels"
4545
nvidiaCapFabricImexMgmtPath = "/proc/driver/nvidia/capabilities/fabric-imex-mgmt"
46+
hostDevPath = "/host/dev"
4647
)
4748

4849
type deviceLib struct {
@@ -89,6 +90,10 @@ func newDeviceLib(driverRoot root) (*deviceLib, error) {
8990
return nil, fmt.Errorf("error recursively unmounting %s: %w", procDriverNvidiaPath, err)
9091
}
9192

93+
if err := d.conditionallyBindMountHostDev(); err != nil {
94+
return nil, fmt.Errorf("error conditionally bind mounting host dev: %w", err)
95+
}
96+
9297
return &d, nil
9398
}
9499

@@ -418,3 +423,25 @@ func (l deviceLib) unmountRecursively(root string) error {
418423

419424
return helper(root)
420425
}
426+
427+
// conditionallyBindMountHostDev bind mounts hostDevPath over /dev when devRoot is "/".
428+
func (l deviceLib) conditionallyBindMountHostDev() error {
429+
// If devRoot != "/" then we don't need to do the mount
430+
if l.devRoot != "/" {
431+
return nil
432+
}
433+
434+
// Get a reference to the mount executable.
435+
mountExecutable, err := exec.LookPath("mount")
436+
if err != nil {
437+
return fmt.Errorf("error looking up mount executable: %w", err)
438+
}
439+
mounter := mount.New(mountExecutable)
440+
441+
// Bind mount hostDevPath over /dev
442+
if err := mounter.Mount(hostDevPath, "/dev", "", []string{"bind"}); err != nil {
443+
return fmt.Errorf("failed to bind mount %s over /dev: %w", hostDevPath, err)
444+
}
445+
446+
return nil
447+
}

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ spec:
141141
# TODO: make this more surgical, see discussion in
142142
# https://github.com/NVIDIA/k8s-dra-driver-gpu/pull/307.
143143
- name: host-dev
144-
mountPath: /dev
144+
mountPath: /host/dev
145145
{{- end }}
146146
{{- if .Values.resources.gpus.enabled }}
147147
- name: gpus

0 commit comments

Comments
 (0)