Skip to content

Commit cc06766

Browse files
Evan Lezarelezar
authored and committed
Merge branch 'fix-load-kernel-modules' into 'main'
Split internal system package See merge request nvidia/container-toolkit/container-toolkit!420
1 parent 7c807c2 commit cc06766

File tree

17 files changed

+957
-224
lines changed

17 files changed

+957
-224
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* Fix bug causing incorrect nvidia-smi symlink to be created on WSL2 systems with multiple driver roots.
77
* Fix bug when using driver versions that do not include a patch component in their version number.
88
* Skip additional modifications in CDI mode.
9+
* Fix loading of kernel modules and creation of device nodes in containerized use cases.
910

1011
* [toolkit-container] Allow same envars for all runtime configs
1112

cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@ import (
2828

2929
type allPossible struct {
3030
logger *logrus.Logger
31-
driverRoot string
31+
devRoot string
3232
deviceMajors devices.Devices
3333
migCaps nvcaps.MigCaps
3434
}
3535

3636
// newAllPossible returns a new allPossible device node lister.
3737
// This lister lists all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
38-
func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error) {
38+
func newAllPossible(logger *logrus.Logger, devRoot string) (nodeLister, error) {
3939
deviceMajors, err := devices.GetNVIDIADevices()
4040
if err != nil {
4141
return nil, fmt.Errorf("failed reading device majors: %v", err)
@@ -61,7 +61,7 @@ func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error
6161

6262
l := allPossible{
6363
logger: logger,
64-
driverRoot: driverRoot,
64+
devRoot: devRoot,
6565
deviceMajors: deviceMajors,
6666
migCaps: migCaps,
6767
}
@@ -72,15 +72,15 @@ func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error
7272
// DeviceNodes returns a list of all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
7373
func (m allPossible) DeviceNodes() ([]deviceNode, error) {
7474
gpus, err := nvpci.NewFrom(
75-
filepath.Join(m.driverRoot, nvpci.PCIDevicesRoot),
75+
filepath.Join(m.devRoot, nvpci.PCIDevicesRoot),
7676
).GetGPUs()
7777
if err != nil {
7878
return nil, fmt.Errorf("failed to get GPU information: %v", err)
7979
}
8080

8181
count := len(gpus)
8282
if count == 0 {
83-
m.logger.Infof("No NVIDIA devices found in %s", m.driverRoot)
83+
m.logger.Infof("No NVIDIA devices found in %s", m.devRoot)
8484
return nil, nil
8585
}
8686

@@ -179,7 +179,7 @@ func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor i
179179
major, _ := m.deviceMajors.Get(deviceName)
180180

181181
return deviceNode{
182-
path: filepath.Join(m.driverRoot, path),
182+
path: filepath.Join(m.devRoot, path),
183183
major: uint32(major),
184184
minor: uint32(minor),
185185
}

cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ import (
2424
"strings"
2525
"syscall"
2626

27-
"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
27+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
28+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
2829
"github.com/fsnotify/fsnotify"
2930
"github.com/sirupsen/logrus"
3031
"github.com/urfave/cli/v2"
@@ -216,6 +217,7 @@ type linkCreator struct {
216217
logger *logrus.Logger
217218
lister nodeLister
218219
driverRoot string
220+
devRoot string
219221
devCharPath string
220222
dryRun bool
221223
createAll bool
@@ -243,6 +245,9 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) {
243245
if c.driverRoot == "" {
244246
c.driverRoot = "/"
245247
}
248+
if c.devRoot == "" {
249+
c.devRoot = "/"
250+
}
246251
if c.devCharPath == "" {
247252
c.devCharPath = defaultDevCharPath
248253
}
@@ -252,13 +257,13 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) {
252257
}
253258

254259
if c.createAll {
255-
lister, err := newAllPossible(c.logger, c.driverRoot)
260+
lister, err := newAllPossible(c.logger, c.devRoot)
256261
if err != nil {
257262
return nil, fmt.Errorf("failed to create all possible device lister: %v", err)
258263
}
259264
c.lister = lister
260265
} else {
261-
c.lister = existing{c.logger, c.driverRoot}
266+
c.lister = existing{c.logger, c.devRoot}
262267
}
263268
return c, nil
264269
}
@@ -268,36 +273,48 @@ func (m linkCreator) setup() error {
268273
return nil
269274
}
270275

271-
s, err := system.New(
272-
system.WithLogger(m.logger),
273-
system.WithDryRun(m.dryRun),
274-
)
275-
if err != nil {
276-
return err
277-
}
278-
279276
if m.loadKernelModules {
280-
if err := s.LoadNVIDIAKernelModules(); err != nil {
277+
modules := nvmodules.New(
278+
nvmodules.WithLogger(m.logger),
279+
nvmodules.WithDryRun(m.dryRun),
280+
nvmodules.WithRoot(m.driverRoot),
281+
)
282+
if err := modules.LoadAll(); err != nil {
281283
return fmt.Errorf("failed to load NVIDIA kernel modules: %v", err)
282284
}
283285
}
284286

285287
if m.createDeviceNodes {
286-
if err := s.CreateNVIDIAControlDeviceNodesAt(m.driverRoot); err != nil {
288+
devices, err := nvdevices.New(
289+
nvdevices.WithLogger(m.logger),
290+
nvdevices.WithDryRun(m.dryRun),
291+
nvdevices.WithDevRoot(m.devRoot),
292+
)
293+
if err != nil {
294+
return err
295+
}
296+
if err := devices.CreateNVIDIAControlDevices(); err != nil {
287297
return fmt.Errorf("failed to create NVIDIA device nodes: %v", err)
288298
}
289299
}
290-
291300
return nil
292301
}
293302

294303
// WithDriverRoot sets the driver root path.
304+
// This is the path in which kernel modules must be loaded.
// setup passes this value to nvmodules.WithRoot when kernel module loading is enabled.
// NewSymlinkCreator falls back to "/" when the value is left empty.
295305
func WithDriverRoot(root string) Option {
296306
return func(c *linkCreator) {
297307
c.driverRoot = root
298308
}
299309
}
300310

311+
// WithDevRoot sets the root path for the /dev directory.
// NewSymlinkCreator falls back to "/" when the value is left empty, and hands the
// value to the device node listers; setup passes it to nvdevices.WithDevRoot when
// device node creation is enabled.
312+
func WithDevRoot(root string) Option {
313+
return func(c *linkCreator) {
314+
c.devRoot = root
315+
}
316+
}
317+
301318
// WithDevCharPath sets the path at which the symlinks will be created.
302319
func WithDevCharPath(path string) Option {
303320
return func(c *linkCreator) {

cmd/nvidia-ctk/system/create-dev-char-symlinks/existing.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,16 @@ type nodeLister interface {
3030
}
3131

3232
type existing struct {
33-
logger *logrus.Logger
34-
driverRoot string
33+
logger *logrus.Logger
34+
devRoot string
3535
}
3636

3737
// DeviceNodes returns a list of NVIDIA device nodes in the specified root.
3838
// The nvidia-nvswitch* and nvidia-nvlink devices are excluded.
3939
func (m existing) DeviceNodes() ([]deviceNode, error) {
4040
locator := lookup.NewCharDeviceLocator(
4141
lookup.WithLogger(m.logger),
42-
lookup.WithRoot(m.driverRoot),
42+
lookup.WithRoot(m.devRoot),
4343
lookup.WithOptional(true),
4444
)
4545

@@ -54,7 +54,7 @@ func (m existing) DeviceNodes() ([]deviceNode, error) {
5454
}
5555

5656
if len(devices) == 0 && len(capDevices) == 0 {
57-
m.logger.Infof("No NVIDIA devices found in %s", m.driverRoot)
57+
m.logger.Infof("No NVIDIA devices found in %s", m.devRoot)
5858
return nil, nil
5959
}
6060

cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ package createdevicenodes
1919
import (
2020
"fmt"
2121

22-
"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
22+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
23+
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
2324
"github.com/sirupsen/logrus"
2425
"github.com/urfave/cli/v2"
2526
)
@@ -96,19 +97,29 @@ func (m command) validateFlags(r *cli.Context, opts *options) error {
9697
}
9798

9899
func (m command) run(c *cli.Context, opts *options) error {
99-
s, err := system.New(
100-
system.WithLogger(m.logger),
101-
system.WithDryRun(opts.dryRun),
102-
system.WithLoadKernelModules(opts.loadKernelModules),
103-
)
104-
if err != nil {
105-
return fmt.Errorf("failed to create library: %v", err)
100+
if opts.loadKernelModules {
101+
modules := nvmodules.New(
102+
nvmodules.WithLogger(m.logger),
103+
nvmodules.WithDryRun(opts.dryRun),
104+
nvmodules.WithRoot(opts.driverRoot),
105+
)
106+
if err := modules.LoadAll(); err != nil {
107+
return fmt.Errorf("failed to load NVIDIA kernel modules: %v", err)
108+
}
106109
}
107110

108111
if opts.control {
112+
devices, err := nvdevices.New(
113+
nvdevices.WithLogger(m.logger),
114+
nvdevices.WithDryRun(opts.dryRun),
115+
nvdevices.WithDevRoot(opts.driverRoot),
116+
)
117+
if err != nil {
118+
return err
119+
}
109120
m.logger.Infof("Creating control device nodes at %s", opts.driverRoot)
110-
if err := s.CreateNVIDIAControlDeviceNodesAt(opts.driverRoot); err != nil {
111-
return fmt.Errorf("failed to create control device nodes: %v", err)
121+
if err := devices.CreateNVIDIAControlDevices(); err != nil {
122+
return fmt.Errorf("failed to create NVIDIA control device nodes: %v", err)
112123
}
113124
}
114125
return nil
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package nvdevices
18+
19+
import (
20+
"errors"
21+
"fmt"
22+
"os"
23+
"path/filepath"
24+
"strings"
25+
26+
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
27+
"github.com/sirupsen/logrus"
28+
)
29+
30+
var errInvalidDeviceNode = errors.New("invalid device node")
31+
32+
// Interface provides a set of utilities for interacting with NVIDIA devices on the system.
// It is constructed by New, which fills in defaults for any field left unset by the options.
33+
type Interface struct {
34+
// Devices is embedded; Major uses its Get method to look up device major numbers.
devices.Devices
35+
36+
// logger receives informational messages, e.g. when an existing device node is skipped.
logger *logrus.Logger
37+
38+
// dryRun selects which mknoder implementation New installs.
dryRun bool
39+
// devRoot is the root directory where device nodes are expected to exist.
40+
devRoot string
41+
42+
// mknoder is embedded and supplies the Mknode call used to create device nodes.
mknoder
43+
}
44+
45+
// New constructs a new Interface struct with the specified options.
// Defaults applied after the options have run: the logrus standard logger, a
// devRoot of "/", and the device information returned by devices.GetNVIDIADevices().
// An error is returned only when that device information cannot be obtained.
46+
func New(opts ...Option) (*Interface, error) {
47+
i := &Interface{}
48+
for _, opt := range opts {
49+
opt(i)
50+
}
51+
52+
if i.logger == nil {
53+
i.logger = logrus.StandardLogger()
54+
}
55+
if i.devRoot == "" {
56+
i.devRoot = "/"
57+
}
58+
// Only query the system for device information when no Devices value was injected.
if i.Devices == nil {
59+
devices, err := devices.GetNVIDIADevices()
60+
if err != nil {
61+
return nil, fmt.Errorf("failed to create devices info: %v", err)
62+
}
63+
i.Devices = devices
64+
}
65+
66+
// In dry-run mode mknodLogger (constructed with the logger) is used in place of mknodUnix.
if i.dryRun {
67+
i.mknoder = &mknodLogger{i.logger}
68+
} else {
69+
i.mknoder = &mknodUnix{}
70+
}
71+
return i, nil
72+
}
73+
74+
// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot.
// The nodes nvidiactl, nvidia-modeset, nvidia-uvm and nvidia-uvm-tools are attempted in
// that order; the first failure stops the loop and is returned wrapped with the node name.
75+
func (m *Interface) CreateNVIDIAControlDevices() error {
76+
controlNodes := []string{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"}
77+
for _, node := range controlNodes {
78+
err := m.CreateNVIDIADevice(node)
79+
if err != nil {
80+
return fmt.Errorf("failed to create device node %s: %w", node, err)
81+
}
82+
}
83+
return nil
84+
}
85+
86+
// CreateNVIDIADevice creates the specified NVIDIA device node at the configured devRoot.
// Only the base name of node is considered, and it must start with "nvidia"; otherwise
// an error wrapping errInvalidDeviceNode is returned. The major and minor numbers are
// obtained from Major and Minor, and the node is created at dev/<node> below devRoot.
87+
func (m *Interface) CreateNVIDIADevice(node string) error {
88+
// Drop any directory component so callers may pass either a name or a path.
node = filepath.Base(node)
89+
if !strings.HasPrefix(node, "nvidia") {
90+
return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode)
91+
}
92+
93+
major, err := m.Major(node)
94+
if err != nil {
95+
return fmt.Errorf("failed to determine major: %w", err)
96+
}
97+
98+
minor, err := m.Minor(node)
99+
if err != nil {
100+
return fmt.Errorf("failed to determine minor: %w", err)
101+
}
102+
103+
return m.createDeviceNode(filepath.Join("dev", node), int(major), int(minor))
104+
}
105+
106+
// createDeviceNode creates the specified device node with the required major and minor numbers.
107+
// The configured devRoot is prepended to the path before anything else happens.
// A path that already exists is logged and skipped without error; a stat failure
// other than "does not exist" is returned. Otherwise the node is created via Mknode.
108+
func (m *Interface) createDeviceNode(path string, major int, minor int) error {
109+
path = filepath.Join(m.devRoot, path)
110+
if _, err := os.Stat(path); err == nil {
111+
m.logger.Infof("Skipping: %s already exists", path)
112+
return nil
113+
} else if !os.IsNotExist(err) {
114+
return fmt.Errorf("failed to stat %s: %v", path, err)
115+
}
116+
117+
return m.Mknode(path, major, minor)
118+
}
119+
120+
// Major returns the major number for the specified NVIDIA device node.
121+
// If the device node is not supported, an error is returned.
// nvidia-uvm and nvidia-uvm-tools are looked up under devices.NVIDIAUVM, while
// nvidia-modeset and nvidiactl are looked up under devices.NVIDIAGPU. Any other name,
// or a lookup that reports no entry, yields errInvalidDeviceNode.
122+
func (m *Interface) Major(node string) (int64, error) {
123+
var valid bool
124+
var major devices.Major
125+
switch node {
126+
case "nvidia-uvm", "nvidia-uvm-tools":
127+
major, valid = m.Get(devices.NVIDIAUVM)
128+
case "nvidia-modeset", "nvidiactl":
129+
major, valid = m.Get(devices.NVIDIAGPU)
130+
}
131+
132+
// valid stays false both for unknown node names and for failed lookups.
if valid {
133+
return int64(major), nil
134+
}
135+
136+
return 0, errInvalidDeviceNode
137+
}
138+
139+
// Minor returns the minor number for the specified NVIDIA device node.
140+
// If the device node is not supported, an error is returned.
// Each supported node name maps to the corresponding *Minor value from the devices
// package; any other name yields errInvalidDeviceNode.
141+
func (m *Interface) Minor(node string) (int64, error) {
142+
switch node {
143+
case "nvidia-modeset":
144+
return devices.NVIDIAModesetMinor, nil
145+
case "nvidia-uvm-tools":
146+
return devices.NVIDIAUVMToolsMinor, nil
147+
case "nvidia-uvm":
148+
return devices.NVIDIAUVMMinor, nil
149+
case "nvidiactl":
150+
return devices.NVIDIACTLMinor, nil
151+
}
152+
153+
return 0, errInvalidDeviceNode
154+
}

0 commit comments

Comments
 (0)