Skip to content

Commit 883d613

Browse files
committed
Ensure that the fabric-imex-mgmt nvcap is created and injected always
Signed-off-by: Kevin Klues <[email protected]>
1 parent bee82ee commit 883d613

File tree

3 files changed

+106
-5
lines changed

3 files changed

+106
-5
lines changed

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ func (s *ComputeDomainDaemonSettings) GetDomain() string {
152152
return s.domain
153153
}
154154

155-
func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits() *cdiapi.ContainerEdits {
155+
func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(devRoot string, info *nvcapDeviceInfo) *cdiapi.ContainerEdits {
156+
156157
return &cdiapi.ContainerEdits{
157158
ContainerEdits: &cdispec.ContainerEdits{
158159
Mounts: []*cdispec.Mount{
@@ -162,6 +163,12 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits() *cdiapi.ContainerEd
162163
Options: []string{"rw", "nosuid", "nodev", "bind"},
163164
},
164165
},
166+
DeviceNodes: []*cdispec.DeviceNode{
167+
{
168+
Path: info.path,
169+
HostPath: filepath.Join(devRoot, info.path),
170+
},
171+
},
165172
},
166173
}
167174
}

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,17 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
371371
return nil, fmt.Errorf("only expected 1 device for requests '%v' in claim '%v'", requests, claim.UID)
372372
}
373373

374+
// Parse the device node info for the fabric-imex-mgmt nvcap.
375+
nvcapDeviceInfo, err := s.nvdevlib.parseNVCapDeviceInfo(nvidiaCapFabricImexMgmtPath)
376+
if err != nil {
377+
return nil, fmt.Errorf("error parsing nvcap device info for fabic-imex-mgmt: %w", err)
378+
}
379+
380+
// Create the device node for the fabric-imex-mgmt nvcap.
381+
if err := s.nvdevlib.createNvCapDevice(nvidiaCapFabricImexMgmtPath); err != nil {
382+
return nil, fmt.Errorf("error creating nvcap device for fabic-imex-mgmt: %w", err)
383+
}
384+
374385
// Declare a device group state object to populate.
375386
var configState DeviceConfigState
376387

@@ -385,7 +396,7 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
385396
// Store information about the ComputeDomain daemon in the configState.
386397
configState.Type = ComputeDomainDaemonType
387398
configState.ComputeDomain = config.DomainID
388-
configState.containerEdits = computeDomainDaemonSettings.GetCDIContainerEdits()
399+
configState.containerEdits = computeDomainDaemonSettings.GetCDIContainerEdits(s.cdi.devRoot, nvcapDeviceInfo)
389400

390401
return &configState, nil
391402
}

cmd/compute-domain-kubelet-plugin/nvlib.go

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ import (
3232

3333
const (
	// procDevicesPath is the file scanned by getDeviceMajor to look up the
	// major number registered for a named device.
	procDevicesPath = "/proc/devices"

	// Device names as they are matched against the entries read from
	// procDevicesPath.
	nvidiaCapsDeviceName             = "nvidia-caps"
	nvidiaCapsImexChannelsDeviceName = "nvidia-caps-imex-channels"

	// nvidiaCapFabricImexMgmtPath is the nvcap file that is parsed to obtain
	// the device node info for the fabric-imex-mgmt capability.
	nvidiaCapFabricImexMgmtPath = "/proc/driver/nvidia/capabilities/fabric-imex-mgmt"
)
3739

3840
type deviceLib struct {
@@ -43,6 +45,14 @@ type deviceLib struct {
4345
nvidiaSMIPath string
4446
}
4547

48+
// nvcapDeviceInfo holds the properties of an nvcap device node as gathered by
// parseNVCapDeviceInfo.
type nvcapDeviceInfo struct {
	// major is the device major looked up for the nvidia-caps device;
	// minor is the value of the DeviceFileMinor entry of the nvcap file.
	major, minor int
	// mode is the value of the DeviceFileMode entry of the nvcap file.
	mode int
	// modify is the value of the DeviceFileModify entry of the nvcap file.
	modify int
	// path is the device node path, /dev/nvidia-caps/nvidia-cap<minor>.
	path string
}
55+
4656
func newDeviceLib(driverRoot root) (*deviceLib, error) {
4757
driverLibraryPath, err := driverRoot.getDriverLibraryPath()
4858
if err != nil {
@@ -128,7 +138,7 @@ func (l deviceLib) getImexChannelCount() (int, error) {
128138
return 2048, nil
129139
}
130140

131-
func (l deviceLib) getImexChannelMajor() (int, error) {
141+
func (l deviceLib) getDeviceMajor(name string) (int, error) {
132142
file, err := os.Open(procDevicesPath)
133143
if err != nil {
134144
return -1, err
@@ -163,7 +173,7 @@ func (l deviceLib) getImexChannelMajor() (int, error) {
163173
// If we've passed the character devices section, check for the requested device name
164174
if foundCharDevices {
165175
parts := strings.Fields(line)
166-
if len(parts) == 2 && parts[1] == nvidiaCapsImexChannelsDeviceName {
176+
if len(parts) == 2 && parts[1] == name {
167177
return strconv.Atoi(parts[0])
168178
}
169179
}
@@ -172,14 +182,57 @@ func (l deviceLib) getImexChannelMajor() (int, error) {
172182
return -1, scanner.Err()
173183
}
174184

185+
func (l deviceLib) parseNVCapDeviceInfo(nvcapsFilePath string) (*nvcapDeviceInfo, error) {
186+
file, err := os.Open(nvcapsFilePath)
187+
if err != nil {
188+
return nil, err
189+
}
190+
defer file.Close()
191+
192+
info := &nvcapDeviceInfo{}
193+
194+
major, err := l.getDeviceMajor(nvidiaCapsDeviceName)
195+
if err != nil {
196+
return nil, fmt.Errorf("error getting device major: %w", err)
197+
}
198+
info.major = major
199+
200+
scanner := bufio.NewScanner(file)
201+
for scanner.Scan() {
202+
line := scanner.Text()
203+
parts := strings.SplitN(line, ":", 2)
204+
if len(parts) != 2 {
205+
continue
206+
}
207+
key := strings.TrimSpace(parts[0])
208+
value := strings.TrimSpace(parts[1])
209+
210+
switch key {
211+
case "DeviceFileMinor":
212+
_, _ = fmt.Sscanf(value, "%d", &info.minor)
213+
case "DeviceFileMode":
214+
_, _ = fmt.Sscanf(value, "%d", &info.mode)
215+
case "DeviceFileModify":
216+
_, _ = fmt.Sscanf(value, "%d", &info.modify)
217+
}
218+
}
219+
info.path = fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%d", info.minor)
220+
221+
if err := scanner.Err(); err != nil {
222+
return nil, err
223+
}
224+
225+
return info, nil
226+
}
227+
175228
func (l deviceLib) createComputeDomainChannelDevice(channel int) error {
176229
// Construct the properties of the device node to create.
177230
path := fmt.Sprintf("/dev/nvidia-caps-imex-channels/channel%d", channel)
178231
path = filepath.Join(l.devRoot, path)
179232
mode := uint32(unix.S_IFCHR | 0666)
180233

181234
// Get the IMEX channel major and build a /dev device from it
182-
major, err := l.getImexChannelMajor()
235+
major, err := l.getDeviceMajor(nvidiaCapsImexChannelsDeviceName)
183236
if err != nil {
184237
return fmt.Errorf("error getting IMEX channel major: %w", err)
185238
}
@@ -202,3 +255,33 @@ func (l deviceLib) createComputeDomainChannelDevice(channel int) error {
202255

203256
return nil
204257
}
258+
259+
func (l deviceLib) createNvCapDevice(nvcapFilePath string) error {
260+
// Get the nvcapDeviceInfo for the nvcap file.
261+
deviceInfo, err := l.parseNVCapDeviceInfo(nvcapFilePath)
262+
if err != nil {
263+
return fmt.Errorf("error parsing nvcap file for fabric-imex-mgmt: %w", err)
264+
}
265+
266+
// Construct the necessary information to create the device node
267+
path := filepath.Join(l.devRoot, deviceInfo.path)
268+
mode := unix.S_IFCHR | uint32(deviceInfo.mode)
269+
dev := unix.Mkdev(uint32(deviceInfo.major), uint32(deviceInfo.minor))
270+
271+
// Recursively create any parent directories of the device.
272+
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
273+
return fmt.Errorf("error creating directory for nvcaps device nodes: %w", err)
274+
}
275+
276+
// Remove the device if it already exists.
277+
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
278+
return fmt.Errorf("error removing existing nvcap device node: %w", err)
279+
}
280+
281+
// Create the device node using syscall.Mknod
282+
if err := unix.Mknod(path, mode, int(dev)); err != nil {
283+
return fmt.Errorf("mknod of nvcap device failed: %w", err)
284+
}
285+
286+
return nil
287+
}

0 commit comments

Comments
 (0)