Skip to content

Commit 7cbac7b

Browse files
committed
Only inject channel / daemon settings if running on an IMEX-capable node
Signed-off-by: Kevin Klues <[email protected]>
1 parent f0bb1cd commit 7cbac7b

File tree

3 files changed: 36 additions and 33 deletions.

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,6 @@ func (m *ComputeDomainManager) NewSettings(domain string) *ComputeDomainDaemonSe
130130
}
131131

132132
func (m *ComputeDomainManager) GetComputeDomainChannelContainerEdits(devRoot string, info *ComputeDomainChannelInfo) *cdiapi.ContainerEdits {
133-
if m.cliqueID == "" {
134-
return nil
135-
}
136-
137133
channelPath := fmt.Sprintf("/dev/nvidia-caps-imex-channels/channel%d", info.ID)
138134

139135
return &cdiapi.ContainerEdits{

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 33 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,10 @@ func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interfac
329329

330330
func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, config *configapi.ComputeDomainChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
331331
// Declare a device group state object to populate.
332-
var configState DeviceConfigState
332+
configState := DeviceConfigState{
333+
Type: ComputeDomainChannelType,
334+
ComputeDomain: config.DomainID,
335+
}
333336

334337
// Create any necessary ComputeDomain channels and gather their CDI container edits.
335338
for _, r := range results {
@@ -343,12 +346,12 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
343346
if err := s.computeDomainManager.AssertComputeDomainReady(ctx, config.DomainID); err != nil {
344347
return nil, fmt.Errorf("error asserting ComputeDomain Ready: %w", err)
345348
}
346-
if err := s.nvdevlib.createComputeDomainChannelDevice(channel.ID); err != nil {
347-
return nil, fmt.Errorf("error creating ComputeDomain channel device: %w", err)
349+
if s.computeDomainManager.cliqueID != "" {
350+
if err := s.nvdevlib.createComputeDomainChannelDevice(channel.ID); err != nil {
351+
return nil, fmt.Errorf("error creating ComputeDomain channel device: %w", err)
352+
}
353+
configState.containerEdits = configState.containerEdits.Append(s.computeDomainManager.GetComputeDomainChannelContainerEdits(s.cdi.devRoot, channel))
348354
}
349-
configState.Type = ComputeDomainChannelType
350-
configState.ComputeDomain = config.DomainID
351-
configState.containerEdits = configState.containerEdits.Append(s.computeDomainManager.GetComputeDomainChannelContainerEdits(s.cdi.devRoot, channel))
352355
}
353356

354357
return &configState, nil
@@ -371,32 +374,36 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
371374
return nil, fmt.Errorf("only expected 1 device for requests '%v' in claim '%v'", requests, claim.UID)
372375
}
373376

374-
// Parse the device node info for the fabic-imex-mgmt nvcap.
375-
nvcapDeviceInfo, err := s.nvdevlib.parseNVCapDeviceInfo(nvidiaCapFabricImexMgmtPath)
376-
if err != nil {
377-
return nil, fmt.Errorf("error parsing nvcap device info for fabic-imex-mgmt: %w", err)
377+
// Declare a device group state object to populate.
378+
configState := DeviceConfigState{
379+
Type: ComputeDomainDaemonType,
380+
ComputeDomain: config.DomainID,
378381
}
379382

380-
// Create the device node for the fabic-imex-mgmt nvcap.
381-
if err := s.nvdevlib.createNvCapDevice(nvidiaCapFabricImexMgmtPath); err != nil {
382-
return nil, fmt.Errorf("error creating nvcap device for fabic-imex-mgmt: %w", err)
383-
}
383+
// Only prepare files to inject to the daemon if IMEX is supported.
384+
if s.computeDomainManager.cliqueID != "" {
385+
// Parse the device node info for the fabic-imex-mgmt nvcap.
386+
nvcapDeviceInfo, err := s.nvdevlib.parseNVCapDeviceInfo(nvidiaCapFabricImexMgmtPath)
387+
if err != nil {
388+
return nil, fmt.Errorf("error parsing nvcap device info for fabic-imex-mgmt: %w", err)
389+
}
384390

385-
// Declare a device group state object to populate.
386-
var configState DeviceConfigState
391+
// Create the device node for the fabic-imex-mgmt nvcap.
392+
if err := s.nvdevlib.createNvCapDevice(nvidiaCapFabricImexMgmtPath); err != nil {
393+
return nil, fmt.Errorf("error creating nvcap device for fabic-imex-mgmt: %w", err)
394+
}
387395

388-
// Create new ComputeDomain daemon settings from the ComputeDomainManager.
389-
computeDomainDaemonSettings := s.computeDomainManager.NewSettings(config.DomainID)
396+
// Create new ComputeDomain daemon settings from the ComputeDomainManager.
397+
computeDomainDaemonSettings := s.computeDomainManager.NewSettings(config.DomainID)
390398

391-
// Prepare the new ComputeDomain daemon.
392-
if err := computeDomainDaemonSettings.Prepare(ctx); err != nil {
393-
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err)
394-
}
399+
// Prepare the new ComputeDomain daemon.
400+
if err := computeDomainDaemonSettings.Prepare(ctx); err != nil {
401+
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err)
402+
}
395403

396-
// Store information about the ComputeDomain daemon in the configState.
397-
configState.Type = ComputeDomainDaemonType
398-
configState.ComputeDomain = config.DomainID
399-
configState.containerEdits = computeDomainDaemonSettings.GetCDIContainerEdits(s.cdi.devRoot, nvcapDeviceInfo)
404+
// Store information about the ComputeDomain daemon in the configState.
405+
configState.containerEdits = configState.containerEdits.Append(computeDomainDaemonSettings.GetCDIContainerEdits(s.cdi.devRoot, nvcapDeviceInfo))
406+
}
400407

401408
return &configState, nil
402409
}

templates/compute-domain-daemon.tmpl.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ spec:
3333
echo "ClusterUUID and CliqueId are NOT set for GPUs on this node."
3434
echo "The IMEX daemon will not be started."
3535
echo "Sleeping forever..."
36-
touch /etc/nvidia-imex/null
36+
touch /etc/nvidia-imex-null
3737
tail -f /dev/null & wait
3838
fi
3939
/usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg
@@ -47,7 +47,7 @@ spec:
4747
- "sh"
4848
- "-c"
4949
- |-
50-
if [ -f /etc/nvidia-imex/null ]; then
50+
if [ -f /etc/nvidia-imex-null ]; then
5151
exit 0
5252
fi
5353
test "$(nvidia-imex-ctl -q)" = "READY"
@@ -59,7 +59,7 @@ spec:
5959
- "sh"
6060
- "-c"
6161
- |
62-
if [ -f /etc/nvidia-imex/null ]; then
62+
if [ -f /etc/nvidia-imex-null ]; then
6363
exit 0
6464
fi
6565
test "$(nvidia-imex-ctl -q)" = "READY"

0 commit comments

Comments
 (0)