Skip to content

Commit a79a9fd

Browse files
authored
Merge pull request #646 from jgehrcke/jp/no-clique-update-cd-node-status
Release workload on a non-MNNVL node in a CD
2 parents 3903df7 + 6e56823 commit a79a9fd

File tree

6 files changed

+107
-52
lines changed

6 files changed

+107
-52
lines changed

cmd/compute-domain-daemon/computedomain.go

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,10 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
184184
return cds[0], nil
185185
}
186186

187-
// onAddOrUpdate handles the addition or update of a ComputeDomain.
187+
// onAddOrUpdate handles the addition or update of a ComputeDomain. Here, we
188+
// receive updates not for all CDs in the system, but only for the CD that we
189+
// are registered for (filtered by CD name). Note that the informer triggers
190+
// this callback once upon startup for all existing objects.
188191
func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error {
189192
// Cast the object to a ComputeDomain object
190193
o, ok := obj.(*nvapi.ComputeDomain)
@@ -203,13 +206,14 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
203206
return nil
204207
}
205208

209+
// Because the informer only filters by name:
206210
// Skip ComputeDomains that don't match on UUID
207211
if string(cd.UID) != m.config.computeDomainUUID {
208-
klog.Errorf("ComputeDomain processed with non-matching UID (%v, %v)", cd.UID, m.config.computeDomainUUID)
212+
klog.Warningf("ComputeDomain processed with non-matching UID (%v, %v)", cd.UID, m.config.computeDomainUUID)
209213
return nil
210214
}
211215

212-
// Update node info in ComputeDomain
216+
// Update node info in ComputeDomain.
213217
if err := m.UpdateComputeDomainNodeInfo(ctx, cd); err != nil {
214218
return fmt.Errorf("error updating node info in ComputeDomain: %w", err)
215219
}
@@ -257,7 +261,10 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
257261
Name: m.config.nodeName,
258262
CliqueID: m.config.cliqueID,
259263
Index: nextIndex,
264+
Status: nvapi.ComputeDomainStatusNotReady,
260265
}
266+
267+
klog.Infof("CD status does not contain node name '%s' yet, try to insert myself: %v", m.config.nodeName, nodeInfo)
261268
newCD.Status.Nodes = append(newCD.Status.Nodes, nodeInfo)
262269
}
263270

@@ -266,7 +273,7 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
266273
// across pod restarts.
267274
nodeInfo.IPAddress = m.config.podIP
268275

269-
// Conditionally update its status
276+
// Conditionally update global CD status if it's still in its initial status
270277
if newCD.Status.Status == "" {
271278
newCD.Status.Status = nvapi.ComputeDomainStatusNotReady
272279
}
@@ -279,6 +286,7 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
279286
}
280287
m.mutationCache.Mutation(newCD)
281288

289+
klog.V(2).Infof("Successfully updated CD")
282290
return nil
283291
}
284292

@@ -318,6 +326,14 @@ func getNextAvailableIndex(currentCliqueID string, nodes []*nvapi.ComputeDomainN
318326
nextIndex++
319327
}
320328

329+
// Skip `maxNodesPerIMEXDomain` check in the special case of no clique ID
330+
// being set: this means that this node does not actually run an IMEX daemon
331+
// managed by us and the set of nodes in this "noop" mode in this CD is
332+
// allowed to grow larger than maxNodesPerIMEXDomain.
333+
if currentCliqueID == "" {
334+
return nextIndex, nil
335+
}
336+
321337
// Ensure nextIndex is within the range 0..maxNodesPerIMEXDomain
322338
if nextIndex < 0 || nextIndex >= maxNodesPerIMEXDomain {
323339
return -1, fmt.Errorf("no available indices within maxNodesPerIMEXDomain (%d) for cliqueID %s", maxNodesPerIMEXDomain, currentCliqueID)
@@ -352,7 +368,7 @@ func (m *ComputeDomainManager) MaybePushNodesUpdate(cd *nvapi.ComputeDomain) {
352368
m.previousNodes = cd.Status.Nodes
353369
m.updatedNodesChan <- cd.Status.Nodes
354370
} else {
355-
klog.Infof("IP set did not change")
371+
klog.V(6).Infof("IP set did not change")
356372
}
357373
}
358374

cmd/compute-domain-daemon/main.go

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,6 @@ func newApp() *cli.App {
185185

186186
// Run invokes the IMEX daemon and manages its lifecycle.
187187
func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
188-
// Support heterogeneous compute domain
189-
if flags.cliqueID == "" {
190-
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
191-
fmt.Println("The IMEX daemon will not be started.")
192-
fmt.Println("Sleeping forever...")
193-
<-ctx.Done()
194-
return nil
195-
}
196188

197189
config := &ControllerConfig{
198190
cliqueID: flags.cliqueID,
@@ -207,7 +199,17 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
207199
}
208200
klog.Infof("config: %v", config)
209201

210-
// Write the IMEX config with the current pod IP before starting the daemon
202+
// Support heterogeneous ComputeDomains. That means that a CD may contain
203+
// nodes that do not take part in Multi-Node NVLink communication. On such
204+
// nodes, this program is started with an empty NVLink clique ID
205+
// configuration parameter. In this mode, do not start the IMEX daemon but
206+
// otherwise keep business logic intact. In particular, continuously update
207+
// this node's state in the CD object.
208+
if flags.cliqueID == "" {
209+
klog.Infof("no cliqueID: register with ComputeDomain, but do not run IMEX daemon")
210+
}
211+
212+
// Render and write the IMEX daemon config with the current pod IP
211213
if err := writeIMEXConfig(flags.podIP); err != nil {
212214
return fmt.Errorf("writeIMEXConfig failed: %w", err)
213215
}
@@ -302,6 +304,11 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
302304
return fmt.Errorf("writeNodesConfig failed: %w", err)
303305
}
304306

307+
if cliqueID == "" {
308+
klog.V(1).Infof("empty cliqueID: do not start IMEX daemon")
309+
break
310+
}
311+
305312
klog.Infof("Got update, (re)start IMEX daemon")
306313
if err := pm.Restart(); err != nil {
307314
// This might be a permanent problem, and retrying upon next update
@@ -331,6 +338,11 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
331338
return fmt.Errorf("failed to update DNS name => IP mappings: %w", err)
332339
}
333340

341+
if dnsNameManager.cliqueID == "" {
342+
klog.V(1).Infof("empty cliqueID: do not start IMEX daemon")
343+
break
344+
}
345+
334346
fresh, err := processManager.EnsureStarted()
335347
if err != nil {
336348
return fmt.Errorf("failed to ensure IMEX daemon is started: %w", err)
@@ -344,7 +356,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
344356
// addresses compared to the old set (then we don't need to force
345357
// the daemon to re-resolve & re-connect).
346358
if !updated || fresh {
347-
continue
359+
break
348360
}
349361

350362
// Actively ask the IMEX daemon to re-read its config and to
@@ -365,7 +377,7 @@ func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controlle
365377
// It returns an error if any step fails.
366378
func check(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
367379
if flags.cliqueID == "" {
368-
fmt.Println("ClusterUUID and CliqueId are NOT set for GPUs on this node.")
380+
fmt.Println("check succeeded (noop, clique ID is empty)")
369381
return nil
370382
}
371383

@@ -405,6 +417,12 @@ func writeIMEXConfig(podIP string) error {
405417
return fmt.Errorf("error executing template: %w", err)
406418
}
407419

420+
// Ensure the directory exists
421+
dir := filepath.Dir(imexConfigPath)
422+
if err := os.MkdirAll(dir, 0755); err != nil {
423+
return fmt.Errorf("failed to create directory %s: %w", dir, err)
424+
}
425+
408426
if err := os.WriteFile(imexConfigPath, configFile.Bytes(), 0644); err != nil {
409427
return fmt.Errorf("error writing config file %v: %w", imexConfigPath, err)
410428
}

cmd/compute-domain-daemon/podmanager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ func (pm *PodManager) updateNodeStatus(ctx context.Context, status string) error
161161
return fmt.Errorf("failed to get ComputeDomain: %w", err)
162162
}
163163
if cd == nil {
164-
return fmt.Errorf("ComputeDomain not found")
164+
return fmt.Errorf("ComputeDomain '%s/%s' not found", pm.config.computeDomainName, pm.config.computeDomainUUID)
165165
}
166166

167167
// Create a deep copy to avoid modifying the original

cmd/compute-domain-daemon/process.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,7 @@ func (m *ProcessManager) Watchdog(ctx context.Context) error {
201201
}
202202
}
203203

204-
// Detect if process terminated unexpectedly. Caller must ignore bool if err
205-
// isn't nil.
204+
// Detect if process terminated unexpectedly.
206205
func (m *ProcessManager) lost() bool {
207206
if !m.TryLock() {
208207
// Start or stop is in progress; do not inspect state.

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ type ComputeDomainManager struct {
6060

6161
type ComputeDomainDaemonSettings struct {
6262
manager *ComputeDomainManager
63-
domain string
63+
domainID string
6464
rootDir string
6565
configTmplPath string
6666
nodesConfigPath string
@@ -128,13 +128,13 @@ func (m *ComputeDomainManager) Stop() error {
128128
return nil
129129
}
130130

131-
func (m *ComputeDomainManager) NewSettings(domain string) *ComputeDomainDaemonSettings {
131+
func (m *ComputeDomainManager) NewSettings(domainID string) *ComputeDomainDaemonSettings {
132132
return &ComputeDomainDaemonSettings{
133133
manager: m,
134-
domain: domain,
135-
rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domain),
136-
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domain, "config.tmpl.cfg"),
137-
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domain, "nodes_config.cfg"),
134+
domainID: domainID,
135+
rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domainID),
136+
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "config.tmpl.cfg"),
137+
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes_config.cfg"),
138138
}
139139
}
140140

@@ -154,17 +154,16 @@ func (m *ComputeDomainManager) GetComputeDomainChannelContainerEdits(devRoot str
154154
}
155155
}
156156

157-
func (s *ComputeDomainDaemonSettings) GetDomain() string {
158-
return s.domain
159-
}
160-
161-
func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(ctx context.Context, devRoot string, info *nvcapDeviceInfo) (*cdiapi.ContainerEdits, error) {
162-
cd, err := s.manager.GetComputeDomain(ctx, s.domain)
157+
// GetCDIContainerEditsCommon() returns the CDI spec edits always required for
158+
// launching the CD Daemon (whether or not it tries to launch an IMEX daemon
159+
// internally).
160+
func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsCommon(ctx context.Context) (*cdiapi.ContainerEdits, error) {
161+
cd, err := s.manager.GetComputeDomain(ctx, s.domainID)
163162
if err != nil {
164-
return nil, fmt.Errorf("error getting compute domain: %w", err)
163+
return nil, fmt.Errorf("error getting compute domain %s: %w", s.domainID, err)
165164
}
166165
if cd == nil {
167-
return nil, fmt.Errorf("compute domain not found: %s", s.domain)
166+
return nil, fmt.Errorf("compute domain not found: %s", s.domainID)
168167
}
169168

170169
edits := &cdiapi.ContainerEdits{
@@ -182,6 +181,20 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(ctx context.Context,
182181
Options: []string{"rw", "nosuid", "nodev", "bind"},
183182
},
184183
},
184+
},
185+
}
186+
return edits, nil
187+
}
188+
189+
func (s *ComputeDomainDaemonSettings) GetDomainID() string {
190+
return s.domainID
191+
}
192+
193+
// GetCDIContainerEditsForImex() returns the CDI spec edits only required for
194+
// launching the CD daemon when it actually wraps an IMEX daemon.
195+
func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsForImex(ctx context.Context, devRoot string, info *nvcapDeviceInfo) *cdiapi.ContainerEdits {
196+
edits := &cdiapi.ContainerEdits{
197+
ContainerEdits: &cdispec.ContainerEdits{
185198
DeviceNodes: []*cdispec.DeviceNode{
186199
{
187200
Path: info.path,
@@ -193,8 +206,7 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(ctx context.Context,
193206
},
194207
},
195208
}
196-
197-
return edits, nil
209+
return edits
198210
}
199211

200212
func (s *ComputeDomainDaemonSettings) Prepare(ctx context.Context) error {

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,8 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi
462462
}
463463

464464
for _, info := range s.nvdevlib.nvCapImexChanDevInfos[:chancount] {
465-
configState.containerEdits = configState.containerEdits.Append(s.computeDomainManager.GetComputeDomainChannelContainerEdits(s.cdi.devRoot, info))
465+
edits := s.computeDomainManager.GetComputeDomainChannelContainerEdits(s.cdi.devRoot, info)
466+
configState.containerEdits = configState.containerEdits.Append(edits)
466467
}
467468

468469
return &configState, nil
@@ -491,27 +492,36 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
491492
ComputeDomain: config.DomainID,
492493
}
493494

494-
// Only prepare files to inject to the daemon if IMEX is supported.
495-
if s.computeDomainManager.cliqueID != "" {
496-
// Parse the device node info for the fabic-imex-mgmt nvcap.
497-
nvcapDeviceInfo, err := s.nvdevlib.parseNVCapDeviceInfo(nvidiaCapFabricImexMgmtPath)
498-
if err != nil {
499-
return nil, fmt.Errorf("error parsing nvcap device info for fabic-imex-mgmt: %w", err)
500-
}
495+
// Create new ComputeDomain daemon settings from the ComputeDomainManager.
496+
computeDomainDaemonSettings := s.computeDomainManager.NewSettings(config.DomainID)
501497

502-
// Create new ComputeDomain daemon settings from the ComputeDomainManager.
503-
computeDomainDaemonSettings := s.computeDomainManager.NewSettings(config.DomainID)
498+
// Prepare injecting IMEX daemon config files even if IMEX is not supported.
499+
// This for example creates
500+
// '/var/lib/kubelet/plugins/compute-domain.nvidia.com/domains/<uid>' on the
501+
// host which is used as mount source mapped to /etc/nvidia-imex in the CD
502+
// daemon container.
503+
if err := computeDomainDaemonSettings.Prepare(ctx); err != nil {
504+
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err)
505+
}
504506

505-
// Prepare the new ComputeDomain daemon.
506-
if err := computeDomainDaemonSettings.Prepare(ctx); err != nil {
507-
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings for requests '%v' in claim '%v': %w", requests, claim.UID, err)
508-
}
507+
// Always inject CD config details into the CD daemon (regardless of clique
508+
// ID being empty or not).
509+
edits, err := computeDomainDaemonSettings.GetCDIContainerEditsCommon(ctx)
510+
if err != nil {
511+
return nil, fmt.Errorf("error getting common container edits for ComputeDomain daemon '%s': %w", config.DomainID, err)
512+
}
513+
configState.containerEdits = configState.containerEdits.Append(edits)
509514

510-
// Store information about the ComputeDomain daemon in the configState.
511-
edits, err := computeDomainDaemonSettings.GetCDIContainerEdits(ctx, s.cdi.devRoot, nvcapDeviceInfo)
515+
// Only inject dev nodes related to
516+
// /proc/driver/nvidia/capabilities/fabric-imex-mgmt if IMEX is supported
517+
// (if we want to start the IMEX daemon process in the CD daemon pod).
518+
if s.computeDomainManager.cliqueID != "" {
519+
// Parse the device node info for the fabric-imex-mgmt nvcap.
520+
nvcapDeviceInfo, err := s.nvdevlib.parseNVCapDeviceInfo(nvidiaCapFabricImexMgmtPath)
512521
if err != nil {
513-
return nil, fmt.Errorf("error getting container edits for ComputeDomain daemon for requests '%v' in claim '%v': %w", requests, claim.UID, err)
522+
return nil, fmt.Errorf("error parsing nvcap device info for fabric-imex-mgmt: %w", err)
514523
}
524+
edits := computeDomainDaemonSettings.GetCDIContainerEditsForImex(ctx, s.cdi.devRoot, nvcapDeviceInfo)
515525
configState.containerEdits = configState.containerEdits.Append(edits)
516526
}
517527

0 commit comments

Comments
 (0)