Skip to content

Commit c5636d9

Browse files
committed
CD plugin: always inject CD details via CDI
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

Rename 'domain' to 'domainID'

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

squash: review feedback

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>

shorten comment

Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent b25174e commit c5636d9

File tree

3 files changed

+43
-27
lines changed

3 files changed

+43
-27
lines changed

cmd/compute-domain-daemon/computedomain.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ func (m *ComputeDomainManager) Get(uid string) (*nvapi.ComputeDomain, error) {
186186

187187
// onAddOrUpdate handles the addition or update of a ComputeDomain. Here, we
188188
// receive updates not for all CDs in the system, but only for the CD that we
189-
// are registered for (filtered by CD name).
189+
// are registered for (filtered by CD name). Note that the informer triggers
190+
// this callback once upon startup for all existing objects.
190191
func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error {
191192
// Cast the object to a ComputeDomain object
192193
o, ok := obj.(*nvapi.ComputeDomain)
@@ -212,8 +213,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
212213
return nil
213214
}
214215

215-
// Update node info in ComputeDomain
216-
// Why are we doing this (only) upon receiving another update?
216+
// Update node info in ComputeDomain.
217217
if err := m.UpdateComputeDomainNodeInfo(ctx, cd); err != nil {
218218
return fmt.Errorf("error updating node info in ComputeDomain: %w", err)
219219
}
@@ -228,7 +228,6 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
228228
var nodeInfo *nvapi.ComputeDomainNode
229229

230230
// Create a deep copy of the ComputeDomain to avoid modifying the original
231-
// TODO: review for 10000-node-CD
232231
newCD := cd.DeepCopy()
233232

234233
defer func() {
@@ -262,8 +261,7 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
262261
Name: m.config.nodeName,
263262
CliqueID: m.config.cliqueID,
264263
Index: nextIndex,
265-
// Initialize as NotReady (will be updated by podmanager).
266-
Status: nvapi.ComputeDomainStatusNotReady,
264+
Status: nvapi.ComputeDomainStatusNotReady,
267265
}
268266

269267
klog.Infof("CD status does not contain node name '%s' yet, try to insert myself: %v", m.config.nodeName, nodeInfo)

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ type ComputeDomainManager struct {
6060

6161
type ComputeDomainDaemonSettings struct {
6262
manager *ComputeDomainManager
63-
domain string
63+
domainID string
6464
rootDir string
6565
configTmplPath string
6666
nodesConfigPath string
@@ -128,13 +128,13 @@ func (m *ComputeDomainManager) Stop() error {
128128
return nil
129129
}
130130

131-
func (m *ComputeDomainManager) NewSettings(domain string) *ComputeDomainDaemonSettings {
131+
func (m *ComputeDomainManager) NewSettings(domainID string) *ComputeDomainDaemonSettings {
132132
return &ComputeDomainDaemonSettings{
133133
manager: m,
134-
domain: domain,
135-
rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domain),
136-
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domain, "config.tmpl.cfg"),
137-
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domain, "nodes_config.cfg"),
134+
domainID: domainID,
135+
rootDir: fmt.Sprintf("%s/%s", m.configFilesRoot, domainID),
136+
configTmplPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "config.tmpl.cfg"),
137+
nodesConfigPath: fmt.Sprintf("%s/%s/%s", m.configFilesRoot, domainID, "nodes_config.cfg"),
138138
}
139139
}
140140

@@ -154,27 +154,40 @@ func (m *ComputeDomainManager) GetComputeDomainChannelContainerEdits(devRoot str
154154
}
155155
}
156156

157-
func (s *ComputeDomainDaemonSettings) GetDomain() string {
158-
return s.domain
159-
}
160-
161-
func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(ctx context.Context, devRoot string, info *nvcapDeviceInfo) (*cdiapi.ContainerEdits, error) {
162-
cd, err := s.manager.GetComputeDomain(ctx, s.domain)
157+
// GetComputeDomainDaemonContainerEdits() returns the CDI spec edits always
158+
// required for launching the CD Daemon (whether or not it tries to launch an
159+
// IMEX daemon internally).
160+
func (m *ComputeDomainManager) GetComputeDomainDaemonContainerEdits(ctx context.Context, domainID string) (*cdiapi.ContainerEdits, error) {
161+
cd, err := m.GetComputeDomain(ctx, domainID)
163162
if err != nil {
164-
return nil, fmt.Errorf("error getting compute domain: %w", err)
163+
return nil, fmt.Errorf("error getting compute domain %s: %w", domainID, err)
165164
}
166165
if cd == nil {
167-
return nil, fmt.Errorf("compute domain not found: %s", s.domain)
166+
return nil, fmt.Errorf("compute domain not found: %s", domainID)
168167
}
169168

170169
edits := &cdiapi.ContainerEdits{
171170
ContainerEdits: &cdispec.ContainerEdits{
172171
Env: []string{
173-
fmt.Sprintf("CLIQUE_ID=%s", s.manager.cliqueID),
172+
fmt.Sprintf("CLIQUE_ID=%s", m.cliqueID),
174173
fmt.Sprintf("COMPUTE_DOMAIN_UUID=%s", cd.UID),
175174
fmt.Sprintf("COMPUTE_DOMAIN_NAME=%s", cd.Name),
176175
fmt.Sprintf("COMPUTE_DOMAIN_NAMESPACE=%s", cd.Namespace),
177176
},
177+
},
178+
}
179+
return edits, nil
180+
}
181+
182+
func (s *ComputeDomainDaemonSettings) GetDomainID() string {
183+
return s.domainID
184+
}
185+
186+
// GetCDIContainerEditsForImex() returns the CDI spec edits only required for
187+
// launching the CD Daemon when it actually wraps an IMEX daemon.
188+
func (s *ComputeDomainDaemonSettings) GetCDIContainerEditsForImex(ctx context.Context, devRoot string, info *nvcapDeviceInfo) *cdiapi.ContainerEdits {
189+
edits := &cdiapi.ContainerEdits{
190+
ContainerEdits: &cdispec.ContainerEdits{
178191
Mounts: []*cdispec.Mount{
179192
{
180193
ContainerPath: "/etc/nvidia-imex",
@@ -193,8 +206,7 @@ func (s *ComputeDomainDaemonSettings) GetCDIContainerEdits(ctx context.Context,
193206
},
194207
},
195208
}
196-
197-
return edits, nil
209+
return edits
198210
}
199211

200212
func (s *ComputeDomainDaemonSettings) Prepare(ctx context.Context) error {

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,15 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
491491
ComputeDomain: config.DomainID,
492492
}
493493

494+
// Always inject CD details into the CD daemon. In a heterogeneous CD, the clique
495+
// ID below may be empty (while other CD details are set and consumed by
496+
// the CD daemon).
497+
edits, err := s.computeDomainManager.GetComputeDomainDaemonContainerEdits(ctx, config.DomainID)
498+
if err != nil {
499+
return nil, fmt.Errorf("error preparing ComputeDomain daemon settings: %w", err)
500+
}
501+
configState.containerEdits = configState.containerEdits.Append(edits)
502+
494503
// Only prepare files to inject to the daemon if IMEX is supported.
495504
if s.computeDomainManager.cliqueID != "" {
496505
// Parse the device node info for the fabric-imex-mgmt nvcap.
@@ -508,10 +517,7 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config
508517
}
509518

510519
// Store information about the ComputeDomain daemon in the configState.
511-
edits, err := computeDomainDaemonSettings.GetCDIContainerEdits(ctx, s.cdi.devRoot, nvcapDeviceInfo)
512-
if err != nil {
513-
return nil, fmt.Errorf("error getting container edits for ComputeDomain daemon for requests '%v' in claim '%v': %w", requests, claim.UID, err)
514-
}
520+
edits := computeDomainDaemonSettings.GetCDIContainerEditsForImex(ctx, s.cdi.devRoot, nvcapDeviceInfo)
515521
configState.containerEdits = configState.containerEdits.Append(edits)
516522
}
517523

0 commit comments

Comments
 (0)