@@ -78,7 +78,6 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
7878 previousNodes : []* nvapi.ComputeDomainNode {},
7979 updatedNodesChan : make (chan []* nvapi.ComputeDomainNode ),
8080 }
81- m .podManager = NewPodManager (config , m .Get )
8281
8382 return m
8483}
@@ -112,6 +111,8 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
112111 true ,
113112 )
114113
114+ m .podManager = NewPodManager (m .config , m .Get , m .mutationCache )
115+
115116 _ , err = m .informer .AddEventHandler (cache.ResourceEventHandlerFuncs {
116117 AddFunc : func (obj any ) {
117118 m .config .workQueue .Enqueue (obj , m .onAddOrUpdate )
@@ -191,8 +192,9 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
191192 return fmt .Errorf ("failed to cast to ComputeDomain" )
192193 }
193194
194- // Get the latest ComputeDomain object from the informer cache since we
195- // plan to update it later and always *must* have the latest version.
195+ // Get the latest ComputeDomain object from the mutation cache (backed by
196+ // the informer cache) since we plan to update it later and always *must*
197+ // have the latest version.
196198 cd , err := m .Get (string (o .GetUID ()))
197199 if err != nil {
198200 return fmt .Errorf ("error getting latest ComputeDomain: %w" , err )
@@ -215,8 +217,9 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
215217 return nil
216218}
217219
218- // UpdateComputeDomainNodeInfo updates the Nodes field in the ComputeDomain
219- // with info about the ComputeDomain daemon running on this node.
220+ // UpdateComputeDomainNodeInfo updates the Nodes field in the ComputeDomain with
221+ // info about the ComputeDomain daemon running on this node. Upon success, it
222+ // reflects the mutation in `m.mutationCache`.
220223func (m * ComputeDomainManager ) UpdateComputeDomainNodeInfo (ctx context.Context , cd * nvapi.ComputeDomain ) (rerr error ) {
221224 var nodeInfo * nvapi.ComputeDomainNode
222225
@@ -268,12 +271,12 @@ func (m *ComputeDomainManager) UpdateComputeDomainNodeInfo(ctx context.Context,
268271 newCD .Status .Status = nvapi .ComputeDomainStatusNotReady
269272 }
270273
271- // Update the status
272- if _ , err := m .config .clientsets .Nvidia .ResourceV1beta1 ().ComputeDomains (newCD .Namespace ).UpdateStatus (ctx , newCD , metav1.UpdateOptions {}); err != nil {
273- return fmt .Errorf ("error updating nodes in ComputeDomain status: %w" , err )
274+ // Update status and (upon success) store the latest version of the object
275+ // (as returned by the API server) in the mutation cache.
276+ newCD , err := m .config .clientsets .Nvidia .ResourceV1beta1 ().ComputeDomains (newCD .Namespace ).UpdateStatus (ctx , newCD , metav1.UpdateOptions {})
277+ if err != nil {
278+ return fmt .Errorf ("error updating ComputeDomain status: %w" , err )
274279 }
275-
276- // Add the updated ComputeDomain to the mutation cache
277280 m .mutationCache .Mutation (newCD )
278281
279282 return nil
@@ -360,15 +363,13 @@ func (m *ComputeDomainManager) GetNodesUpdateChan() chan []*nvapi.ComputeDomainN
360363
361364// removeNodeFromComputeDomain removes the current node's entry from the ComputeDomain status.
362365func (m * ComputeDomainManager ) removeNodeFromComputeDomain (ctx context.Context ) error {
363- objs := m .informer .GetIndexer ().List ()
364- if len (objs ) == 0 {
365- klog .Infof ("No ComputeDomain objects found in informer cache during cleanup" )
366- return nil
366+ cd , err := m .Get (m .config .computeDomainUUID )
367+ if err != nil {
368+ return fmt .Errorf ("error getting ComputeDomain from mutation cache: %w" , err )
367369 }
368-
369- cd , ok := objs [0 ].(* nvapi.ComputeDomain )
370- if ! ok {
371- return fmt .Errorf ("failed to cast object to ComputeDomain" )
370+ if cd == nil {
371+ klog .Infof ("No ComputeDomain object found in mutation cache during cleanup" )
372+ return nil
372373 }
373374
374375 newCD := cd .DeepCopy ()
@@ -391,10 +392,13 @@ func (m *ComputeDomainManager) removeNodeFromComputeDomain(ctx context.Context)
391392 newCD .Status .Status = nvapi .ComputeDomainStatusNotReady
392393 }
393394
395+ // Update status and (upon success) store the locally modified copy in the
396+ // mutation cache. NOTE(review): the object returned by UpdateStatus is discarded here.
394397 newCD .Status .Nodes = updatedNodes
395398 if _ , err := m .config .clientsets .Nvidia .ResourceV1beta1 ().ComputeDomains (newCD .Namespace ).UpdateStatus (ctx , newCD , metav1.UpdateOptions {}); err != nil {
396399 return fmt .Errorf ("error removing node from ComputeDomain status: %w" , err )
397400 }
401+ m .mutationCache .Mutation (newCD )
398402
399403 klog .Infof ("Successfully removed node with IP %s from ComputeDomain %s/%s" , m .config .podIP , newCD .Namespace , newCD .Name )
400404 return nil
0 commit comments