@@ -113,7 +113,19 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
113113}
114114
115115// Stop stops the compute domain manager.
116+ //
117+ //nolint:contextcheck
116118func (m * ComputeDomainManager ) Stop () error {
119+ // Create a new context for cleanup operations since the original context might be cancelled
120+ cleanupCtx , cleanupCancel := context .WithTimeout (context .Background (), 10 * time .Second )
121+ defer cleanupCancel ()
122+
123+ // Attempt to remove this node from the ComputeDomain status before shutting down
124+ // Don't return error here as we still want to proceed with shutdown
125+ if err := m .removeNodeFromComputeDomain (cleanupCtx ); err != nil {
126+ klog .Errorf ("Failed to remove node from ComputeDomain during shutdown: %v" , err )
127+ }
128+
117129 if m .cancelContext != nil {
118130 m .cancelContext ()
119131 }
@@ -226,6 +238,44 @@ func (m *ComputeDomainManager) GetNodesUpdateChan() chan []*nvapi.ComputeDomainN
226238 return m .updatedNodesChan
227239}
228240
241+ // removeNodeFromComputeDomain removes the current node's entry from the ComputeDomain status.
242+ func (m * ComputeDomainManager ) removeNodeFromComputeDomain (ctx context.Context ) error {
243+ objs := m .informer .GetIndexer ().List ()
244+ if len (objs ) == 0 {
245+ klog .Infof ("No ComputeDomain objects found in informer cache during cleanup" )
246+ return nil
247+ }
248+
249+ cd , ok := objs [0 ].(* nvapi.ComputeDomain )
250+ if ! ok {
251+ return fmt .Errorf ("failed to cast object to ComputeDomain" )
252+ }
253+
254+ newCD := cd .DeepCopy ()
255+ var updatedNodes []* nvapi.ComputeDomainNode
256+
257+ for _ , node := range newCD .Status .Nodes {
258+ if node .Name != m .config .nodeName {
259+ updatedNodes = append (updatedNodes , node )
260+ } else {
261+ klog .Infof ("Removing node %s (%s) from ComputeDomain %s/%s during shutdown" , node .Name , node .IPAddress , cd .Namespace , cd .Name )
262+ }
263+ }
264+
265+ if len (updatedNodes ) == len (newCD .Status .Nodes ) {
266+ klog .Infof ("Node %s not found in ComputeDomain %s/%s during cleanup" , m .config .nodeName , cd .Namespace , cd .Name )
267+ return nil
268+ }
269+
270+ newCD .Status .Nodes = updatedNodes
271+ if _ , err := m .config .clientsets .Nvidia .ResourceV1beta1 ().ComputeDomains (newCD .Namespace ).UpdateStatus (ctx , newCD , metav1.UpdateOptions {}); err != nil {
272+ return fmt .Errorf ("error removing node from ComputeDomain status: %w" , err )
273+ }
274+
275+ klog .Infof ("Successfully removed node %s from ComputeDomain %s/%s" , m .config .nodeName , cd .Namespace , cd .Name )
276+ return nil
277+ }
278+
229279func getIPSet (nodeInfos []* nvapi.ComputeDomainNode ) IPSet {
230280 set := make (IPSet )
231281 for _ , n := range nodeInfos {
0 commit comments