Skip to content

Commit 16cea5a

Browse files
committed
Add logic for a compute-domain daemon to remove its IP on shutdown
Signed-off-by: Kevin Klues <[email protected]>
1 parent 7db9d06 commit 16cea5a

File tree

1 file changed

+50
-0
lines changed

1 file changed

+50
-0
lines changed

cmd/compute-domain-daemon/computedomain.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,19 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
113113
}
114114

115115
// Stop stops the compute domain manager.
116+
//
117+
//nolint:contextcheck
116118
func (m *ComputeDomainManager) Stop() error {
119+
// Create a new context for cleanup operations since the original context might be cancelled
120+
cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 10*time.Second)
121+
defer cleanupCancel()
122+
123+
// Attempt to remove this node from the ComputeDomain status before shutting down
124+
// Don't return error here as we still want to proceed with shutdown
125+
if err := m.removeNodeFromComputeDomain(cleanupCtx); err != nil {
126+
klog.Errorf("Failed to remove node from ComputeDomain during shutdown: %v", err)
127+
}
128+
117129
if m.cancelContext != nil {
118130
m.cancelContext()
119131
}
@@ -226,6 +238,44 @@ func (m *ComputeDomainManager) GetNodesUpdateChan() chan []*nvapi.ComputeDomainN
226238
return m.updatedNodesChan
227239
}
228240

241+
// removeNodeFromComputeDomain removes the current node's entry from the ComputeDomain status.
242+
func (m *ComputeDomainManager) removeNodeFromComputeDomain(ctx context.Context) error {
243+
objs := m.informer.GetIndexer().List()
244+
if len(objs) == 0 {
245+
klog.Infof("No ComputeDomain objects found in informer cache during cleanup")
246+
return nil
247+
}
248+
249+
cd, ok := objs[0].(*nvapi.ComputeDomain)
250+
if !ok {
251+
return fmt.Errorf("failed to cast object to ComputeDomain")
252+
}
253+
254+
newCD := cd.DeepCopy()
255+
var updatedNodes []*nvapi.ComputeDomainNode
256+
257+
for _, node := range newCD.Status.Nodes {
258+
if node.Name != m.config.nodeName {
259+
updatedNodes = append(updatedNodes, node)
260+
} else {
261+
klog.Infof("Removing node %s (%s) from ComputeDomain %s/%s during shutdown", node.Name, node.IPAddress, cd.Namespace, cd.Name)
262+
}
263+
}
264+
265+
if len(updatedNodes) == len(newCD.Status.Nodes) {
266+
klog.Infof("Node %s not found in ComputeDomain %s/%s during cleanup", m.config.nodeName, cd.Namespace, cd.Name)
267+
return nil
268+
}
269+
270+
newCD.Status.Nodes = updatedNodes
271+
if _, err := m.config.clientsets.Nvidia.ResourceV1beta1().ComputeDomains(newCD.Namespace).UpdateStatus(ctx, newCD, metav1.UpdateOptions{}); err != nil {
272+
return fmt.Errorf("error removing node from ComputeDomain status: %w", err)
273+
}
274+
275+
klog.Infof("Successfully removed node %s from ComputeDomain %s/%s", m.config.nodeName, cd.Namespace, cd.Name)
276+
return nil
277+
}
278+
229279
func getIPSet(nodeInfos []*nvapi.ComputeDomainNode) IPSet {
230280
set := make(IPSet)
231281
for _, n := range nodeInfos {

0 commit comments

Comments
 (0)