Skip to content

Commit c961d5c

Browse files
committed
Change semantics to force IMEX shutdown on ComputeDomain delete
Previously we would block deletion of the ComputeDomaon until all workloads making use of it completed (though this was actually buggy). The new semantics don't do this anymore -- instead deletion of a ComputeDomain now actively removes the node labels referencing that ComputeDomain, forcing any IMEX daemons running on those nodes to shutdown. A cleanup handler is used to remove any stray labels in case a workload happens to slip in a label after the ComputeDomain has been deleted (which is possible if the ComputeDomain gets deleted while the workload is still coming online). Signed-off-by: Kevin Klues <[email protected]>
1 parent 9732ac3 commit c961d5c

File tree

1 file changed

+24
-5
lines changed

1 file changed

+24
-5
lines changed

cmd/compute-domain-controller/computedomain.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"sync"
2323
"time"
2424

25+
corev1 "k8s.io/api/core/v1"
2526
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
"k8s.io/client-go/tools/cache"
2728
"k8s.io/klog/v2"
@@ -57,6 +58,7 @@ type ComputeDomainManager struct {
5758

5859
daemonSetManager *DaemonSetManager
5960
resourceClaimTemplateManager *WorkloadResourceClaimTemplateManager
61+
cleanupManager *CleanupManager[*corev1.Node]
6062
}
6163

6264
// NewComputeDomainManager creates a new ComputeDomainManager.
@@ -71,6 +73,7 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
7173
}
7274
m.daemonSetManager = NewDaemonSetManager(config, m.Get)
7375
m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get)
76+
m.cleanupManager = NewCleanupManager[*corev1.Node](informer, m.Get, m.cleanup)
7477

7578
return m
7679
}
@@ -125,6 +128,10 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
125128
return fmt.Errorf("error creating ResourceClaim manager: %w", err)
126129
}
127130

131+
if err := m.cleanupManager.Start(ctx); err != nil {
132+
return fmt.Errorf("error starting cleanup manager: %w", err)
133+
}
134+
128135
return nil
129136
}
130137

@@ -196,7 +203,7 @@ func (m *ComputeDomainManager) RemoveFinalizer(ctx context.Context, uid string)
196203
// TODO: We should probably also check to ensure that all ResourceClaims
197204
// generated from our ResourceClaimTemplate for workloads are gone. Doing
198205
// something is better than nothing for now though.
199-
func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdUID string) error {
206+
func (m *ComputeDomainManager) RemoveNodeLabels(ctx context.Context, cdUID string) error {
200207
labelSelector := &metav1.LabelSelector{
201208
MatchExpressions: []metav1.LabelSelectorRequirement{
202209
{
@@ -214,8 +221,13 @@ func (m *ComputeDomainManager) AssertWorkloadsCompleted(ctx context.Context, cdU
214221
return fmt.Errorf("error retrieving nodes: %w", err)
215222
}
216223

217-
if len(nodes.Items) != 0 {
218-
return fmt.Errorf("nodes exist with label for ComputeDomain %s", cdUID)
224+
for _, node := range nodes.Items {
225+
newNode := node.DeepCopy()
226+
delete(newNode.Labels, computeDomainLabelKey)
227+
228+
if _, err := m.config.clientsets.Core.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}); err != nil {
229+
return fmt.Errorf("error updating node %s: %w", node.Name, err)
230+
}
219231
}
220232

221233
return nil
@@ -254,8 +266,8 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
254266
return fmt.Errorf("error deleting DaemonSet: %w", err)
255267
}
256268

257-
if err := m.AssertWorkloadsCompleted(ctx, string(cd.UID)); err != nil {
258-
return fmt.Errorf("error asserting workloads completed: %w", err)
269+
if err := m.RemoveNodeLabels(ctx, string(cd.UID)); err != nil {
270+
return fmt.Errorf("error removing ComputeDomain node labels: %w", err)
259271
}
260272

261273
if err := m.resourceClaimTemplateManager.RemoveFinalizer(ctx, string(cd.UID)); err != nil {
@@ -295,3 +307,10 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
295307

296308
return nil
297309
}
310+
311+
func (m *ComputeDomainManager) cleanup(ctx context.Context, cdUID string) error {
312+
if err := m.RemoveNodeLabels(ctx, cdUID); err != nil {
313+
return fmt.Errorf("error removing ComputeDomain node labels: %w", err)
314+
}
315+
return nil
316+
}

0 commit comments

Comments
 (0)