Skip to content

Commit d670f87

Browse files
committed
sync changes
1 parent 2758a69 commit d670f87

File tree

6 files changed

+354
-380
lines changed

6 files changed

+354
-380
lines changed

distros/kubernetes/nvsentinel/charts/node-drainer/templates/clusterrole.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ rules:
4747
- list
4848
- patch
4949
- update
50+
- watch
5051
- apiGroups:
5152
- ""
5253
resources:

node-drainer-module/pkg/informers/informers.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ const (
4646
type Informers struct {
4747
podInformer cache.SharedIndexInformer
4848
eventInformer cache.SharedIndexInformer
49+
nodeInformer cache.SharedIndexInformer
4950
clientset kubernetes.Interface
5051
notReadyTimeoutMinutes *int
5152
dryRunMode []string
@@ -86,6 +87,8 @@ func NewInformers(clientset kubernetes.Interface, resyncPeriod time.Duration,
8687
return nil, fmt.Errorf("failed to add event indexer: %w", err)
8788
}
8889

90+
nodeInformer := informerFactory.Core().V1().Nodes().Informer()
91+
8992
dryRunMode := []string{}
9093
if dryRun {
9194
dryRunMode = []string{metav1.DryRunAll}
@@ -95,14 +98,15 @@ func NewInformers(clientset kubernetes.Interface, resyncPeriod time.Duration,
9598
clientset: clientset,
9699
podInformer: podInformer,
97100
eventInformer: eventInformer,
101+
nodeInformer: nodeInformer,
98102
notReadyTimeoutMinutes: notReadyTimeoutMinutes,
99103
dryRunMode: dryRunMode,
100104
namespace: metav1.NamespaceDefault,
101105
}, nil
102106
}
103107

104108
func (i *Informers) HasSynced() bool {
105-
return i.podInformer.HasSynced() && i.eventInformer.HasSynced()
109+
return i.podInformer.HasSynced() && i.eventInformer.HasSynced() && i.nodeInformer.HasSynced()
106110
}
107111

108112
func NodeIndexFunc(obj any) ([]string, error) {
@@ -151,6 +155,7 @@ func NodeEventReasonIndexFunc(obj any) ([]string, error) {
151155
func (i *Informers) Run(ctx context.Context) error {
152156
go i.podInformer.Run(ctx.Done())
153157
go i.eventInformer.Run(ctx.Done())
158+
go i.nodeInformer.Run(ctx.Done())
154159

155160
if ok := cache.WaitForCacheSync(ctx.Done(),
156161
i.HasSynced); !ok {
@@ -382,6 +387,23 @@ func (i *Informers) UpdateNodeEvent(ctx context.Context, nodeName string, reason
382387
}
383388
}
384389

390+
// Get node from informer cache to retrieve its UID for proper event association
391+
nodeObj, exists, err := i.nodeInformer.GetIndexer().GetByKey(nodeName)
392+
if err != nil {
393+
klog.Errorf("Failed to get node %s from cache: %v", nodeName, err)
394+
return fmt.Errorf("error getting node %s from cache: %w", nodeName, err)
395+
}
396+
397+
if !exists {
398+
klog.Errorf("Node %s not found in cache", nodeName)
399+
return fmt.Errorf("node %s not found in cache", nodeName)
400+
}
401+
402+
node, ok := nodeObj.(*v1.Node)
403+
if !ok {
404+
return fmt.Errorf("failed to cast node object for %s", nodeName)
405+
}
406+
385407
newEvent := &v1.Event{
386408
ObjectMeta: metav1.ObjectMeta{
387409
GenerateName: nodeName + "-",
@@ -390,6 +412,7 @@ func (i *Informers) UpdateNodeEvent(ctx context.Context, nodeName string, reason
390412
InvolvedObject: v1.ObjectReference{
391413
Kind: "Node",
392414
Name: nodeName,
415+
UID: node.UID,
393416
APIVersion: "v1",
394417
},
395418
Reason: reason,

node-drainer-module/pkg/queue/types.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,6 @@ type MongoCollectionAPI interface {
3636
Find(ctx context.Context, filter interface{}, opts ...*options.FindOptions) (*mongo.Cursor, error)
3737
}
3838

39-
const (
40-
maxRetries = 5
41-
)
42-
4339
type EventQueueManager interface {
4440
EnqueueEvent(ctx context.Context, nodeName string, event bson.M, collection MongoCollectionAPI) error
4541
Start(ctx context.Context)

node-drainer-module/pkg/queue/worker.go

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,9 @@ func (m *eventQueueManager) processNextWorkItem(ctx context.Context) bool {
5050

5151
err := m.processEvent(ctx, *nodeEvent.Event, nodeEvent.Collection, nodeEvent.NodeName)
5252
if err != nil {
53-
if m.queue.NumRequeues(nodeEvent) < maxRetries {
54-
klog.Warningf("Error processing event for node %s (attempt %d/%d): %v",
55-
nodeEvent.NodeName, m.queue.NumRequeues(nodeEvent)+1, maxRetries, err)
56-
m.queue.AddRateLimited(nodeEvent)
57-
} else {
58-
klog.Errorf("Failed to process event for node %s after %d retries: %v",
59-
nodeEvent.NodeName, maxRetries, err)
60-
m.queue.Forget(nodeEvent)
61-
}
53+
klog.Warningf("Error processing event for node %s (attempt %d): %v (will retry)",
54+
nodeEvent.NodeName, m.queue.NumRequeues(nodeEvent)+1, err)
55+
m.queue.AddRateLimited(nodeEvent)
6256
} else {
6357
m.queue.Forget(nodeEvent)
6458
}

node-drainer-module/pkg/reconciler/reconciler.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,20 @@ func (r *Reconciler) executeImmediateEviction(ctx context.Context,
162162
}
163163
}
164164

165-
return nil
165+
return fmt.Errorf("immediate eviction completed, requeuing for status verification")
166166
}
167167

168168
func (r *Reconciler) executeTimeoutEviction(ctx context.Context,
169169
action *evaluator.DrainActionResult, healthEvent storeconnector.HealthEventWithStatus) error {
170170
nodeName := healthEvent.HealthEvent.NodeName
171171
timeoutMinutes := int(action.Timeout.Minutes())
172172

173-
return r.informers.DeletePodsAfterTimeout(ctx, nodeName, action.Namespaces, timeoutMinutes, &healthEvent)
173+
if err := r.informers.DeletePodsAfterTimeout(ctx,
174+
nodeName, action.Namespaces, timeoutMinutes, &healthEvent); err != nil {
175+
return err
176+
}
177+
178+
return fmt.Errorf("timeout eviction initiated, requeuing for status verification")
174179
}
175180

176181
func (r *Reconciler) executeCheckCompletion(ctx context.Context,
@@ -211,7 +216,7 @@ func (r *Reconciler) executeCheckCompletion(ctx context.Context,
211216

212217
klog.Infof("All pods completed on node %s", nodeName)
213218

214-
return nil
219+
return fmt.Errorf("pod completion verified, requeuing for status update")
215220
}
216221

217222
func (r *Reconciler) executeMarkAlreadyDrained(ctx context.Context,

0 commit comments

Comments
 (0)