Skip to content

Commit 0858065

Browse files
committed
feat: generalize condition check, add tests
1 parent 023702b commit 0858065

File tree

11 files changed

+326
-101
lines changed

11 files changed

+326
-101
lines changed

janitor/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ generate: ## Generate CRDs and move them to Helm chart directory
6060
@# Install controller-gen if not present
6161
@which controller-gen > /dev/null || (echo "Installing controller-gen..." && go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest)
6262
@# Generate CRDs into api/v1alpha1 directory
63+
@# Note: Generated CRD YAML files do not include license headers (this is expected)
64+
@# YAML files are excluded from license header checks via main Makefile: -ignore '**/*.yaml'
6365
@controller-gen crd paths=./$(API_DIR) output:crd:dir=./$(API_DIR)
6466
@# Move generated CRDs to Helm chart crds directory
6567
@echo "Moving generated CRDs to $(CRD_OUTPUT_DIR)..."

janitor/api/v1alpha1/rebootnode_types.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,33 @@ func (r *RebootNode) SetCompletionTime() {
192192
}
193193
}
194194

195+
// Interface implementation for generic status update handling
196+
197+
// GetRetryCount returns the retry count
198+
func (s *RebootNodeStatus) GetRetryCount() int32 {
199+
return s.RetryCount
200+
}
201+
202+
// GetConsecutiveFailures returns the consecutive failures count
203+
func (s *RebootNodeStatus) GetConsecutiveFailures() int32 {
204+
return s.ConsecutiveFailures
205+
}
206+
207+
// GetStartTime returns the start time
208+
func (s *RebootNodeStatus) GetStartTime() *metav1.Time {
209+
return s.StartTime
210+
}
211+
212+
// GetCompletionTime returns the completion time
213+
func (s *RebootNodeStatus) GetCompletionTime() *metav1.Time {
214+
return s.CompletionTime
215+
}
216+
217+
// GetConditions returns the conditions
218+
func (s *RebootNodeStatus) GetConditions() []metav1.Condition {
219+
return s.Conditions
220+
}
221+
195222
func init() {
196223
SchemeBuilder.Register(&RebootNode{}, &RebootNodeList{})
197224
}

janitor/api/v1alpha1/terminatenode_types.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,33 @@ func (t *TerminateNode) SetCompletionTime() {
182182
}
183183
}
184184

185+
// Interface implementation for generic status update handling
186+
187+
// GetRetryCount returns the retry count
188+
func (s *TerminateNodeStatus) GetRetryCount() int32 {
189+
return s.RetryCount
190+
}
191+
192+
// GetConsecutiveFailures returns the consecutive failures count
193+
func (s *TerminateNodeStatus) GetConsecutiveFailures() int32 {
194+
return s.ConsecutiveFailures
195+
}
196+
197+
// GetStartTime returns the start time
198+
func (s *TerminateNodeStatus) GetStartTime() *metav1.Time {
199+
return s.StartTime
200+
}
201+
202+
// GetCompletionTime returns the completion time
203+
func (s *TerminateNodeStatus) GetCompletionTime() *metav1.Time {
204+
return s.CompletionTime
205+
}
206+
207+
// GetConditions returns the conditions
208+
func (s *TerminateNodeStatus) GetConditions() []metav1.Condition {
209+
return s.Conditions
210+
}
211+
185212
func init() {
186213
SchemeBuilder.Register(&TerminateNode{}, &TerminateNodeList{})
187214
}

janitor/pkg/controller/backoff.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package controller
16+
17+
import "time"
18+
19+
// getNextRequeueDelay calculates per-resource exponential backoff delay based on consecutive failures.
20+
// This is used with ctrl.Result{RequeueAfter: delay} rather than the controller's built-in rate limiter
21+
// because we need independent backoff per node based on each node's failure history, not global controller
22+
// rate limiting. Each resource (RebootNode/TerminateNode) tracks its own ConsecutiveFailures counter
23+
// and gets its own backoff schedule.
24+
//
25+
// Backoff schedule: 30s, 1m, 2m, 5m (capped at max after 3+ failures)
26+
func getNextRequeueDelay(consecutiveFailures int32) time.Duration {
27+
delays := []time.Duration{
28+
30 * time.Second, // First retry after initial failure
29+
1 * time.Minute, // Second retry
30+
2 * time.Minute, // Third retry
31+
5 * time.Minute, // Fourth+ retry (capped)
32+
}
33+
34+
// Safely convert int32 to int for array indexing
35+
idx := int(consecutiveFailures)
36+
if idx >= len(delays) {
37+
return delays[len(delays)-1] // Cap at maximum
38+
}
39+
40+
return delays[idx]
41+
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package controller
16+
17+
import (
18+
"testing"
19+
"time"
20+
)
21+
22+
func TestGetNextRequeueDelay(t *testing.T) {
23+
tests := []struct {
24+
name string
25+
consecutiveFailures int32
26+
expectedDelay time.Duration
27+
}{
28+
{
29+
name: "first retry after initial failure",
30+
consecutiveFailures: 0,
31+
expectedDelay: 30 * time.Second,
32+
},
33+
{
34+
name: "second retry",
35+
consecutiveFailures: 1,
36+
expectedDelay: 1 * time.Minute,
37+
},
38+
{
39+
name: "third retry",
40+
consecutiveFailures: 2,
41+
expectedDelay: 2 * time.Minute,
42+
},
43+
{
44+
name: "fourth retry",
45+
consecutiveFailures: 3,
46+
expectedDelay: 5 * time.Minute,
47+
},
48+
{
49+
name: "fifth retry - capped at max",
50+
consecutiveFailures: 4,
51+
expectedDelay: 5 * time.Minute,
52+
},
53+
{
54+
name: "many retries - still capped at max",
55+
consecutiveFailures: 10,
56+
expectedDelay: 5 * time.Minute,
57+
},
58+
{
59+
name: "very large failure count - still capped",
60+
consecutiveFailures: 100,
61+
expectedDelay: 5 * time.Minute,
62+
},
63+
}
64+
65+
for _, tt := range tests {
66+
t.Run(tt.name, func(t *testing.T) {
67+
got := getNextRequeueDelay(tt.consecutiveFailures)
68+
if got != tt.expectedDelay {
69+
t.Errorf("getNextRequeueDelay(%d) = %v, want %v",
70+
tt.consecutiveFailures, got, tt.expectedDelay)
71+
}
72+
})
73+
}
74+
}
75+
76+
func TestGetNextRequeueDelay_BackoffProgression(t *testing.T) {
77+
// Test that delays increase monotonically until capped
78+
var prevDelay time.Duration
79+
for i := int32(0); i < 10; i++ {
80+
delay := getNextRequeueDelay(i)
81+
82+
if i > 0 && delay < prevDelay {
83+
t.Errorf("Backoff delay decreased at failure count %d: prev=%v, current=%v",
84+
i, prevDelay, delay)
85+
}
86+
87+
prevDelay = delay
88+
}
89+
}

janitor/pkg/controller/rebootnode_controller.go

Lines changed: 12 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"time"
2323

2424
corev1 "k8s.io/api/core/v1"
25-
apierrors "k8s.io/apimachinery/pkg/api/errors"
2625
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2726
"k8s.io/apimachinery/pkg/runtime"
2827
ctrl "sigs.k8s.io/controller-runtime"
@@ -48,28 +47,6 @@ const (
4847
MaxRebootRetries = 20 // 10 minutes at 30s base intervals
4948
)
5049

51-
// getNextRequeueDelay calculates per-resource exponential backoff delay based on consecutive failures.
52-
// This is used with ctrl.Result{RequeueAfter: delay} rather than the controller's built-in rate limiter
53-
// because we need independent backoff per node based on each node's failure history, not global controller
54-
// rate limiting. Each RebootNode tracks its own ConsecutiveFailures counter and gets its own backoff schedule.
55-
// Returns: 30s, 1m, 2m, 5m (capped at max after 3+ failures)
56-
func getNextRequeueDelay(consecutiveFailures int32) time.Duration {
57-
delays := []time.Duration{
58-
30 * time.Second, // First retry after initial failure
59-
1 * time.Minute, // Second retry
60-
2 * time.Minute, // Third retry
61-
5 * time.Minute, // Fourth+ retry (capped)
62-
}
63-
64-
// Safely convert int32 to int for array indexing
65-
idx := int(consecutiveFailures)
66-
if idx >= len(delays) {
67-
return delays[len(delays)-1] // Cap at maximum
68-
}
69-
70-
return delays[idx]
71-
}
72-
7350
// conditionsChanged compares two sets of conditions and returns true if they differ.
7451
// It checks the type, status, reason, and message of each condition.
7552
func conditionsChanged(original, updated []metav1.Condition) bool {
@@ -102,47 +79,25 @@ func conditionsChanged(original, updated []metav1.Condition) bool {
10279
}
10380

10481
// updateRebootNodeStatus is a helper function that handles status updates with proper error handling.
105-
// It centralizes the status update logic to avoid code duplication and provides consistent handling
106-
// of status updates across different code paths in the reconciliation loop.
107-
//
108-
//nolint:dupl // Similar to updateTerminateNodeStatus but operates on RebootNode type
82+
// It delegates to the generic updateNodeActionStatus function.
10983
func (r *RebootNodeReconciler) updateRebootNodeStatus(
11084
ctx context.Context,
11185
req ctrl.Request,
11286
original *janitordgxcnvidiacomv1alpha1.RebootNode,
11387
updated *janitordgxcnvidiacomv1alpha1.RebootNode,
11488
result ctrl.Result,
11589
) (ctrl.Result, error) {
116-
logger := log.FromContext(ctx)
117-
118-
// Check if status changed by comparing individual fields.
119-
// This status update is safe because controller-runtime uses leader election
120-
// to ensure only one controller instance is active at a time, even with multiple replicas.
121-
// The active controller has exclusive write access to RebootNode status.
122-
statusChanged := original.Status.RetryCount != updated.Status.RetryCount ||
123-
original.Status.ConsecutiveFailures != updated.Status.ConsecutiveFailures ||
124-
(original.Status.StartTime == nil) != (updated.Status.StartTime == nil) ||
125-
(original.Status.CompletionTime == nil) != (updated.Status.CompletionTime == nil) ||
126-
conditionsChanged(original.Status.Conditions, updated.Status.Conditions)
127-
128-
if statusChanged {
129-
if err := r.Status().Update(ctx, updated); err != nil {
130-
if apierrors.IsNotFound(err) {
131-
logger.V(0).Info("post-reconciliation status update: object not found, assumed deleted",
132-
"name", updated.Name)
133-
return ctrl.Result{}, nil
134-
}
135-
logger.Error(err, "failed to update rebootnode status",
136-
"node", updated.Spec.NodeName)
137-
return ctrl.Result{}, err
138-
}
139-
logger.Info("rebootnode status updated",
140-
"node", updated.Spec.NodeName,
141-
"retryCount", int(updated.Status.RetryCount),
142-
"consecutiveFailures", int(updated.Status.ConsecutiveFailures))
143-
}
144-
145-
return result, nil
90+
return updateNodeActionStatus(
91+
ctx,
92+
r.Status(),
93+
original,
94+
updated,
95+
&original.Status,
96+
&updated.Status,
97+
updated.Spec.NodeName,
98+
"rebootnode",
99+
result,
100+
)
146101
}
147102

148103
// RebootNodeReconciler reconciles a RebootNode object

0 commit comments

Comments
 (0)