Skip to content

Commit 364837a

Browse files
committed
Improve Day 2 retry logic for hardware configuration
Enable retry of hardware configuration operations after timeouts or failures by allowing spec changes to trigger new configuration attempts. Key changes: - Add webhook validation to prevent spec updates for Day 0 provisioning timeouts/failures (requires delete and recreate) - Track ObservedConfigTransactionId to detect configuration spec changes - Skip timeout checking when ConfigTransactionId changes (indicates retry) - Add waitingForConfigStart logic to handle new configuration attempts - Update shouldUpdateHardwareStatus to properly handle terminal states - Add Day 2 retry test scenarios Assisted-by: Cursor/claude-4-sonnet Signed-off-by: Tao Liu <[email protected]>
1 parent b5b05e0 commit 364837a

File tree

7 files changed

+341
-19
lines changed

7 files changed

+341
-19
lines changed

api/provisioning/v1alpha1/provisioningrequest_webhook.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package v1alpha1
99
import (
1010
"context"
1111
"fmt"
12+
"reflect"
1213
"strings"
1314

1415
"github.com/google/uuid"
@@ -139,6 +140,22 @@ func (v *provisioningRequestValidator) validateCreateOrUpdate(ctx context.Contex
139140
return nil
140141
}
141142

143+
// Check if hardware provisioning has timed out or failed
144+
// If so, reject any spec updates - user must delete and recreate the PR
145+
hwProvisionedCond := meta.FindStatusCondition(
146+
newPr.Status.Conditions, string(PRconditionTypes.HardwareProvisioned))
147+
if hwProvisionedCond != nil &&
148+
hwProvisionedCond.Status == "False" &&
149+
(hwProvisionedCond.Reason == string(CRconditionReasons.TimedOut) ||
150+
hwProvisionedCond.Reason == string(CRconditionReasons.Failed)) {
151+
// Compare specs to see if there's an actual spec change
152+
if !reflect.DeepEqual(oldPr.Spec, newPr.Spec) {
153+
return fmt.Errorf("hardware provisioning has timed out or failed. " +
154+
"Spec changes are not allowed. " +
155+
"Please delete and recreate the ProvisioningRequest to retry")
156+
}
157+
}
158+
142159
crProvisionedCond := meta.FindStatusCondition(
143160
newPr.Status.Conditions, string(PRconditionTypes.ClusterProvisioned))
144161
if crProvisionedCond == nil ||

hwmgr-plugins/controller/utils/node_allocation_request.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,11 @@ func UpdateNodeAllocationRequestStatusCondition(
201201
conditionStatus,
202202
message)
203203

204+
// Update the observed config transaction id if the condition is Configured
205+
if conditionType == hwmgmtv1alpha1.Configured {
206+
newNodeAllocationRequest.Status.ObservedConfigTransactionId = nodeAllocationRequest.Spec.ConfigTransactionId
207+
}
208+
204209
// Copy both start time fields from the local object to persist them
205210
newNodeAllocationRequest.Status.ProvisioningStartTime = nodeAllocationRequest.Status.ProvisioningStartTime
206211
newNodeAllocationRequest.Status.ConfiguringStartTime = nodeAllocationRequest.Status.ConfiguringStartTime

hwmgr-plugins/metal3/controller/metal3_nodeallocationrequest_controller.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -684,6 +684,13 @@ func (r *NodeAllocationRequestReconciler) handleNodeAllocationRequestSpecChanged
684684
nodeAllocationRequest.Status.Conditions,
685685
string(hwmgmtv1alpha1.Configured))
686686

687+
if configuredCondition != nil {
688+
if configuredCondition.Reason == string(hwmgmtv1alpha1.TimedOut) ||
689+
configuredCondition.Reason == string(hwmgmtv1alpha1.Failed) {
690+
r.Logger.InfoContext(ctx, "NodeAllocationRequest configuration is in terminal state, but config change detected - allowing retry",
691+
slog.String("reason", configuredCondition.Reason))
692+
}
693+
}
687694
// Set a default status that will be updated during the configuration process
688695
if configuredCondition == nil || configuredCondition.Status == metav1.ConditionTrue {
689696
if result, err := setAwaitConfigCondition(ctx, r.Client, nodeAllocationRequest); err != nil {
@@ -859,6 +866,19 @@ func (r *NodeAllocationRequestReconciler) checkHardwareTimeout(
859866

860867
// If provisioning is complete (or not in-progress) and configuring is in-progress, use ConfiguringStartTime.
861868
if inProgress(cfg) {
869+
// For Day 2 retry: if there's a spec change (ConfigTransactionId mismatch),
870+
// skip timeout checking as this is a new configuration attempt
871+
// Note: ObservedConfigTransactionId is 0 when not set, so we check for 0 or mismatch
872+
if nar.Spec.ConfigTransactionId != 0 &&
873+
(nar.Status.ObservedConfigTransactionId == 0 ||
874+
nar.Status.ObservedConfigTransactionId != nar.Spec.ConfigTransactionId) {
875+
r.Logger.InfoContext(context.Background(), "Configuration spec change detected, skipping timeout check for retry",
876+
slog.String("nar", nar.Name),
877+
slog.Int64("specConfigTransactionId", nar.Spec.ConfigTransactionId),
878+
slog.Int64("observedConfigTransactionId", nar.Status.ObservedConfigTransactionId))
879+
return false, hwmgmtv1alpha1.ConditionType(""), nil
880+
}
881+
862882
if nar.Status.ConfiguringStartTime == nil || nar.Status.ConfiguringStartTime.Time.IsZero() {
863883
// Inconsistent state: configuring says in-progress but no start time.
864884
r.Logger.WarnContext(context.Background(), "Configuration in progress but no start time set",

hwmgr-plugins/metal3/controller/metal3_nodeallocationrequest_controller_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,4 +286,92 @@ var _ = Describe("Metal3 NodeAllocationRequest Controller Timeout Handling", fun
286286
})
287287
})
288288
})
289+
290+
Describe("Day 2 retry scenarios", func() {
291+
var nar *pluginsv1alpha1.NodeAllocationRequest
292+
293+
BeforeEach(func() {
294+
nar = &pluginsv1alpha1.NodeAllocationRequest{
295+
ObjectMeta: metav1.ObjectMeta{
296+
Name: "test-nar-day2",
297+
Namespace: "default",
298+
},
299+
Spec: pluginsv1alpha1.NodeAllocationRequestSpec{
300+
HardwareProvisioningTimeout: "5m",
301+
ConfigTransactionId: 2, // Indicates spec change
302+
},
303+
Status: pluginsv1alpha1.NodeAllocationRequestStatus{
304+
Conditions: []metav1.Condition{},
305+
},
306+
}
307+
})
308+
309+
Context("when configuration failed and spec changed", func() {
310+
BeforeEach(func() {
311+
// Set provisioning as completed
312+
hwmgrutils.SetStatusCondition(&nar.Status.Conditions,
313+
string(hwmgmtv1alpha1.Provisioned),
314+
string(hwmgmtv1alpha1.Completed),
315+
metav1.ConditionTrue,
316+
"Hardware provisioning completed")
317+
318+
// Set configuration as failed
319+
hwmgrutils.SetStatusCondition(&nar.Status.Conditions,
320+
string(hwmgmtv1alpha1.Configured),
321+
string(hwmgmtv1alpha1.Failed),
322+
metav1.ConditionFalse,
323+
"Hardware configuration failed")
324+
325+
// Set configuration start time to old (exceeded timeout) - this should be ignored when spec changes
326+
startTime := metav1.Time{Time: time.Now().Add(-10 * time.Minute)}
327+
nar.Status.ConfiguringStartTime = &startTime
328+
329+
// Set ObservedConfigTransactionId to 1, but Spec.ConfigTransactionId is 2 (mismatch = spec change)
330+
nar.Status.ObservedConfigTransactionId = 1
331+
})
332+
333+
It("should allow retry when spec changes", func() {
334+
// The Metal3 controller should detect the spec change and skip timeout checking
335+
// This allows retry even when the previous configuration failed/timed out
336+
timeoutExceeded, conditionType, err := reconciler.checkHardwareTimeout(nar)
337+
Expect(err).ToNot(HaveOccurred())
338+
Expect(timeoutExceeded).To(BeFalse())
339+
Expect(conditionType).To(Equal(hwmgmtv1alpha1.ConditionType("")))
340+
})
341+
})
342+
343+
Context("when configuration timed out and spec changed", func() {
344+
BeforeEach(func() {
345+
// Set provisioning as completed
346+
hwmgrutils.SetStatusCondition(&nar.Status.Conditions,
347+
string(hwmgmtv1alpha1.Provisioned),
348+
string(hwmgmtv1alpha1.Completed),
349+
metav1.ConditionTrue,
350+
"Hardware provisioning completed")
351+
352+
// Set configuration as timed out
353+
hwmgrutils.SetStatusCondition(&nar.Status.Conditions,
354+
string(hwmgmtv1alpha1.Configured),
355+
string(hwmgmtv1alpha1.TimedOut),
356+
metav1.ConditionFalse,
357+
"Hardware configuration timed out")
358+
359+
// Set configuration start time to old (exceeded timeout) - this should be ignored when spec changes
360+
startTime := metav1.Time{Time: time.Now().Add(-10 * time.Minute)}
361+
nar.Status.ConfiguringStartTime = &startTime
362+
363+
// Set ObservedConfigTransactionId to 1, but Spec.ConfigTransactionId is 2 (mismatch = spec change)
364+
nar.Status.ObservedConfigTransactionId = 1
365+
})
366+
367+
It("should allow retry when spec changes", func() {
368+
// Similar to failed case, should allow retry with spec change
369+
// Timeout check should be skipped when spec changes
370+
timeoutExceeded, conditionType, err := reconciler.checkHardwareTimeout(nar)
371+
Expect(err).ToNot(HaveOccurred())
372+
Expect(timeoutExceeded).To(BeFalse())
373+
Expect(conditionType).To(Equal(hwmgmtv1alpha1.ConditionType("")))
374+
})
375+
})
376+
})
289377
})

internal/controllers/provisioningrequest_hwprovision.go

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -548,8 +548,21 @@ func (t *provisioningRequestReconcilerTask) updateHardwareStatus(
548548
}
549549
}
550550

551+
// Check if we're waiting for a new configuration to start (only when condition doesn't exist)
552+
waitingForConfigStart := condition == hwmgmtv1alpha1.Configured &&
553+
hwCondition == nil &&
554+
(nodeAllocationRequest.Status.ObservedConfigTransactionId == nil ||
555+
*nodeAllocationRequest.Status.ObservedConfigTransactionId == 0 ||
556+
*nodeAllocationRequest.Status.ObservedConfigTransactionId != t.object.Generation)
557+
551558
if hwCondition == nil {
552-
// Condition does not exist
559+
// Condition does not exist in plugin response
560+
if waitingForConfigStart {
561+
// We're waiting for a new configuration to start - return ConditionDoesNotExistsErr
562+
// to indicate that configuration hasn't started yet for this transaction
563+
return false, false, &ctlrutils.ConditionDoesNotExistsErr{ConditionName: string(condition)}
564+
}
565+
// Condition doesn't exist and we're not waiting for config start
553566
status = metav1.ConditionFalse
554567
reason = string(provisioningv1alpha1.CRconditionReasons.InProgress)
555568
message = fmt.Sprintf("Hardware %s is in progress", ctlrutils.GetStatusMessage(condition))
@@ -561,7 +574,8 @@ func (t *provisioningRequestReconcilerTask) updateHardwareStatus(
561574
}
562575
ctlrutils.SetProvisioningStateInProgress(t.object, message)
563576
} else {
564-
// A hardware condition was found and should be processed; use its details.
577+
// A hardware condition was found in plugin response - always process it
578+
// (even if we're waiting for config start, the plugin has provided valid state)
565579
status, reason, message, timedOutOrFailed = t.processExistingHardwareCondition(hwCondition, condition)
566580
}
567581

@@ -624,31 +638,50 @@ func (t *provisioningRequestReconcilerTask) isCallbackTriggeredReconciliation()
624638
return true
625639
}
626640

627-
// shouldUpdateHardwareStatus determines if we should update hardware status based on callback
628-
func (t *provisioningRequestReconcilerTask) shouldUpdateHardwareStatus(condition hwmgmtv1alpha1.ConditionType) bool {
629-
// Always update on callback-triggered reconciliation
641+
// shouldUpdateHardwareStatus decides whether to update hardware status during reconciliation.
642+
// Rules:
643+
// - Always update on callback-triggered reconciliations.
644+
// - During polling (non-callback), for Provisioned/Configured:
645+
// - If no prior condition exists, update.
646+
// - If prior condition is terminal (TimedOut/Failed), do not update.
647+
// - Otherwise, update only if prior Status != True.
648+
func (t *provisioningRequestReconcilerTask) shouldUpdateHardwareStatus(
649+
condition hwmgmtv1alpha1.ConditionType,
650+
) bool {
651+
// Callbacks always allowed to update.
630652
if t.isCallbackTriggeredReconciliation() {
631653
return true
632654
}
633655

634-
// For non-callback reconciliation, check different conditions
635-
switch condition {
636-
case hwmgmtv1alpha1.Provisioned:
637-
hwProvisionedCond := meta.FindStatusCondition(t.object.Status.Conditions, string(provisioningv1alpha1.PRconditionTypes.HardwareProvisioned))
638-
return hwProvisionedCond == nil // Only update if no status set yet
639-
case hwmgmtv1alpha1.Configured:
640-
hwConfiguredCond := meta.FindStatusCondition(t.object.Status.Conditions, string(provisioningv1alpha1.PRconditionTypes.HardwareConfigured))
656+
// Map HW condition → PR condition type stored in Status.Conditions.
657+
prTypeByHW := map[hwmgmtv1alpha1.ConditionType]string{
658+
hwmgmtv1alpha1.Provisioned: string(provisioningv1alpha1.PRconditionTypes.HardwareProvisioned),
659+
hwmgmtv1alpha1.Configured: string(provisioningv1alpha1.PRconditionTypes.HardwareConfigured),
660+
}
641661

642-
// If no status exists yet, allow update
643-
if hwConfiguredCond == nil {
644-
return true
645-
}
662+
prCondType, ok := prTypeByHW[condition]
663+
if !ok {
664+
// Unknown/non-actionable condition types: do not update during polling.
665+
return false
666+
}
667+
668+
hwCond := meta.FindStatusCondition(t.object.Status.Conditions, prCondType)
669+
if hwCond == nil {
670+
// No status yet → allow first write.
671+
return true
672+
}
646673

647-
// Allow updates for in-progress states (not completed)
648-
return hwConfiguredCond.Status != metav1.ConditionTrue
674+
if isTerminalReason(hwCond.Reason) {
675+
return false
649676
}
650677

651-
return false
678+
// Update only if not already True (i.e., False or Unknown).
679+
return hwCond.Status != metav1.ConditionTrue
680+
}
681+
682+
func isTerminalReason(reason string) bool {
683+
return reason == string(provisioningv1alpha1.CRconditionReasons.TimedOut) ||
684+
reason == string(provisioningv1alpha1.CRconditionReasons.Failed)
652685
}
653686

654687
// checkExistingNodeAllocationRequest checks for an existing NodeAllocationRequest and verifies changes if necessary
@@ -730,6 +763,7 @@ func (t *provisioningRequestReconcilerTask) buildNodeAllocationRequest(clusterIn
730763
nodeAllocationRequest.ClusterId = clusterId.(string)
731764
nodeAllocationRequest.NodeGroup = nodeGroups
732765
nodeAllocationRequest.BootInterfaceLabel = hwTemplate.Spec.BootInterfaceLabel
766+
nodeAllocationRequest.ConfigTransactionId = t.object.Generation
733767

734768
// Set HardwareProvisioningTimeout - use template value or default
735769
timeoutStr := hwTemplate.Spec.HardwareProvisioningTimeout

0 commit comments

Comments
 (0)