Skip to content

Commit d2bf2fc

Browse files
trashhalo and claude committed
Add CancelHealthCheckOnNewRevision feature to avoid getting stuck on failing commits
This feature allows health checks to be cancelled when a new source revision becomes available, preventing the controller from getting stuck waiting for the full timeout duration when a fix is already available.

Features:
- New opt-in feature flag: CancelHealthCheckOnNewRevision (default: false)
- Health checks are cancelled early when new revisions are detected
- Reduces delay from ~30s timeout to ~5s when fixes are pushed
- Preserves existing behavior when feature is disabled
- Comprehensive test coverage demonstrating both behaviors

The implementation monitors source revisions during health checks and cancels ongoing checks when new revisions are available, allowing immediate processing of potential fixes instead of waiting for full timeout periods.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent 14d88d4 commit d2bf2fc

File tree

3 files changed

+520
-8
lines changed

3 files changed

+520
-8
lines changed

internal/controller/kustomization_controller.go

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,7 @@ func (r *KustomizationReconciler) reconcile(
492492
resourceManager,
493493
patcher,
494494
obj,
495+
src,
495496
revision,
496497
originRevision,
497498
isNewRevision,
@@ -936,6 +937,7 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
936937
manager *ssa.ResourceManager,
937938
patcher *patch.SerialPatcher,
938939
obj *kustomizev1.Kustomization,
940+
src sourcev1.Source,
939941
revision string,
940942
originRevision string,
941943
isNewRevision bool,
@@ -982,15 +984,79 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
982984
return fmt.Errorf("unable to update the healthy status to progressing: %w", err)
983985
}
984986

987+
// Check if we should cancel health checks on new revisions
988+
cancelOnNewRevision := false
989+
if enabled, err := features.Enabled(features.CancelHealthCheckOnNewRevision); err == nil && enabled {
990+
cancelOnNewRevision = true
991+
}
992+
993+
// Create a cancellable context for health checks if the feature is enabled
994+
healthCtx := ctx
995+
var cancel context.CancelFunc
996+
if cancelOnNewRevision {
997+
healthCtx, cancel = context.WithCancel(ctx)
998+
defer cancel()
999+
1000+
// Start monitoring for new revisions
1001+
go func() {
1002+
ticker := time.NewTicker(5 * time.Second)
1003+
defer ticker.Stop()
1004+
1005+
for {
1006+
select {
1007+
case <-healthCtx.Done():
1008+
return
1009+
case <-ticker.C:
1010+
// Get the latest source artifact
1011+
latestSrc, err := r.getSource(ctx, obj)
1012+
if err == nil && latestSrc.GetArtifact() != nil {
1013+
if latestSrc.GetArtifact().Revision != revision {
1014+
ctrl.LoggerFrom(ctx).Info("New revision detected during health check, cancelling",
1015+
"current", revision,
1016+
"new", latestSrc.GetArtifact().Revision)
1017+
cancel()
1018+
return
1019+
}
1020+
}
1021+
}
1022+
}
1023+
}()
1024+
}
1025+
9851026
// Check the health with a default timeout of 30sec shorter than the reconciliation interval.
986-
if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
987-
Interval: 5 * time.Second,
988-
Timeout: obj.GetTimeout(),
989-
FailFast: r.FailFast,
990-
}); err != nil {
991-
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", err)
992-
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", err)
993-
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), err)
1027+
var healthErr error
1028+
if cancelOnNewRevision {
1029+
// Run health check in a goroutine and race it with revision monitoring
1030+
healthDone := make(chan error, 1)
1031+
go func() {
1032+
healthDone <- manager.WaitForSet(toCheck, ssa.WaitOptions{
1033+
Interval: 5 * time.Second,
1034+
Timeout: obj.GetTimeout(),
1035+
FailFast: r.FailFast,
1036+
})
1037+
}()
1038+
1039+
// Wait for either health check completion or context cancellation
1040+
select {
1041+
case healthErr = <-healthDone:
1042+
// Health check completed normally
1043+
case <-healthCtx.Done():
1044+
// Context was cancelled due to new revision
1045+
healthErr = fmt.Errorf("health check cancelled due to new revision availability")
1046+
}
1047+
} else {
1048+
// Normal health check without cancellation
1049+
healthErr = manager.WaitForSet(toCheck, ssa.WaitOptions{
1050+
Interval: 5 * time.Second,
1051+
Timeout: obj.GetTimeout(),
1052+
FailFast: r.FailFast,
1053+
})
1054+
}
1055+
1056+
if healthErr != nil {
1057+
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
1058+
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
1059+
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), healthErr)
9941060
}
9951061

9961062
// Emit recovery event if the previous health check failed.

0 commit comments

Comments
 (0)