Skip to content

Commit c1db411

Browse files
trashhalo and claude committed
Add CancelHealthCheckOnNewRevision feature to avoid getting stuck on failing commits
This feature allows health checks to be cancelled when a new source revision becomes available, preventing the controller from getting stuck waiting for full timeout durations when fixes are already available.

Features:
- New opt-in feature flag: CancelHealthCheckOnNewRevision (default: false)
- Health checks are cancelled early when new revisions are detected
- Reduces delay from ~30s timeout to ~5s when fixes are pushed
- Preserves existing behavior when feature is disabled
- Comprehensive test coverage demonstrating both behaviors

The implementation monitors source revisions during health checks and cancels ongoing checks when new revisions are available, allowing immediate processing of potential fixes instead of waiting for full timeout periods.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>
1 parent 44cddcb commit c1db411

File tree

3 files changed

+521
-8
lines changed

3 files changed

+521
-8
lines changed

internal/controller/kustomization_controller.go

Lines changed: 75 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@ import (
6969
kustomizev1 "github.com/fluxcd/kustomize-controller/api/v1"
7070
intcache "github.com/fluxcd/kustomize-controller/internal/cache"
7171
"github.com/fluxcd/kustomize-controller/internal/decryptor"
72+
"github.com/fluxcd/kustomize-controller/internal/features"
7273
"github.com/fluxcd/kustomize-controller/internal/inventory"
7374
intruntime "github.com/fluxcd/kustomize-controller/internal/runtime"
7475
)
@@ -457,6 +458,7 @@ func (r *KustomizationReconciler) reconcile(
457458
resourceManager,
458459
patcher,
459460
obj,
461+
src,
460462
revision,
461463
originRevision,
462464
isNewRevision,
@@ -875,6 +877,7 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
875877
manager *ssa.ResourceManager,
876878
patcher *patch.SerialPatcher,
877879
obj *kustomizev1.Kustomization,
880+
src sourcev1.Source,
878881
revision string,
879882
originRevision string,
880883
isNewRevision bool,
@@ -921,15 +924,79 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
921924
return fmt.Errorf("unable to update the healthy status to progressing: %w", err)
922925
}
923926

927+
// Check if we should cancel health checks on new revisions
928+
cancelOnNewRevision := false
929+
if enabled, err := features.Enabled(features.CancelHealthCheckOnNewRevision); err == nil && enabled {
930+
cancelOnNewRevision = true
931+
}
932+
933+
// Create a cancellable context for health checks if the feature is enabled
934+
healthCtx := ctx
935+
var cancel context.CancelFunc
936+
if cancelOnNewRevision {
937+
healthCtx, cancel = context.WithCancel(ctx)
938+
defer cancel()
939+
940+
// Start monitoring for new revisions
941+
go func() {
942+
ticker := time.NewTicker(5 * time.Second)
943+
defer ticker.Stop()
944+
945+
for {
946+
select {
947+
case <-healthCtx.Done():
948+
return
949+
case <-ticker.C:
950+
// Get the latest source artifact
951+
latestSrc, err := r.getSource(ctx, obj)
952+
if err == nil && latestSrc.GetArtifact() != nil {
953+
if latestSrc.GetArtifact().Revision != revision {
954+
ctrl.LoggerFrom(ctx).Info("New revision detected during health check, cancelling",
955+
"current", revision,
956+
"new", latestSrc.GetArtifact().Revision)
957+
cancel()
958+
return
959+
}
960+
}
961+
}
962+
}
963+
}()
964+
}
965+
924966
// Check the health with a default timeout of 30sec shorter than the reconciliation interval.
925-
if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
926-
Interval: 5 * time.Second,
927-
Timeout: obj.GetTimeout(),
928-
FailFast: r.FailFast,
929-
}); err != nil {
930-
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", err)
931-
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", err)
932-
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), err)
967+
var healthErr error
968+
if cancelOnNewRevision {
969+
// Run health check in a goroutine and race it with revision monitoring
970+
healthDone := make(chan error, 1)
971+
go func() {
972+
healthDone <- manager.WaitForSet(toCheck, ssa.WaitOptions{
973+
Interval: 5 * time.Second,
974+
Timeout: obj.GetTimeout(),
975+
FailFast: r.FailFast,
976+
})
977+
}()
978+
979+
// Wait for either health check completion or context cancellation
980+
select {
981+
case healthErr = <-healthDone:
982+
// Health check completed normally
983+
case <-healthCtx.Done():
984+
// Context was cancelled due to new revision
985+
healthErr = fmt.Errorf("health check cancelled due to new revision availability")
986+
}
987+
} else {
988+
// Normal health check without cancellation
989+
healthErr = manager.WaitForSet(toCheck, ssa.WaitOptions{
990+
Interval: 5 * time.Second,
991+
Timeout: obj.GetTimeout(),
992+
FailFast: r.FailFast,
993+
})
994+
}
995+
996+
if healthErr != nil {
997+
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
998+
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
999+
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), healthErr)
9331000
}
9341001

9351002
// Emit recovery event if the previous health check failed.

0 commit comments

Comments
 (0)