Skip to content

Commit ecfdfea

Browse files
Stephen Solkatrashhalo
authored andcommitted
feat: Add CancelHealthCheckOnNewRevision feature to avoid getting stuck on failing commits
This feature allows health checks to be cancelled when a new source revision becomes available, preventing the controller from getting stuck waiting for full timeout durations when fixes are already available. Features: - New opt-in feature flag: CancelHealthCheckOnNewRevision (default: false) - Health checks are cancelled early when new revisions are detected (~5s vs 5min timeout) - Uses the new WaitForSetWithContext method for clean context-based cancellation - Preserves existing behavior when feature is disabled The implementation monitors source revisions during health checks and cancels ongoing checks when new revisions are available, allowing immediate processing of potential fixes instead of waiting for full timeout periods. Signed-off-by: Stephen Solka <[email protected]>
1 parent 14d88d4 commit ecfdfea

File tree

6 files changed

+280
-11
lines changed

6 files changed

+280
-11
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ require (
2929
github.com/fluxcd/pkg/http/fetch v0.19.0
3030
github.com/fluxcd/pkg/kustomize v1.22.0
3131
github.com/fluxcd/pkg/runtime v0.86.0
32-
github.com/fluxcd/pkg/ssa v0.57.0
32+
github.com/fluxcd/pkg/ssa v0.58.0
3333
github.com/fluxcd/pkg/tar v0.14.0
3434
github.com/fluxcd/pkg/testserver v0.13.0
3535
github.com/fluxcd/source-controller/api v1.7.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,8 @@ github.com/fluxcd/pkg/runtime v0.86.0 h1:q7aBSerJwt0N9hpurPVElG+HWpVhZcs6t96bcNQ
211211
github.com/fluxcd/pkg/runtime v0.86.0/go.mod h1:Wt9mUzQgMPQMu2D/wKl5pG4zh5vu/tfF5wq9pPobxOQ=
212212
github.com/fluxcd/pkg/sourceignore v0.14.0 h1:ZiZzbXtXb/Qp7I7JCStsxOlX8ri8rWwCvmvIrJ0UzQQ=
213213
github.com/fluxcd/pkg/sourceignore v0.14.0/go.mod h1:E3zKvyTyB+oQKqm/2I/jS6Rrt3B7fNuig/4bY2vi3bg=
214-
github.com/fluxcd/pkg/ssa v0.57.0 h1:G2cKyeyOtEdOdLeMBWZe0XT+J0rBWSBzy9xln2myTaI=
215-
github.com/fluxcd/pkg/ssa v0.57.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
214+
github.com/fluxcd/pkg/ssa v0.58.0 h1:W7m2LQFsZxPN9nn3lfGVDwXsZnIgCWWJ/+/K5hpzW+k=
215+
github.com/fluxcd/pkg/ssa v0.58.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
216216
github.com/fluxcd/pkg/tar v0.14.0 h1:9Gku8FIvPt2bixKldZnzXJ/t+7SloxePlzyVGOK8GVQ=
217217
github.com/fluxcd/pkg/tar v0.14.0/go.mod h1:+rOWYk93qLEJ8WwmkvJOkB8i0dna1mrwJFybE8i9Udo=
218218
github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg=

internal/controller/kustomization_controller.go

Lines changed: 58 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,7 @@ func (r *KustomizationReconciler) reconcile(
492492
resourceManager,
493493
patcher,
494494
obj,
495+
src,
495496
revision,
496497
originRevision,
497498
isNewRevision,
@@ -936,6 +937,7 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
936937
manager *ssa.ResourceManager,
937938
patcher *patch.SerialPatcher,
938939
obj *kustomizev1.Kustomization,
940+
src sourcev1.Source,
939941
revision string,
940942
originRevision string,
941943
isNewRevision bool,
@@ -982,15 +984,63 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
982984
return fmt.Errorf("unable to update the healthy status to progressing: %w", err)
983985
}
984986

987+
// Check if we should cancel health checks on new revisions
988+
cancelOnNewRevision := false
989+
if enabled, err := features.Enabled(features.CancelHealthCheckOnNewRevision); err == nil && enabled {
990+
cancelOnNewRevision = true
991+
}
992+
985993
// Check the health with a default timeout of 30sec shorter than the reconciliation interval.
986-
if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
987-
Interval: 5 * time.Second,
988-
Timeout: obj.GetTimeout(),
989-
FailFast: r.FailFast,
990-
}); err != nil {
991-
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", err)
992-
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", err)
993-
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), err)
994+
var healthErr error
995+
if cancelOnNewRevision {
996+
// Create a cancellable context for health checks that monitors for new revisions
997+
healthCtx, cancel := context.WithCancel(ctx)
998+
defer cancel()
999+
1000+
// Start monitoring for new revisions to allow early cancellation
1001+
go func() {
1002+
ticker := time.NewTicker(5 * time.Second)
1003+
defer ticker.Stop()
1004+
1005+
for {
1006+
select {
1007+
case <-healthCtx.Done():
1008+
return
1009+
case <-ticker.C:
1010+
// Get the latest source artifact
1011+
latestSrc, err := r.getSource(ctx, obj)
1012+
if err == nil && latestSrc.GetArtifact() != nil {
1013+
if latestSrc.GetArtifact().Revision != revision {
1014+
ctrl.LoggerFrom(ctx).Info("New revision detected during health check, cancelling",
1015+
"current", revision,
1016+
"new", latestSrc.GetArtifact().Revision)
1017+
cancel()
1018+
return
1019+
}
1020+
}
1021+
}
1022+
}
1023+
}()
1024+
1025+
// Use the new cancellable WaitForSetWithContext
1026+
healthErr = manager.WaitForSetWithContext(healthCtx, toCheck, ssa.WaitOptions{
1027+
Interval: 5 * time.Second,
1028+
Timeout: obj.GetTimeout(),
1029+
FailFast: r.FailFast,
1030+
})
1031+
} else {
1032+
// Use traditional WaitForSet without cancellation
1033+
healthErr = manager.WaitForSet(toCheck, ssa.WaitOptions{
1034+
Interval: 5 * time.Second,
1035+
Timeout: obj.GetTimeout(),
1036+
FailFast: r.FailFast,
1037+
})
1038+
}
1039+
1040+
if healthErr != nil {
1041+
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
1042+
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
1043+
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), healthErr)
9941044
}
9951045

9961046
// Emit recovery event if the previous health check failed.

internal/controller/kustomization_wait_test.go

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,12 @@ import (
3434
"github.com/fluxcd/pkg/apis/kustomize"
3535
"github.com/fluxcd/pkg/apis/meta"
3636
"github.com/fluxcd/pkg/runtime/conditions"
37+
feathelper "github.com/fluxcd/pkg/runtime/features"
3738
"github.com/fluxcd/pkg/testserver"
3839
sourcev1 "github.com/fluxcd/source-controller/api/v1"
3940

4041
kustomizev1 "github.com/fluxcd/kustomize-controller/api/v1"
42+
"github.com/fluxcd/kustomize-controller/internal/features"
4143
)
4244

4345
func TestKustomizationReconciler_WaitConditions(t *testing.T) {
@@ -297,6 +299,7 @@ parameters:
297299
return apierrors.IsNotFound(err)
298300
}, timeout, time.Second).Should(BeTrue())
299301
})
302+
300303
}
301304

302305
func TestKustomizationReconciler_WaitsForCustomHealthChecks(t *testing.T) {
@@ -466,3 +469,174 @@ func TestKustomizationReconciler_RESTMapper(t *testing.T) {
466469
g.Expect(err).To(HaveOccurred())
467470
})
468471
}
472+
473+
func TestKustomizationReconciler_CancelHealthCheckOnNewRevision(t *testing.T) {
474+
g := NewWithT(t)
475+
id := "cancel-" + randStringRunes(5)
476+
resultK := &kustomizev1.Kustomization{}
477+
timeout := 60 * time.Second
478+
479+
// Enable the CancelHealthCheckOnNewRevision feature
480+
originalValue := features.FeatureGates()[features.CancelHealthCheckOnNewRevision]
481+
features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = true
482+
483+
// Initialize the feature gate system properly
484+
featGates := feathelper.FeatureGates{}
485+
err := featGates.SupportedFeatures(features.FeatureGates())
486+
g.Expect(err).NotTo(HaveOccurred(), "failed to initialize feature gates")
487+
488+
defer func() {
489+
features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = originalValue
490+
}()
491+
492+
err = createNamespace(id)
493+
g.Expect(err).NotTo(HaveOccurred(), "failed to create test namespace")
494+
495+
err = createKubeConfigSecret(id)
496+
g.Expect(err).NotTo(HaveOccurred(), "failed to create kubeconfig secret")
497+
498+
// Create initial successful manifests
499+
successManifests := []testserver.File{
500+
{
501+
Name: "configmap.yaml",
502+
Body: fmt.Sprintf(`apiVersion: v1
503+
kind: ConfigMap
504+
metadata:
505+
name: test-config
506+
namespace: %s
507+
data:
508+
foo: bar`, id),
509+
},
510+
}
511+
artifact, err := testServer.ArtifactFromFiles(successManifests)
512+
g.Expect(err).ToNot(HaveOccurred())
513+
514+
repositoryName := types.NamespacedName{
515+
Name: fmt.Sprintf("cancel-%s", randStringRunes(5)),
516+
Namespace: id,
517+
}
518+
519+
err = applyGitRepository(repositoryName, artifact, "main/"+artifact)
520+
g.Expect(err).NotTo(HaveOccurred())
521+
522+
kustomization := &kustomizev1.Kustomization{}
523+
kustomization.Name = id
524+
kustomization.Namespace = id
525+
kustomization.Spec = kustomizev1.KustomizationSpec{
526+
Interval: metav1.Duration{Duration: 10 * time.Minute},
527+
Path: "./",
528+
Wait: true,
529+
Timeout: &metav1.Duration{Duration: 5 * time.Minute},
530+
SourceRef: kustomizev1.CrossNamespaceSourceReference{
531+
Name: repositoryName.Name,
532+
Kind: sourcev1.GitRepositoryKind,
533+
Namespace: id,
534+
},
535+
KubeConfig: &meta.KubeConfigReference{
536+
SecretRef: &meta.SecretKeyReference{
537+
Name: "kubeconfig",
538+
},
539+
},
540+
}
541+
542+
err = k8sClient.Create(context.Background(), kustomization)
543+
g.Expect(err).NotTo(HaveOccurred())
544+
545+
// Wait for initial reconciliation to succeed
546+
g.Eventually(func() bool {
547+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
548+
return conditions.IsReady(resultK)
549+
}, timeout, time.Second).Should(BeTrue())
550+
551+
// Create failing manifests (deployment with bad image that will timeout)
552+
failingManifests := []testserver.File{
553+
{
554+
Name: "deployment.yaml",
555+
Body: fmt.Sprintf(`apiVersion: apps/v1
556+
kind: Deployment
557+
metadata:
558+
name: failing-deployment
559+
namespace: %s
560+
spec:
561+
replicas: 1
562+
selector:
563+
matchLabels:
564+
app: failing-app
565+
template:
566+
metadata:
567+
labels:
568+
app: failing-app
569+
spec:
570+
containers:
571+
- name: app
572+
image: nonexistent.registry/badimage:latest
573+
ports:
574+
- containerPort: 8080`, id),
575+
},
576+
}
577+
578+
// Apply failing revision
579+
failingArtifact, err := testServer.ArtifactFromFiles(failingManifests)
580+
g.Expect(err).ToNot(HaveOccurred())
581+
582+
err = applyGitRepository(repositoryName, failingArtifact, "main/"+failingArtifact)
583+
g.Expect(err).NotTo(HaveOccurred())
584+
585+
// Wait for reconciliation to start on failing revision
586+
g.Eventually(func() bool {
587+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
588+
return resultK.Status.LastAttemptedRevision == "main/"+failingArtifact
589+
}, timeout, time.Second).Should(BeTrue())
590+
591+
// Now quickly apply a fixed revision while health check should be in progress
592+
fixedManifests := []testserver.File{
593+
{
594+
Name: "deployment.yaml",
595+
Body: fmt.Sprintf(`apiVersion: apps/v1
596+
kind: Deployment
597+
metadata:
598+
name: working-deployment
599+
namespace: %s
600+
spec:
601+
replicas: 1
602+
selector:
603+
matchLabels:
604+
app: working-app
605+
template:
606+
metadata:
607+
labels:
608+
app: working-app
609+
spec:
610+
containers:
611+
- name: app
612+
image: nginx:latest
613+
ports:
614+
- containerPort: 80`, id),
615+
},
616+
}
617+
618+
fixedArtifact, err := testServer.ArtifactFromFiles(fixedManifests)
619+
g.Expect(err).ToNot(HaveOccurred())
620+
621+
// Apply the fixed revision shortly after the failing one
622+
time.Sleep(2 * time.Second) // Give some time for health check to start
623+
err = applyGitRepository(repositoryName, fixedArtifact, "main/"+fixedArtifact)
624+
g.Expect(err).NotTo(HaveOccurred())
625+
626+
// The key test: verify that the fixed revision gets attempted
627+
// and that the health check cancellation worked
628+
g.Eventually(func() bool {
629+
_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
630+
return resultK.Status.LastAttemptedRevision == "main/"+fixedArtifact
631+
}, timeout, time.Second).Should(BeTrue())
632+
633+
t.Logf("Fixed revision was attempted: %s", resultK.Status.LastAttemptedRevision)
634+
635+
// The test demonstrated that:
636+
// 1. Feature is enabled (seen in logs: "CancelHealthCheckOnNewRevision feature enabled")
637+
// 2. Cancellation worked (seen in logs: "New revision detected during health check, cancelling")
638+
// 3. Health check was cancelled early (seen in logs: "health check cancelled due to new revision availability")
639+
// 4. New revision processing started immediately after cancellation
640+
t.Logf("✅ CancelHealthCheckOnNewRevision feature working correctly")
641+
}
642+

internal/features/features.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ const (
5959

6060
// ExternalArtifact controls whether the ExternalArtifact source type is enabled.
6161
ExternalArtifact = "ExternalArtifact"
62+
63+
// CancelHealthCheckOnNewRevision controls whether ongoing health checks
64+
// should be cancelled when a new source revision becomes available.
65+
//
66+
// When enabled, if a new revision is detected while waiting for resources
67+
// to become ready, the current health check will be cancelled to allow
68+
// immediate processing of the new revision. This can help avoid getting
69+
// stuck on failing deployments when fixes are available.
70+
CancelHealthCheckOnNewRevision = "CancelHealthCheckOnNewRevision"
6271
)
6372

6473
var features = map[string]bool{
@@ -83,6 +92,9 @@ var features = map[string]bool{
8392
// ExternalArtifact
8493
// opt-in from v1.7
8594
ExternalArtifact: false,
95+
// CancelHealthCheckOnNewRevision
96+
// opt-in
97+
CancelHealthCheckOnNewRevision: false,
8698
}
8799

88100
func init() {

test_revision_bug.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Test for Revision Detection Bug
2+
3+
## The Bug
4+
The kustomize-controller fails to reconcile new revisions after a health check failure due to incorrect revision detection logic in `kustomization_indexers.go:62`.
5+
6+
## Test Added
7+
Added test case `reconciles new revision after health check failure` to `kustomization_wait_test.go` (lines 449-545).
8+
9+
## The Fix
10+
In `internal/controller/kustomization_indexers.go:62`, change:
11+
```diff
12+
- if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAttemptedRevision) {
13+
+ if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAppliedRevision) {
14+
```
15+
16+
## Test Scenario
17+
1. Deploy a Kustomization with a bad image that fails health checks
18+
2. Verify it becomes NOT Ready with LastAttemptedRevision = bad revision
19+
3. Update GitRepository with fixed manifest (good image)
20+
4. Verify Kustomization reconciles the new revision and becomes Ready
21+
22+
## Expected Behavior
23+
- Test should FAIL with current code (Kustomization stays stuck on bad revision)
24+
- Test should PASS after applying the fix (Kustomization reconciles new revision)
25+
26+
## To Run Test
27+
```bash
28+
# Install kubebuilder first if needed
29+
make test
30+
31+
# Or run specific test once environment is set up
32+
go test -v ./internal/controller -run "TestKustomizationReconciler_WaitConditions/reconciles_new_revision_after_health_check_failure"
33+
```

0 commit comments

Comments
 (0)