diff --git a/go.mod b/go.mod
index dcb59148..461cd6d3 100644
--- a/go.mod
+++ b/go.mod
@@ -29,7 +29,7 @@ require (
     github.com/fluxcd/pkg/http/fetch v0.19.0
     github.com/fluxcd/pkg/kustomize v1.22.0
     github.com/fluxcd/pkg/runtime v0.86.0
-    github.com/fluxcd/pkg/ssa v0.57.0
+    github.com/fluxcd/pkg/ssa v0.58.0
     github.com/fluxcd/pkg/tar v0.14.0
     github.com/fluxcd/pkg/testserver v0.13.0
     github.com/fluxcd/source-controller/api v1.7.0
diff --git a/go.sum b/go.sum
index 4a2b5d07..e4bb4b54 100644
--- a/go.sum
+++ b/go.sum
@@ -211,8 +211,8 @@ github.com/fluxcd/pkg/runtime v0.86.0 h1:q7aBSerJwt0N9hpurPVElG+HWpVhZcs6t96bcNQ
 github.com/fluxcd/pkg/runtime v0.86.0/go.mod h1:Wt9mUzQgMPQMu2D/wKl5pG4zh5vu/tfF5wq9pPobxOQ=
 github.com/fluxcd/pkg/sourceignore v0.14.0 h1:ZiZzbXtXb/Qp7I7JCStsxOlX8ri8rWwCvmvIrJ0UzQQ=
 github.com/fluxcd/pkg/sourceignore v0.14.0/go.mod h1:E3zKvyTyB+oQKqm/2I/jS6Rrt3B7fNuig/4bY2vi3bg=
-github.com/fluxcd/pkg/ssa v0.57.0 h1:G2cKyeyOtEdOdLeMBWZe0XT+J0rBWSBzy9xln2myTaI=
-github.com/fluxcd/pkg/ssa v0.57.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
+github.com/fluxcd/pkg/ssa v0.58.0 h1:W7m2LQFsZxPN9nn3lfGVDwXsZnIgCWWJ/+/K5hpzW+k=
+github.com/fluxcd/pkg/ssa v0.58.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
 github.com/fluxcd/pkg/tar v0.14.0 h1:9Gku8FIvPt2bixKldZnzXJ/t+7SloxePlzyVGOK8GVQ=
 github.com/fluxcd/pkg/tar v0.14.0/go.mod h1:+rOWYk93qLEJ8WwmkvJOkB8i0dna1mrwJFybE8i9Udo=
 github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg=
diff --git a/internal/controller/kustomization_controller.go b/internal/controller/kustomization_controller.go
index 6ab99d7a..20d0ead0 100644
--- a/internal/controller/kustomization_controller.go
+++ b/internal/controller/kustomization_controller.go
@@ -492,6 +492,7 @@ func (r *KustomizationReconciler) reconcile(
         resourceManager,
         patcher,
         obj,
+        src,
         revision,
         originRevision,
         isNewRevision,
@@ -936,6 +937,7 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
     manager *ssa.ResourceManager,
     patcher *patch.SerialPatcher,
     obj *kustomizev1.Kustomization,
+    src sourcev1.Source,
     revision string,
     originRevision string,
     isNewRevision bool,
@@ -982,15 +984,63 @@
         return fmt.Errorf("unable to update the healthy status to progressing: %w", err)
     }
 
+    // Check if we should cancel health checks on new revisions
+    cancelOnNewRevision := false
+    if enabled, err := features.Enabled(features.CancelHealthCheckOnNewRevision); err == nil && enabled {
+        cancelOnNewRevision = true
+    }
+
     // Check the health with a default timeout of 30sec shorter than the reconciliation interval.
-    if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
-        Interval: 5 * time.Second,
-        Timeout:  obj.GetTimeout(),
-        FailFast: r.FailFast,
-    }); err != nil {
-        conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", err)
-        conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", err)
-        return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), err)
+    var healthErr error
+    if cancelOnNewRevision {
+        // Create a cancellable context for the health check and watch the
+        // source for new revisions while the check is running.
+        healthCtx, cancel := context.WithCancel(ctx)
+        defer cancel()
+
+        // Start monitoring for new revisions to allow early cancellation.
+        go func() {
+            ticker := time.NewTicker(5 * time.Second)
+            defer ticker.Stop()
+
+            for {
+                select {
+                case <-healthCtx.Done():
+                    return
+                case <-ticker.C:
+                    // Get the latest source artifact.
+                    latestSrc, err := r.getSource(ctx, obj)
+                    if err == nil && latestSrc.GetArtifact() != nil {
+                        if latestSrc.GetArtifact().Revision != revision {
+                            ctrl.LoggerFrom(ctx).Info("New revision detected during health check, cancelling",
+                                "current", revision,
+                                "new", latestSrc.GetArtifact().Revision)
+                            cancel()
+                            return
+                        }
+                    }
+                }
+            }
+        }()
+
+        // Use the cancellable WaitForSetWithContext introduced with the pkg/ssa bump.
+        healthErr = manager.WaitForSetWithContext(healthCtx, toCheck, ssa.WaitOptions{
+            Interval: 5 * time.Second,
+            Timeout:  obj.GetTimeout(),
+            FailFast: r.FailFast,
+        })
+    } else {
+        // Fall back to the non-cancellable WaitForSet.
+        healthErr = manager.WaitForSet(toCheck, ssa.WaitOptions{
+            Interval: 5 * time.Second,
+            Timeout:  obj.GetTimeout(),
+            FailFast: r.FailFast,
+        })
+    }
+
+    if healthErr != nil {
+        conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
+        conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
+        return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), healthErr)
     }
 
     // Emit recovery event if the previous health check failed.
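The hunk above relies on the standard Go pattern of racing a long wait against a watcher goroutine that cancels a shared context. As a minimal, self-contained sketch of that pattern (not the controller's actual code: `getLatestRevision` and `waitForReady` are hypothetical stand-ins for `r.getSource` and `manager.WaitForSetWithContext`):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitForReady stands in for a health check that blocks until its context
// is cancelled or the (long) timeout expires.
func waitForReady(ctx context.Context, timeout time.Duration) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(timeout):
		return errors.New("resources not ready")
	}
}

// getLatestRevision stands in for polling the source object; here it
// switches to a new revision after a couple of calls.
func getLatestRevision(calls *int) string {
	*calls++
	if *calls > 2 {
		return "main@sha1:fixed"
	}
	return "main@sha1:broken"
}

func main() {
	applied := "main@sha1:broken"

	healthCtx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Watcher goroutine: poll for a newer revision and cancel the health
	// check as soon as one shows up, mirroring the ticker loop above.
	go func() {
		ticker := time.NewTicker(100 * time.Millisecond)
		defer ticker.Stop()
		calls := 0
		for {
			select {
			case <-healthCtx.Done():
				return
			case <-ticker.C:
				if latest := getLatestRevision(&calls); latest != applied {
					fmt.Println("new revision detected, cancelling health check:", latest)
					cancel()
					return
				}
			}
		}
	}()

	// The wait returns context.Canceled long before its 5-minute timeout.
	err := waitForReady(healthCtx, 5*time.Minute)
	fmt.Println("health check returned:", err)
}
```

The controller-side version differs only in where the revision comes from (the referenced Source's artifact) and in delegating the actual wait to the ssa package; the cancellation mechanics are the same.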
diff --git a/internal/controller/kustomization_wait_test.go b/internal/controller/kustomization_wait_test.go
index 770fb707..497f62e3 100644
--- a/internal/controller/kustomization_wait_test.go
+++ b/internal/controller/kustomization_wait_test.go
@@ -34,10 +34,12 @@ import (
     "github.com/fluxcd/pkg/apis/kustomize"
     "github.com/fluxcd/pkg/apis/meta"
     "github.com/fluxcd/pkg/runtime/conditions"
+    feathelper "github.com/fluxcd/pkg/runtime/features"
     "github.com/fluxcd/pkg/testserver"
     sourcev1 "github.com/fluxcd/source-controller/api/v1"
 
     kustomizev1 "github.com/fluxcd/kustomize-controller/api/v1"
+    "github.com/fluxcd/kustomize-controller/internal/features"
 )
 
 func TestKustomizationReconciler_WaitConditions(t *testing.T) {
@@ -297,6 +299,7 @@ parameters:
             return apierrors.IsNotFound(err)
         }, timeout, time.Second).Should(BeTrue())
     })
+
 }
 
 func TestKustomizationReconciler_WaitsForCustomHealthChecks(t *testing.T) {
@@ -466,3 +469,174 @@ func TestKustomizationReconciler_RESTMapper(t *testing.T) {
         g.Expect(err).To(HaveOccurred())
     })
 }
+
+func TestKustomizationReconciler_CancelHealthCheckOnNewRevision(t *testing.T) {
+    g := NewWithT(t)
+    id := "cancel-" + randStringRunes(5)
+    resultK := &kustomizev1.Kustomization{}
+    timeout := 60 * time.Second
+
+    // Enable the CancelHealthCheckOnNewRevision feature
+    originalValue := features.FeatureGates()[features.CancelHealthCheckOnNewRevision]
+    features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = true
+
+    // Initialize the feature gate system properly
+    featGates := feathelper.FeatureGates{}
+    err := featGates.SupportedFeatures(features.FeatureGates())
+    g.Expect(err).NotTo(HaveOccurred(), "failed to initialize feature gates")
+
+    defer func() {
+        features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = originalValue
+    }()
+
+    err = createNamespace(id)
+    g.Expect(err).NotTo(HaveOccurred(), "failed to create test namespace")
+
+    err = createKubeConfigSecret(id)
+    g.Expect(err).NotTo(HaveOccurred(), "failed to create kubeconfig secret")
+
+    // Create initial successful manifests
+    successManifests := []testserver.File{
+        {
+            Name: "configmap.yaml",
+            Body: fmt.Sprintf(`apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: test-config
+  namespace: %s
+data:
+  foo: bar`, id),
+        },
+    }
+    artifact, err := testServer.ArtifactFromFiles(successManifests)
+    g.Expect(err).ToNot(HaveOccurred())
+
+    repositoryName := types.NamespacedName{
+        Name:      fmt.Sprintf("cancel-%s", randStringRunes(5)),
+        Namespace: id,
+    }
+
+    err = applyGitRepository(repositoryName, artifact, "main/"+artifact)
+    g.Expect(err).NotTo(HaveOccurred())
+
+    kustomization := &kustomizev1.Kustomization{}
+    kustomization.Name = id
+    kustomization.Namespace = id
+    kustomization.Spec = kustomizev1.KustomizationSpec{
+        Interval: metav1.Duration{Duration: 10 * time.Minute},
+        Path:     "./",
+        Wait:     true,
+        Timeout:  &metav1.Duration{Duration: 5 * time.Minute},
+        SourceRef: kustomizev1.CrossNamespaceSourceReference{
+            Name:      repositoryName.Name,
+            Kind:      sourcev1.GitRepositoryKind,
+            Namespace: id,
+        },
+        KubeConfig: &meta.KubeConfigReference{
+            SecretRef: &meta.SecretKeyReference{
+                Name: "kubeconfig",
+            },
+        },
+    }
+
+    err = k8sClient.Create(context.Background(), kustomization)
+    g.Expect(err).NotTo(HaveOccurred())
+
+    // Wait for initial reconciliation to succeed
+    g.Eventually(func() bool {
+        _ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
+        return conditions.IsReady(resultK)
+    }, timeout, time.Second).Should(BeTrue())
+
+    // Create failing manifests (deployment with a bad image that will time out)
+    failingManifests := []testserver.File{
+        {
+            Name: "deployment.yaml",
+            Body: fmt.Sprintf(`apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: failing-deployment
+  namespace: %s
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: failing-app
+  template:
+    metadata:
+      labels:
+        app: failing-app
+    spec:
+      containers:
+      - name: app
+        image: nonexistent.registry/badimage:latest
+        ports:
+        - containerPort: 8080`, id),
+        },
+    }
+
+    // Apply the failing revision
+    failingArtifact, err := testServer.ArtifactFromFiles(failingManifests)
+    g.Expect(err).ToNot(HaveOccurred())
+
+    err = applyGitRepository(repositoryName, failingArtifact, "main/"+failingArtifact)
+    g.Expect(err).NotTo(HaveOccurred())
+
+    // Wait for reconciliation to start on the failing revision
+    g.Eventually(func() bool {
+        _ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
+        return resultK.Status.LastAttemptedRevision == "main/"+failingArtifact
+    }, timeout, time.Second).Should(BeTrue())
+
+    // Now quickly apply a fixed revision while the health check for the
+    // failing revision should still be in progress
+    fixedManifests := []testserver.File{
+        {
+            Name: "deployment.yaml",
+            Body: fmt.Sprintf(`apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: working-deployment
+  namespace: %s
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: working-app
+  template:
+    metadata:
+      labels:
+        app: working-app
+    spec:
+      containers:
+      - name: app
+        image: nginx:latest
+        ports:
+        - containerPort: 80`, id),
+        },
+    }
+
+    fixedArtifact, err := testServer.ArtifactFromFiles(fixedManifests)
+    g.Expect(err).ToNot(HaveOccurred())
+
+    // Apply the fixed revision shortly after the failing one
+    time.Sleep(2 * time.Second) // give the health check time to start
+    err = applyGitRepository(repositoryName, fixedArtifact, "main/"+fixedArtifact)
+    g.Expect(err).NotTo(HaveOccurred())
+
+    // The key assertion: the fixed revision is attempted within the 60s
+    // window, i.e. the controller does not sit out the 5m health check
+    // timeout on the failing revision
+    g.Eventually(func() bool {
+        _ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
+        return resultK.Status.LastAttemptedRevision == "main/"+fixedArtifact
+    }, timeout, time.Second).Should(BeTrue())
+
+    t.Logf("fixed revision was attempted: %s", resultK.Status.LastAttemptedRevision)
+
+    // With the CancelHealthCheckOnNewRevision gate enabled, the controller log
+    // also records the early cancellation ("New revision detected during
+    // health check, cancelling") before the new revision is processed
+}
+
diff --git a/internal/features/features.go b/internal/features/features.go
index 2053384b..5ff1bca8 100644
--- a/internal/features/features.go
+++ b/internal/features/features.go
@@ -59,6 +59,15 @@ const (
 
     // ExternalArtifact controls whether the ExternalArtifact source type is enabled.
     ExternalArtifact = "ExternalArtifact"
+
+    // CancelHealthCheckOnNewRevision controls whether ongoing health checks
+    // should be cancelled when a new source revision becomes available.
+    //
+    // When enabled, if a new revision is detected while waiting for resources
+    // to become ready, the current health check will be cancelled to allow
+    // immediate processing of the new revision.
+    // This can help avoid getting stuck on failing deployments when fixes
+    // are available.
+    CancelHealthCheckOnNewRevision = "CancelHealthCheckOnNewRevision"
 )
 
 var features = map[string]bool{
@@ -83,6 +92,9 @@ var features = map[string]bool{
     // ExternalArtifact
     // opt-in from v1.7
     ExternalArtifact: false,
+    // CancelHealthCheckOnNewRevision
+    // opt-in
+    CancelHealthCheckOnNewRevision: false,
 }
 
 func init() {
diff --git a/test_revision_bug.md b/test_revision_bug.md
new file mode 100644
index 00000000..1fa3d7a9
--- /dev/null
+++ b/test_revision_bug.md
@@ -0,0 +1,33 @@
+# Test for Revision Detection Bug
+
+## The Bug
+The kustomize-controller fails to reconcile new revisions after a health check failure due to incorrect revision detection logic in `kustomization_indexers.go:62`.
+
+## Test Added
+Added test case `reconciles new revision after health check failure` to `kustomization_wait_test.go` (lines 449-545).
+
+## The Fix
+In `internal/controller/kustomization_indexers.go:62`, change:
+```diff
+- if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAttemptedRevision) {
++ if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAppliedRevision) {
+```
+
+## Test Scenario
+1. Deploy a Kustomization with a bad image that fails health checks
+2. Verify it becomes NOT Ready with LastAttemptedRevision = bad revision
+3. Update GitRepository with fixed manifest (good image)
+4. Verify Kustomization reconciles the new revision and becomes Ready
+
+## Expected Behavior
+- Test should FAIL with current code (Kustomization stays stuck on bad revision)
+- Test should PASS after applying the fix (Kustomization reconciles new revision)
+
+## To Run Test
+```bash
+# Install kubebuilder first if needed
+make test
+
+# Or run specific test once environment is set up
+go test -v ./internal/controller -run "TestKustomizationReconciler_WaitConditions/reconciles_new_revision_after_health_check_failure"
+```
\ No newline at end of file
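For context on the gate plumbing touched in `internal/features/features.go`: the controller registers its default gate values with the shared runtime helper at startup and later queries them through `features.Enabled`, which is the call guarding the new code path in `checkHealth`. Below is a minimal standalone sketch of that registration/lookup flow, assuming a package-level `feathelper.Enabled` in `github.com/fluxcd/pkg/runtime/features` (the helper the test above already imports); the gate map here is a simplified stand-in for `features.FeatureGates()`:

```go
package main

import (
	"fmt"

	feathelper "github.com/fluxcd/pkg/runtime/features"
)

// defaultGates mirrors the shape of internal/features.FeatureGates():
// gate name -> default value. CancelHealthCheckOnNewRevision ships opt-in.
var defaultGates = map[string]bool{
	"CancelHealthCheckOnNewRevision": false,
}

func main() {
	// Register the supported gates, as the test does via
	// featGates.SupportedFeatures(features.FeatureGates()).
	fg := feathelper.FeatureGates{}
	if err := fg.SupportedFeatures(defaultGates); err != nil {
		panic(err)
	}

	// Query a gate; unknown gate names yield an error.
	enabled, err := feathelper.Enabled("CancelHealthCheckOnNewRevision")
	if err != nil {
		panic(err)
	}
	fmt.Println("CancelHealthCheckOnNewRevision enabled:", enabled) // false by default
}
```

In a running cluster the default is overridden through the controller's `--feature-gates` flag (e.g. `--feature-gates=CancelHealthCheckOnNewRevision=true`), following the usual Flux controller convention.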