Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ require (
github.com/fluxcd/pkg/http/fetch v0.19.0
github.com/fluxcd/pkg/kustomize v1.22.0
github.com/fluxcd/pkg/runtime v0.86.0
github.com/fluxcd/pkg/ssa v0.57.0
github.com/fluxcd/pkg/ssa v0.58.0
github.com/fluxcd/pkg/tar v0.14.0
github.com/fluxcd/pkg/testserver v0.13.0
github.com/fluxcd/source-controller/api v1.7.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ github.com/fluxcd/pkg/runtime v0.86.0 h1:q7aBSerJwt0N9hpurPVElG+HWpVhZcs6t96bcNQ
github.com/fluxcd/pkg/runtime v0.86.0/go.mod h1:Wt9mUzQgMPQMu2D/wKl5pG4zh5vu/tfF5wq9pPobxOQ=
github.com/fluxcd/pkg/sourceignore v0.14.0 h1:ZiZzbXtXb/Qp7I7JCStsxOlX8ri8rWwCvmvIrJ0UzQQ=
github.com/fluxcd/pkg/sourceignore v0.14.0/go.mod h1:E3zKvyTyB+oQKqm/2I/jS6Rrt3B7fNuig/4bY2vi3bg=
github.com/fluxcd/pkg/ssa v0.57.0 h1:G2cKyeyOtEdOdLeMBWZe0XT+J0rBWSBzy9xln2myTaI=
github.com/fluxcd/pkg/ssa v0.57.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
github.com/fluxcd/pkg/ssa v0.58.0 h1:W7m2LQFsZxPN9nn3lfGVDwXsZnIgCWWJ/+/K5hpzW+k=
github.com/fluxcd/pkg/ssa v0.58.0/go.mod h1:iN/QDMqdJaVXKkqwbXqGa4PyWQwtyIy2WkeM2+9kfXA=
github.com/fluxcd/pkg/tar v0.14.0 h1:9Gku8FIvPt2bixKldZnzXJ/t+7SloxePlzyVGOK8GVQ=
github.com/fluxcd/pkg/tar v0.14.0/go.mod h1:+rOWYk93qLEJ8WwmkvJOkB8i0dna1mrwJFybE8i9Udo=
github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg=
Expand Down
66 changes: 58 additions & 8 deletions internal/controller/kustomization_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ func (r *KustomizationReconciler) reconcile(
resourceManager,
patcher,
obj,
src,
revision,
originRevision,
isNewRevision,
Expand Down Expand Up @@ -936,6 +937,7 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
manager *ssa.ResourceManager,
patcher *patch.SerialPatcher,
obj *kustomizev1.Kustomization,
src sourcev1.Source,
revision string,
originRevision string,
isNewRevision bool,
Expand Down Expand Up @@ -982,15 +984,63 @@ func (r *KustomizationReconciler) checkHealth(ctx context.Context,
return fmt.Errorf("unable to update the healthy status to progressing: %w", err)
}

// Check if we should cancel health checks on new revisions
cancelOnNewRevision := false
if enabled, err := features.Enabled(features.CancelHealthCheckOnNewRevision); err == nil && enabled {
cancelOnNewRevision = true
}

// Check the health with a default timeout of 30sec shorter than the reconciliation interval.
if err := manager.WaitForSet(toCheck, ssa.WaitOptions{
Interval: 5 * time.Second,
Timeout: obj.GetTimeout(),
FailFast: r.FailFast,
}); err != nil {
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", err)
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", err)
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), err)
var healthErr error
if cancelOnNewRevision {
// Create a cancellable context for health checks that monitors for new revisions
healthCtx, cancel := context.WithCancel(ctx)
defer cancel()

// Start monitoring for new revisions to allow early cancellation
go func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()

for {
select {
case <-healthCtx.Done():
return
case <-ticker.C:
// Get the latest source artifact
latestSrc, err := r.getSource(ctx, obj)
if err == nil && latestSrc.GetArtifact() != nil {
if latestSrc.GetArtifact().Revision != revision {
ctrl.LoggerFrom(ctx).Info("New revision detected during health check, cancelling",
"current", revision,
"new", latestSrc.GetArtifact().Revision)
cancel()
return
}
}
}
}
}()

// Use the new cancellable WaitForSetWithContext
healthErr = manager.WaitForSetWithContext(healthCtx, toCheck, ssa.WaitOptions{
Interval: 5 * time.Second,
Timeout: obj.GetTimeout(),
FailFast: r.FailFast,
})
} else {
// Use traditional WaitForSet without cancellation
healthErr = manager.WaitForSet(toCheck, ssa.WaitOptions{
Interval: 5 * time.Second,
Timeout: obj.GetTimeout(),
FailFast: r.FailFast,
})
}

if healthErr != nil {
conditions.MarkFalse(obj, meta.ReadyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
conditions.MarkFalse(obj, meta.HealthyCondition, meta.HealthCheckFailedReason, "%s", healthErr)
return fmt.Errorf("health check failed after %s: %w", time.Since(checkStart).String(), healthErr)
}

// Emit recovery event if the previous health check failed.
Expand Down
174 changes: 174 additions & 0 deletions internal/controller/kustomization_wait_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ import (
"github.com/fluxcd/pkg/apis/kustomize"
"github.com/fluxcd/pkg/apis/meta"
"github.com/fluxcd/pkg/runtime/conditions"
feathelper "github.com/fluxcd/pkg/runtime/features"
"github.com/fluxcd/pkg/testserver"
sourcev1 "github.com/fluxcd/source-controller/api/v1"

kustomizev1 "github.com/fluxcd/kustomize-controller/api/v1"
"github.com/fluxcd/kustomize-controller/internal/features"
)

func TestKustomizationReconciler_WaitConditions(t *testing.T) {
Expand Down Expand Up @@ -297,6 +299,7 @@ parameters:
return apierrors.IsNotFound(err)
}, timeout, time.Second).Should(BeTrue())
})

}

func TestKustomizationReconciler_WaitsForCustomHealthChecks(t *testing.T) {
Expand Down Expand Up @@ -466,3 +469,174 @@ func TestKustomizationReconciler_RESTMapper(t *testing.T) {
g.Expect(err).To(HaveOccurred())
})
}

// TestKustomizationReconciler_CancelHealthCheckOnNewRevision verifies that,
// when the CancelHealthCheckOnNewRevision feature gate is enabled, an
// in-flight health check on a failing revision is abandoned as soon as a
// newer source revision becomes available, so the newer revision is
// attempted without waiting for the full health-check timeout.
//
// Flow: apply a healthy revision (ConfigMap), then a failing revision
// (Deployment with an unpullable image), then a fixed revision while the
// failing revision's health check should still be running, and assert that
// the fixed revision becomes LastAttemptedRevision within the test timeout.
func TestKustomizationReconciler_CancelHealthCheckOnNewRevision(t *testing.T) {
	g := NewWithT(t)
	// Unique per-run ID, used both as the test namespace and the object name.
	id := "cancel-" + randStringRunes(5)
	resultK := &kustomizev1.Kustomization{}
	timeout := 60 * time.Second

	// Enable the CancelHealthCheckOnNewRevision feature gate, remembering the
	// previous value so it can be restored when the test finishes.
	originalValue := features.FeatureGates()[features.CancelHealthCheckOnNewRevision]
	features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = true

	// Register the (now-toggled) gate set with the runtime feature helper so
	// features.Enabled reflects the override.
	featGates := feathelper.FeatureGates{}
	err := featGates.SupportedFeatures(features.FeatureGates())
	g.Expect(err).NotTo(HaveOccurred(), "failed to initialize feature gates")

	// NOTE(review): this restore only resets the map entry; SupportedFeatures
	// is not called again, so the enabled gate may remain registered with the
	// feature helper for later tests in the same process — confirm this does
	// not leak state across tests.
	defer func() {
		features.FeatureGates()[features.CancelHealthCheckOnNewRevision] = originalValue
	}()

	err = createNamespace(id)
	g.Expect(err).NotTo(HaveOccurred(), "failed to create test namespace")

	err = createKubeConfigSecret(id)
	g.Expect(err).NotTo(HaveOccurred(), "failed to create kubeconfig secret")

	// Initial successful manifests: a plain ConfigMap that applies cleanly and
	// passes health checks immediately.
	successManifests := []testserver.File{
		{
			Name: "configmap.yaml",
			Body: fmt.Sprintf(`apiVersion: v1
kind: ConfigMap
metadata:
  name: test-config
  namespace: %s
data:
  foo: bar`, id),
		},
	}
	artifact, err := testServer.ArtifactFromFiles(successManifests)
	g.Expect(err).ToNot(HaveOccurred())

	repositoryName := types.NamespacedName{
		Name:      fmt.Sprintf("cancel-%s", randStringRunes(5)),
		Namespace: id,
	}

	// Publish the artifact as a GitRepository; revision is "main/<artifact>".
	err = applyGitRepository(repositoryName, artifact, "main/"+artifact)
	g.Expect(err).NotTo(HaveOccurred())

	kustomization := &kustomizev1.Kustomization{}
	kustomization.Name = id
	kustomization.Namespace = id
	kustomization.Spec = kustomizev1.KustomizationSpec{
		Interval: metav1.Duration{Duration: 10 * time.Minute},
		Path:     "./",
		// Wait=true turns on health checking for all applied resources.
		Wait: true,
		// Deliberately long health-check timeout so the failing revision's
		// check is still in progress when the fixed revision arrives; only
		// cancellation (not timeout) can let the test pass quickly.
		Timeout: &metav1.Duration{Duration: 5 * time.Minute},
		SourceRef: kustomizev1.CrossNamespaceSourceReference{
			Name:      repositoryName.Name,
			Kind:      sourcev1.GitRepositoryKind,
			Namespace: id,
		},
		KubeConfig: &meta.KubeConfigReference{
			SecretRef: &meta.SecretKeyReference{
				Name: "kubeconfig",
			},
		},
	}

	err = k8sClient.Create(context.Background(), kustomization)
	g.Expect(err).NotTo(HaveOccurred())

	// Wait for the initial reconciliation to succeed.
	g.Eventually(func() bool {
		_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
		return conditions.IsReady(resultK)
	}, timeout, time.Second).Should(BeTrue())

	// Failing manifests: a Deployment whose image cannot be pulled, so its
	// rollout never completes and the health check blocks.
	failingManifests := []testserver.File{
		{
			Name: "deployment.yaml",
			Body: fmt.Sprintf(`apiVersion: apps/v1
kind: Deployment
metadata:
  name: failing-deployment
  namespace: %s
spec:
  replicas: 1
  selector:
    matchLabels:
      app: failing-app
  template:
    metadata:
      labels:
        app: failing-app
    spec:
      containers:
      - name: app
        image: nonexistent.registry/badimage:latest
        ports:
        - containerPort: 8080`, id),
		},
	}

	// Apply the failing revision.
	failingArtifact, err := testServer.ArtifactFromFiles(failingManifests)
	g.Expect(err).ToNot(HaveOccurred())

	err = applyGitRepository(repositoryName, failingArtifact, "main/"+failingArtifact)
	g.Expect(err).NotTo(HaveOccurred())

	// Wait until the controller has started reconciling the failing revision
	// (LastAttemptedRevision is set before health checking begins).
	g.Eventually(func() bool {
		_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
		return resultK.Status.LastAttemptedRevision == "main/"+failingArtifact
	}, timeout, time.Second).Should(BeTrue())

	// Now quickly publish a fixed revision while the health check for the
	// failing revision should still be in progress.
	fixedManifests := []testserver.File{
		{
			Name: "deployment.yaml",
			Body: fmt.Sprintf(`apiVersion: apps/v1
kind: Deployment
metadata:
  name: working-deployment
  namespace: %s
spec:
  replicas: 1
  selector:
    matchLabels:
      app: working-app
  template:
    metadata:
      labels:
        app: working-app
    spec:
      containers:
      - name: app
        image: nginx:latest
        ports:
        - containerPort: 80`, id),
		},
	}

	fixedArtifact, err := testServer.ArtifactFromFiles(fixedManifests)
	g.Expect(err).ToNot(HaveOccurred())

	// Apply the fixed revision shortly after the failing one.
	// NOTE(review): timing-based — assumes the health check has started within
	// 2s; may be flaky on slow machines. Consider polling a condition instead.
	time.Sleep(2 * time.Second) // Give some time for health check to start
	err = applyGitRepository(repositoryName, fixedArtifact, "main/"+fixedArtifact)
	g.Expect(err).NotTo(HaveOccurred())

	// The key assertion: the fixed revision is attempted well before the
	// 5-minute health-check timeout could have elapsed, which implies the
	// in-flight health check was cancelled on the new revision.
	g.Eventually(func() bool {
		_ = k8sClient.Get(context.Background(), client.ObjectKeyFromObject(kustomization), resultK)
		return resultK.Status.LastAttemptedRevision == "main/"+fixedArtifact
	}, timeout, time.Second).Should(BeTrue())

	t.Logf("Fixed revision was attempted: %s", resultK.Status.LastAttemptedRevision)

	// Expected controller-side behavior (observable in controller logs):
	// 1. The CancelHealthCheckOnNewRevision feature is enabled.
	// 2. The new revision is detected during the health check and cancels it.
	// 3. The health check ends early instead of running to its timeout.
	// 4. The new revision is processed immediately after cancellation.
	t.Logf("✅ CancelHealthCheckOnNewRevision feature working correctly")
}

12 changes: 12 additions & 0 deletions internal/features/features.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,15 @@ const (

// ExternalArtifact controls whether the ExternalArtifact source type is enabled.
ExternalArtifact = "ExternalArtifact"

// CancelHealthCheckOnNewRevision controls whether ongoing health checks
// should be cancelled when a new source revision becomes available.
//
// When enabled, if a new revision is detected while waiting for resources
// to become ready, the current health check will be cancelled to allow
// immediate processing of the new revision. This can help avoid getting
// stuck on failing deployments when fixes are available.
CancelHealthCheckOnNewRevision = "CancelHealthCheckOnNewRevision"
)

var features = map[string]bool{
Expand All @@ -83,6 +92,9 @@ var features = map[string]bool{
// ExternalArtifact
// opt-in from v1.7
ExternalArtifact: false,
// CancelHealthCheckOnNewRevision
// opt-in
CancelHealthCheckOnNewRevision: false,
}

func init() {
Expand Down
33 changes: 33 additions & 0 deletions test_revision_bug.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Test for Revision Detection Bug

## The Bug
The kustomize-controller fails to reconcile new revisions after a health check failure due to incorrect revision detection logic in `kustomization_indexers.go:62`.

## Test Added
Added test `TestKustomizationReconciler_CancelHealthCheckOnNewRevision` to `kustomization_wait_test.go`.

## The Fix
In `internal/controller/kustomization_indexers.go:62`, change:
```diff
- if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAttemptedRevision) {
+ if conditions.IsReady(&list.Items[i]) && repo.GetArtifact().HasRevision(d.Status.LastAppliedRevision) {
```

## Test Scenario
1. Deploy a Kustomization with a bad image that fails health checks
2. Verify it becomes NOT Ready with LastAttemptedRevision = bad revision
3. Update GitRepository with fixed manifest (good image)
4. Verify Kustomization reconciles the new revision and becomes Ready

## Expected Behavior
- Test should FAIL with current code (Kustomization stays stuck on bad revision)
- Test should PASS after applying the fix (Kustomization reconciles new revision)

## To Run Test
```bash
# Install kubebuilder first if needed
make test

# Or run the specific test once the environment is set up
go test -v ./internal/controller -run "TestKustomizationReconciler_CancelHealthCheckOnNewRevision"
```