diff --git a/docs/deployment_policy.md b/docs/deployment_policy.md index abdc080d..be5dff22 100644 --- a/docs/deployment_policy.md +++ b/docs/deployment_policy.md @@ -11,6 +11,8 @@ A **DeploymentPolicy** is a Kubernetes Custom Resource that separates rollout co - Apply different strategies to different node groups (e.g., production vs. test) - Control rollout speed and safety with configurable thresholds +**Important**: DeploymentPolicy controls **all node updates** in a Skyhook rollout, not just interrupt handling. + --- ## Basic Structure @@ -30,9 +32,7 @@ spec: fixed: # or linear, exponential initialBatch: 1 batchThreshold: 100 - failureThreshold: 3 safetyLimit: 50 - # Compartments define specific node groups compartments: - name: production @@ -141,20 +141,30 @@ All strategies share these parameters: - **`initialBatch`** (≥1): Starting number of nodes (default: 1) - **`batchThreshold`** (1-100): Minimum success percentage to continue (default: 100) -- **`failureThreshold`** (≥1): Max consecutive failures before stopping (default: 3) +- **`failureThreshold`** (≥1, optional): Max consecutive failures before stopping (default: none/unlimited) - **`safetyLimit`** (1-100): Progress threshold for failure handling (default: 50) +### Default Values + +When strategy parameters are not specified, the operator applies these defaults: +- `initialBatch`: 1 +- `batchThreshold`: 100 +- `safetyLimit`: 50 +- `failureThreshold`: **none** (rollout never stops due to consecutive failures) + +**Note**: `failureThreshold` is **nullable**. If omitted, the rollout will continue despite consecutive failures, only respecting batch success thresholds but never stopping the entire rollout. + ### Safety Limit Behavior **Before safetyLimit** (e.g., < 50% progress): -- Failures count toward `failureThreshold` +- Failures count toward `failureThreshold` (if set) - Batch sizes slow down (linear/exponential) -- Reaching `failureThreshold` stops the rollout +- Reaching `failureThreshold` stops the rollout (if set) **After safetyLimit** (e.g., ≥ 50% progress): - Rollout continues despite failures - Batch sizes don't slow down -- Assumes rollout is "safe enough" to complete +- `failureThreshold` is ignored (rollout assumed "safe enough" to complete) **Rationale**: Early failures indicate a problem. Late failures are less critical since most nodes are updated. @@ -283,7 +293,6 @@ spec: fixed: initialBatch: 1 batchThreshold: 100 - failureThreshold: 3 safetyLimit: 50 ``` diff --git a/operator/api/v1alpha1/deployment_policy_types.go b/operator/api/v1alpha1/deployment_policy_types.go index 13fb5dd7..17e59013 100644 --- a/operator/api/v1alpha1/deployment_policy_types.go +++ b/operator/api/v1alpha1/deployment_policy_types.go @@ -181,16 +181,13 @@ func (s *DeploymentStrategy) Default() { } // defaultCommonStrategyFields applies default values to common strategy fields -func defaultCommonStrategyFields(initialBatch, batchThreshold, failureThreshold, safetyLimit **int) { +func defaultCommonStrategyFields(initialBatch, batchThreshold, safetyLimit **int) { if *initialBatch == nil { *initialBatch = ptr.To(1) } if *batchThreshold == nil { *batchThreshold = ptr.To(100) } - if *failureThreshold == nil { - *failureThreshold = ptr.To(3) - } if *safetyLimit == nil { *safetyLimit = ptr.To(50) } @@ -198,12 +195,12 @@ func defaultCommonStrategyFields(initialBatch, batchThreshold, failureThreshold, // Default applies default values to FixedStrategy func (s *FixedStrategy) Default() { - defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit) + defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit) } // Default applies default values to LinearStrategy func (s *LinearStrategy) Default() { - defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit) + defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit) if s.Delta == nil { s.Delta = ptr.To(1) } @@ -211,7 +208,7 @@ func (s *LinearStrategy) Default() { // Default applies default values to ExponentialStrategy func (s *ExponentialStrategy) Default() { - defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit) + defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit) if s.GrowthFactor == nil { s.GrowthFactor = ptr.To(2) } @@ -297,7 +294,8 @@ func (s *DeploymentStrategy) EvaluateBatchResult(state *BatchProcessingState, ba if batchFailed { state.ConsecutiveFailures++ // Check if we should stop processing - if progressPercent < s.getSafetyLimit() && state.ConsecutiveFailures >= s.getFailureThreshold() { + failureThreshold := s.getFailureThreshold() + if failureThreshold != nil && progressPercent < s.getSafetyLimit() && state.ConsecutiveFailures >= *failureThreshold { state.ShouldStop = true } } else { @@ -336,16 +334,17 @@ func (s *DeploymentStrategy) getSafetyLimit() int { } // getFailureThreshold returns the failure threshold from the active strategy -func (s *DeploymentStrategy) getFailureThreshold() int { +// Returns nil if failureThreshold is not set (indicating no limit on consecutive failures) +func (s *DeploymentStrategy) getFailureThreshold() *int { switch { case s.Fixed != nil: - return *s.Fixed.FailureThreshold + return s.Fixed.FailureThreshold case s.Linear != nil: - return *s.Linear.FailureThreshold + return s.Linear.FailureThreshold case s.Exponential != nil: - return *s.Exponential.FailureThreshold + return s.Exponential.FailureThreshold default: - return 3 + return nil } } diff --git a/operator/api/v1alpha1/deployment_policy_webhook_test.go b/operator/api/v1alpha1/deployment_policy_webhook_test.go index 0edbf9ed..4b621f77 100644 --- a/operator/api/v1alpha1/deployment_policy_webhook_test.go +++ b/operator/api/v1alpha1/deployment_policy_webhook_test.go @@ -64,7 +64,7 @@ var _ = Describe("DeploymentPolicy", func() { Expect(deploymentPolicy.Spec.Default.Strategy.Fixed).ToNot(BeNil()) Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.InitialBatch).To(Equal(1)) Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.BatchThreshold).To(Equal(100)) - Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.FailureThreshold).To(Equal(3)) + Expect(deploymentPolicy.Spec.Default.Strategy.Fixed.FailureThreshold).To(BeNil()) Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.SafetyLimit).To(Equal(50)) Expect(deploymentPolicy.Spec.Compartments[0].Strategy).ToNot(BeNil()) @@ -72,7 +72,7 @@ var _ = Describe("DeploymentPolicy", func() { Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.InitialBatch).To(Equal(1)) Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.Delta).To(Equal(1)) Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.BatchThreshold).To(Equal(100)) - Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.FailureThreshold).To(Equal(3)) + Expect(deploymentPolicy.Spec.Compartments[0].Strategy.Linear.FailureThreshold).To(BeNil()) Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.SafetyLimit).To(Equal(50)) Expect(deploymentPolicy.Spec.Compartments[1].Strategy).ToNot(BeNil()) @@ -80,7 +80,7 @@ var _ = Describe("DeploymentPolicy", func() { Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.InitialBatch).To(Equal(1)) Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.GrowthFactor).To(Equal(2)) Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.BatchThreshold).To(Equal(100)) - Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.FailureThreshold).To(Equal(3)) + Expect(deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.FailureThreshold).To(BeNil()) Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.SafetyLimit).To(Equal(50)) }) }) diff --git a/operator/api/v1alpha1/zz_generated.deepcopy.go b/operator/api/v1alpha1/zz_generated.deepcopy.go index a19c6fbe..1dc1be66 100644 --- a/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1,5 +1,3 @@ -//go:build !ignore_autogenerated - /* * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 @@ -18,6 +16,8 @@ * limitations under the License. */ +//go:build !ignore_autogenerated + // Code generated by controller-gen. DO NOT EDIT. package v1alpha1 diff --git a/operator/config/samples/deploymentpolicy_v1alpha1_deploymentpolicy.yaml b/operator/config/samples/deploymentpolicy_v1alpha1_deploymentpolicy.yaml index f85bef8b..b5358560 100644 --- a/operator/config/samples/deploymentpolicy_v1alpha1_deploymentpolicy.yaml +++ b/operator/config/samples/deploymentpolicy_v1alpha1_deploymentpolicy.yaml @@ -18,7 +18,6 @@ spec: fixed: initialBatch: 1 # Start with 1 node batchThreshold: 100 # Require 100% success - failureThreshold: 3 # Stop after 3 consecutive failures safetyLimit: 50 # Only apply failure threshold below 50% progress # Compartments define specific node groups with custom rollout behavior @@ -52,7 +51,6 @@ spec: failureThreshold: 2 safetyLimit: 50 - # Test nodes: Fast exponential rollout - name: test selector: matchLabels: @@ -64,6 +62,4 @@ spec: initialBatch: 2 # Start with 2 nodes growthFactor: 2 batchThreshold: 90 # Allow 10% failures - failureThreshold: 3 safetyLimit: 60 # Higher safety limit (more tolerant) -