Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions docs/deployment_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ A **DeploymentPolicy** is a Kubernetes Custom Resource that separates rollout co
- Apply different strategies to different node groups (e.g., production vs. test)
- Control rollout speed and safety with configurable thresholds

**Important**: DeploymentPolicy controls **all node updates** in a Skyhook rollout, not just interrupt handling.

---

## Basic Structure
Expand All @@ -30,9 +32,7 @@ spec:
fixed: # or linear, exponential
initialBatch: 1
batchThreshold: 100
failureThreshold: 3
safetyLimit: 50

# Compartments define specific node groups
compartments:
- name: production
Expand Down Expand Up @@ -141,20 +141,30 @@ All strategies share these parameters:

- **`initialBatch`** (≥1): Starting number of nodes (default: 1)
- **`batchThreshold`** (1-100): Minimum success percentage to continue (default: 100)
- **`failureThreshold`** (≥1): Max consecutive failures before stopping (default: 3)
- **`failureThreshold`** (≥1, optional): Max consecutive failures before stopping (default: none/unlimited)
- **`safetyLimit`** (1-100): Progress threshold for failure handling (default: 50)

### Default Values

When strategy parameters are not specified, the operator applies these defaults:
- `initialBatch`: 1
- `batchThreshold`: 100
- `safetyLimit`: 50
- `failureThreshold`: **none** (rollout never stops due to consecutive failures)

**Note**: `failureThreshold` is **nullable**. If omitted, the rollout will continue despite consecutive failures, only respecting batch success thresholds but never stopping the entire rollout.

### Safety Limit Behavior

**Before safetyLimit** (e.g., < 50% progress):
- Failures count toward `failureThreshold`
- Failures count toward `failureThreshold` (if set)
- Batch sizes slow down (linear/exponential)
- Reaching `failureThreshold` stops the rollout
- Reaching `failureThreshold` stops the rollout (if set)

**After safetyLimit** (e.g., ≥ 50% progress):
- Rollout continues despite failures
- Batch sizes don't slow down
- Assumes rollout is "safe enough" to complete
- `failureThreshold` is ignored (rollout assumed "safe enough" to complete)

**Rationale**: Early failures indicate a problem. Late failures are less critical since most nodes are updated.

Expand Down Expand Up @@ -283,7 +293,6 @@ spec:
fixed:
initialBatch: 1
batchThreshold: 100
failureThreshold: 3
safetyLimit: 50
```

Expand Down
25 changes: 12 additions & 13 deletions operator/api/v1alpha1/deployment_policy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,37 +181,34 @@ func (s *DeploymentStrategy) Default() {
}

// defaultCommonStrategyFields applies default values to common strategy fields
func defaultCommonStrategyFields(initialBatch, batchThreshold, failureThreshold, safetyLimit **int) {
func defaultCommonStrategyFields(initialBatch, batchThreshold, safetyLimit **int) {
if *initialBatch == nil {
*initialBatch = ptr.To(1)
}
if *batchThreshold == nil {
*batchThreshold = ptr.To(100)
}
if *failureThreshold == nil {
*failureThreshold = ptr.To(3)
}
if *safetyLimit == nil {
*safetyLimit = ptr.To(50)
}
}

// Default applies default values to FixedStrategy
func (s *FixedStrategy) Default() {
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit)
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit)
}

// Default applies default values to LinearStrategy
func (s *LinearStrategy) Default() {
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit)
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit)
if s.Delta == nil {
s.Delta = ptr.To(1)
}
}

// Default applies default values to ExponentialStrategy
func (s *ExponentialStrategy) Default() {
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.FailureThreshold, &s.SafetyLimit)
defaultCommonStrategyFields(&s.InitialBatch, &s.BatchThreshold, &s.SafetyLimit)
if s.GrowthFactor == nil {
s.GrowthFactor = ptr.To(2)
}
Expand Down Expand Up @@ -297,7 +294,8 @@ func (s *DeploymentStrategy) EvaluateBatchResult(state *BatchProcessingState, ba
if batchFailed {
state.ConsecutiveFailures++
// Check if we should stop processing
if progressPercent < s.getSafetyLimit() && state.ConsecutiveFailures >= s.getFailureThreshold() {
failureThreshold := s.getFailureThreshold()
if failureThreshold != nil && progressPercent < s.getSafetyLimit() && state.ConsecutiveFailures >= *failureThreshold {
state.ShouldStop = true
}
} else {
Expand Down Expand Up @@ -336,16 +334,17 @@ func (s *DeploymentStrategy) getSafetyLimit() int {
}

// getFailureThreshold returns the failure threshold from the active strategy
func (s *DeploymentStrategy) getFailureThreshold() int {
// Returns nil if failureThreshold is not set (indicating no limit on consecutive failures)
func (s *DeploymentStrategy) getFailureThreshold() *int {
switch {
case s.Fixed != nil:
return *s.Fixed.FailureThreshold
return s.Fixed.FailureThreshold
case s.Linear != nil:
return *s.Linear.FailureThreshold
return s.Linear.FailureThreshold
case s.Exponential != nil:
return *s.Exponential.FailureThreshold
return s.Exponential.FailureThreshold
default:
return 3
return nil
}
}

Expand Down
6 changes: 3 additions & 3 deletions operator/api/v1alpha1/deployment_policy_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,23 +64,23 @@ var _ = Describe("DeploymentPolicy", func() {
Expect(deploymentPolicy.Spec.Default.Strategy.Fixed).ToNot(BeNil())
Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.InitialBatch).To(Equal(1))
Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.BatchThreshold).To(Equal(100))
Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.FailureThreshold).To(Equal(3))
Expect(deploymentPolicy.Spec.Default.Strategy.Fixed.FailureThreshold).To(BeNil())
Expect(*deploymentPolicy.Spec.Default.Strategy.Fixed.SafetyLimit).To(Equal(50))

Expect(deploymentPolicy.Spec.Compartments[0].Strategy).ToNot(BeNil())
Expect(deploymentPolicy.Spec.Compartments[0].Strategy.Linear).ToNot(BeNil())
Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.InitialBatch).To(Equal(1))
Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.Delta).To(Equal(1))
Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.BatchThreshold).To(Equal(100))
Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.FailureThreshold).To(Equal(3))
Expect(deploymentPolicy.Spec.Compartments[0].Strategy.Linear.FailureThreshold).To(BeNil())
Expect(*deploymentPolicy.Spec.Compartments[0].Strategy.Linear.SafetyLimit).To(Equal(50))

Expect(deploymentPolicy.Spec.Compartments[1].Strategy).ToNot(BeNil())
Expect(deploymentPolicy.Spec.Compartments[1].Strategy.Exponential).ToNot(BeNil())
Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.InitialBatch).To(Equal(1))
Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.GrowthFactor).To(Equal(2))
Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.BatchThreshold).To(Equal(100))
Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.FailureThreshold).To(Equal(3))
Expect(deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.FailureThreshold).To(BeNil())
Expect(*deploymentPolicy.Spec.Compartments[1].Strategy.Exponential.SafetyLimit).To(Equal(50))
})
})
Expand Down
4 changes: 2 additions & 2 deletions operator/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ spec:
fixed:
initialBatch: 1 # Start with 1 node
batchThreshold: 100 # Require 100% success
failureThreshold: 3 # Stop after 3 consecutive failures
safetyLimit: 50 # Only apply failure threshold below 50% progress

# Compartments define specific node groups with custom rollout behavior
Expand Down Expand Up @@ -52,7 +51,6 @@ spec:
failureThreshold: 2
safetyLimit: 50

# Test nodes: Fast exponential rollout
- name: test
selector:
matchLabels:
Expand All @@ -64,6 +62,4 @@ spec:
initialBatch: 2 # Start with 2 nodes
growthFactor: 2
batchThreshold: 90 # Allow 10% failures
failureThreshold: 3
safetyLimit: 60 # Higher safety limit (more tolerant)