Skip to content

Commit ed2c33a

Browse files
committed
feat: implement deployment strategies with compartment-based batching
1 parent 299a07d commit ed2c33a

File tree

8 files changed

+768
-8
lines changed

8 files changed

+768
-8
lines changed

chart/templates/skyhook-crd.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,40 @@ spec:
498498
status:
499499
description: SkyhookStatus defines the observed state of Skyhook
500500
properties:
501+
compartmentBatchStates:
502+
additionalProperties:
503+
description: BatchProcessingState tracks the current state of batch
504+
processing for a compartment
505+
properties:
506+
consecutiveFailures:
507+
description: Number of consecutive failures
508+
type: integer
509+
currentBatch:
510+
description: Current batch number (starts at 1)
511+
type: integer
512+
currentBatchNodes:
513+
description: Names of nodes in the current batch (persisted
514+
across reconciles)
515+
items:
516+
type: string
517+
type: array
518+
failedInBatch:
519+
description: Number of failed nodes in current batch
520+
type: integer
521+
processedNodes:
522+
description: Total number of nodes processed so far
523+
type: integer
524+
shouldStop:
525+
description: Whether the strategy should stop processing due
526+
to failures
527+
type: boolean
528+
successfulInBatch:
529+
description: Number of successful nodes in current batch
530+
type: integer
531+
type: object
532+
description: CompartmentBatchStates tracks batch processing state
533+
per compartment
534+
type: object
501535
completeNodes:
502536
default: 0/0
503537
description: |-

operator/api/v1alpha1/deployment_policy_types.go

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,161 @@ func (s *DeploymentStrategy) Validate() error {
232232
return nil
233233
}
234234

235+
// BatchProcessingState tracks the current state of batch processing for a compartment
236+
type BatchProcessingState struct {
237+
// Current batch number (starts at 1)
238+
CurrentBatch int `json:"currentBatch,omitempty"`
239+
// Number of consecutive failures
240+
ConsecutiveFailures int `json:"consecutiveFailures,omitempty"`
241+
// Total number of nodes processed so far
242+
ProcessedNodes int `json:"processedNodes,omitempty"`
243+
// Number of successful nodes in current batch
244+
SuccessfulInBatch int `json:"successfulInBatch,omitempty"`
245+
// Number of failed nodes in current batch
246+
FailedInBatch int `json:"failedInBatch,omitempty"`
247+
// Whether the strategy should stop processing due to failures
248+
ShouldStop bool `json:"shouldStop,omitempty"`
249+
// Names of nodes in the current batch (persisted across reconciles)
250+
CurrentBatchNodes []string `json:"currentBatchNodes,omitempty"`
251+
// Last successful batch size (for slowdown calculations)
252+
LastBatchSize int `json:"lastBatchSize,omitempty"`
253+
// Whether the last batch failed (for slowdown logic)
254+
LastBatchFailed bool `json:"lastBatchFailed,omitempty"`
255+
}
256+
257+
// CalculateBatchSize calculates the next batch size based on the strategy
258+
func (s *DeploymentStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
259+
switch {
260+
case s.Fixed != nil:
261+
return s.Fixed.CalculateBatchSize(totalNodes, state)
262+
case s.Linear != nil:
263+
return s.Linear.CalculateBatchSize(totalNodes, state)
264+
case s.Exponential != nil:
265+
return s.Exponential.CalculateBatchSize(totalNodes, state)
266+
default:
267+
return 1 // fallback
268+
}
269+
}
270+
271+
// EvaluateBatchResult evaluates the result of a batch and updates state
272+
func (s *DeploymentStrategy) EvaluateBatchResult(state *BatchProcessingState, batchSize int, successCount int, failureCount int, totalNodes int) {
273+
state.SuccessfulInBatch = successCount
274+
state.FailedInBatch = failureCount
275+
state.ProcessedNodes += batchSize
276+
277+
successPercentage := (successCount * 100) / batchSize
278+
progressPercent := (state.ProcessedNodes * 100) / totalNodes
279+
280+
if successPercentage >= s.getBatchThreshold() {
281+
state.ConsecutiveFailures = 0
282+
state.LastBatchFailed = false
283+
} else {
284+
state.ConsecutiveFailures++
285+
state.LastBatchFailed = true
286+
if progressPercent < s.getSafetyLimit() && state.ConsecutiveFailures >= s.getFailureThreshold() {
287+
state.ShouldStop = true
288+
}
289+
}
290+
291+
state.LastBatchSize = batchSize
292+
state.CurrentBatch++
293+
}
294+
295+
// getBatchThreshold returns the batch threshold from the active strategy
296+
func (s *DeploymentStrategy) getBatchThreshold() int {
297+
switch {
298+
case s.Fixed != nil:
299+
return *s.Fixed.BatchThreshold
300+
case s.Linear != nil:
301+
return *s.Linear.BatchThreshold
302+
case s.Exponential != nil:
303+
return *s.Exponential.BatchThreshold
304+
default:
305+
return 100
306+
}
307+
}
308+
309+
// getSafetyLimit returns the safety limit from the active strategy
310+
func (s *DeploymentStrategy) getSafetyLimit() int {
311+
switch {
312+
case s.Fixed != nil:
313+
return *s.Fixed.SafetyLimit
314+
case s.Linear != nil:
315+
return *s.Linear.SafetyLimit
316+
case s.Exponential != nil:
317+
return *s.Exponential.SafetyLimit
318+
default:
319+
return 50
320+
}
321+
}
322+
323+
// getFailureThreshold returns the failure threshold from the active strategy
324+
func (s *DeploymentStrategy) getFailureThreshold() int {
325+
switch {
326+
case s.Fixed != nil:
327+
return *s.Fixed.FailureThreshold
328+
case s.Linear != nil:
329+
return *s.Linear.FailureThreshold
330+
case s.Exponential != nil:
331+
return *s.Exponential.FailureThreshold
332+
default:
333+
return 3
334+
}
335+
}
336+
337+
func (s *FixedStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
338+
// Fixed strategy doesn't change batch size, but respects remaining nodes
339+
batchSize := *s.InitialBatch
340+
remaining := totalNodes - state.ProcessedNodes
341+
if batchSize > remaining {
342+
batchSize = remaining
343+
}
344+
return max(1, batchSize)
345+
}
346+
347+
func (s *LinearStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
348+
var batchSize int
349+
350+
// Check if we should slow down due to last batch failure
351+
progressPercent := (state.ProcessedNodes * 100) / totalNodes
352+
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
353+
// Slow down: reduce by delta from last batch size
354+
batchSize = max(1, state.LastBatchSize - *s.Delta)
355+
} else {
356+
// Normal growth: initialBatch + (currentBatch - 1) * delta
357+
batchSize = *s.InitialBatch + (state.CurrentBatch-1)*(*s.Delta)
358+
}
359+
360+
remaining := totalNodes - state.ProcessedNodes
361+
if batchSize > remaining {
362+
batchSize = remaining
363+
}
364+
return max(1, batchSize)
365+
}
366+
367+
func (s *ExponentialStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
368+
var batchSize int
369+
370+
// Check if we should slow down due to last batch failure
371+
progressPercent := (state.ProcessedNodes * 100) / totalNodes
372+
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
373+
// Slow down: divide last batch size by growth factor
374+
batchSize = max(1, state.LastBatchSize / *s.GrowthFactor)
375+
} else {
376+
// Normal growth: initialBatch * (growthFactor ^ (currentBatch - 1))
377+
batchSize = *s.InitialBatch
378+
for i := 1; i < state.CurrentBatch; i++ {
379+
batchSize *= *s.GrowthFactor
380+
}
381+
}
382+
383+
remaining := totalNodes - state.ProcessedNodes
384+
if batchSize > remaining {
385+
batchSize = remaining
386+
}
387+
return max(1, batchSize)
388+
}
389+
235390
// Validate validates the Compartment
236391
func (c *Compartment) Validate() error {
237392
// Validate compartment budget

operator/api/v1alpha1/skyhook_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,9 @@ type SkyhookStatus struct {
316316
// ConfigUpdates tracks config updates
317317
ConfigUpdates map[string][]string `json:"configUpdates,omitempty"`
318318

319+
// CompartmentBatchStates tracks batch processing state per compartment
320+
CompartmentBatchStates map[string]BatchProcessingState `json:"compartmentBatchStates,omitempty"`
321+
319322
// +kubebuilder:example=3
320323
// +kubebuilder:default=0
321324
// NodesInProgress displays the number of nodes that are currently in progress and is

operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 27 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,40 @@ spec:
499499
status:
500500
description: SkyhookStatus defines the observed state of Skyhook
501501
properties:
502+
compartmentBatchStates:
503+
additionalProperties:
504+
description: BatchProcessingState tracks the current state of batch
505+
processing for a compartment
506+
properties:
507+
consecutiveFailures:
508+
description: Number of consecutive failures
509+
type: integer
510+
currentBatch:
511+
description: Current batch number (starts at 1)
512+
type: integer
513+
currentBatchNodes:
514+
description: Names of nodes in the current batch (persisted
515+
across reconciles)
516+
items:
517+
type: string
518+
type: array
519+
failedInBatch:
520+
description: Number of failed nodes in current batch
521+
type: integer
522+
processedNodes:
523+
description: Total number of nodes processed so far
524+
type: integer
525+
shouldStop:
526+
description: Whether the strategy should stop processing due
527+
to failures
528+
type: boolean
529+
successfulInBatch:
530+
description: Number of successful nodes in current batch
531+
type: integer
532+
type: object
533+
description: CompartmentBatchStates tracks batch processing state
534+
per compartment
535+
type: object
502536
completeNodes:
503537
default: 0/0
504538
description: |-

0 commit comments

Comments
 (0)