Skip to content

Commit 1142044

Browse files
committed
fix batch slowdown and persistence
1 parent 7ef3164 commit 1142044

File tree

7 files changed

+67
-5
lines changed

7 files changed

+67
-5
lines changed

chart/templates/skyhook-crd.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,12 @@ spec:
518518
failedInBatch:
519519
description: Number of failed nodes in current batch
520520
type: integer
521+
lastBatchFailed:
522+
description: Whether the last batch failed (for slowdown logic)
523+
type: boolean
524+
lastBatchSize:
525+
description: Last successful batch size (for slowdown calculations)
526+
type: integer
521527
processedNodes:
522528
description: Total number of nodes processed so far
523529
type: integer

operator/api/v1alpha1/deployment_policy_types.go

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,16 @@ func (s *DeploymentStrategy) EvaluateBatchResult(state *BatchProcessingState, ba
274274
state.FailedInBatch = failureCount
275275
state.ProcessedNodes += batchSize
276276

277+
// Avoid divide by zero
278+
if batchSize == 0 {
279+
return
280+
}
281+
277282
successPercentage := (successCount * 100) / batchSize
278-
progressPercent := (state.ProcessedNodes * 100) / totalNodes
283+
var progressPercent int
284+
if totalNodes > 0 {
285+
progressPercent = (state.ProcessedNodes * 100) / totalNodes
286+
}
279287

280288
if successPercentage >= s.getBatchThreshold() {
281289
state.ConsecutiveFailures = 0
@@ -347,6 +355,11 @@ func (s *FixedStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessin
347355
func (s *LinearStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
348356
var batchSize int
349357

358+
// Avoid divide by zero
359+
if totalNodes == 0 {
360+
return 0
361+
}
362+
350363
// Check if we should slow down due to last batch failure
351364
progressPercent := (state.ProcessedNodes * 100) / totalNodes
352365
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
@@ -367,9 +380,14 @@ func (s *LinearStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessi
367380
func (s *ExponentialStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
368381
var batchSize int
369382

383+
// Avoid divide by zero
384+
if totalNodes == 0 {
385+
return 0
386+
}
387+
370388
// Check if we should slow down due to last batch failure
371389
progressPercent := (state.ProcessedNodes * 100) / totalNodes
372-
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
390+
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 && *s.GrowthFactor > 0 {
373391
// Slow down: divide last batch size by growth factor
374392
batchSize = max(1, state.LastBatchSize / *s.GrowthFactor)
375393
} else {

operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,12 @@ spec:
519519
failedInBatch:
520520
description: Number of failed nodes in current batch
521521
type: integer
522+
lastBatchFailed:
523+
description: Whether the last batch failed (for slowdown logic)
524+
type: boolean
525+
lastBatchSize:
526+
description: Last successful batch size (for slowdown calculations)
527+
type: integer
522528
processedNodes:
523529
description: Total number of nodes processed so far
524530
type: integer

operator/internal/controller/cluster_state_v2.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,34 @@ func (np *NodePicker) selectNodesWithCompartments(s SkyhookNodes, compartments m
465465
return selectedNodes
466466
}
467467

468+
// PersistCompartmentBatchStates saves the current batch state for all compartments to the Skyhook status
469+
func PersistCompartmentBatchStates(skyhook SkyhookNodes) bool {
470+
compartments := skyhook.GetCompartments()
471+
if len(compartments) == 0 {
472+
return false // No compartments, nothing to persist
473+
}
474+
475+
// Initialize the batch states map if needed
476+
if skyhook.GetSkyhook().Status.CompartmentBatchStates == nil {
477+
skyhook.GetSkyhook().Status.CompartmentBatchStates = make(map[string]v1alpha1.BatchProcessingState)
478+
}
479+
480+
changed := false
481+
for _, compartment := range compartments {
482+
// Only persist if there are nodes in the current batch
483+
if len(compartment.GetBatchState().CurrentBatchNodes) > 0 {
484+
skyhook.GetSkyhook().Status.CompartmentBatchStates[compartment.GetName()] = compartment.GetBatchState()
485+
changed = true
486+
}
487+
}
488+
489+
if changed {
490+
skyhook.GetSkyhook().Updated = true
491+
}
492+
493+
return changed
494+
}
495+
468496
// selectNodesLegacy implements the original node selection logic for backward compatibility
469497
func (np *NodePicker) selectNodesLegacy(s SkyhookNodes, tolerations []corev1.Toleration) []wrapper.SkyhookNode {
470498
nodes := make([]wrapper.SkyhookNode, 0)

operator/internal/controller/skyhook_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,10 @@ func (r *SkyhookReconciler) RunSkyhookPackages(ctx context.Context, clusterState
560560
}
561561

562562
selectedNode := nodePicker.SelectNodes(skyhook)
563+
564+
// Persist compartment batch states after node selection
565+
PersistCompartmentBatchStates(skyhook)
566+
563567
for _, node := range selectedNode {
564568

565569
if node.IsComplete() && !node.Changed() {

operator/internal/wrapper/compartment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func (c *Compartment) createNewBatch() []SkyhookNode {
135135
}
136136

137137
selectedNodes := make([]SkyhookNode, 0)
138-
priority := []v1alpha1.Status{v1alpha1.StatusInProgress, v1alpha1.StatusUnknown, v1alpha1.StatusBlocked, v1alpha1.StatusErroring}
138+
priority := []v1alpha1.Status{v1alpha1.StatusInProgress, v1alpha1.StatusUnknown, v1alpha1.StatusErroring}
139139

140140
for _, status := range priority {
141141
for _, node := range c.Nodes {

0 commit comments

Comments
 (0)