Skip to content

Commit eb0312a

Browse files
committed
fix batch slowdown and persistence
1 parent 7ef3164 commit eb0312a

File tree

7 files changed

+77
-8
lines changed

7 files changed

+77
-8
lines changed

chart/templates/skyhook-crd.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,12 @@ spec:
518518
failedInBatch:
519519
description: Number of failed nodes in current batch
520520
type: integer
521+
lastBatchFailed:
522+
description: Whether the last batch failed (for slowdown logic)
523+
type: boolean
524+
lastBatchSize:
525+
description: Last successful batch size (for slowdown calculations)
526+
type: integer
521527
processedNodes:
522528
description: Total number of nodes processed so far
523529
type: integer

operator/api/v1alpha1/deployment_policy_types.go

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ package v1alpha1
2525

2626
import (
2727
"fmt"
28+
"math"
2829

2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
"k8s.io/utils/ptr"
@@ -274,8 +275,16 @@ func (s *DeploymentStrategy) EvaluateBatchResult(state *BatchProcessingState, ba
274275
state.FailedInBatch = failureCount
275276
state.ProcessedNodes += batchSize
276277

278+
// Avoid divide by zero
279+
if batchSize == 0 {
280+
return
281+
}
282+
277283
successPercentage := (successCount * 100) / batchSize
278-
progressPercent := (state.ProcessedNodes * 100) / totalNodes
284+
var progressPercent int
285+
if totalNodes > 0 {
286+
progressPercent = (state.ProcessedNodes * 100) / totalNodes
287+
}
279288

280289
if successPercentage >= s.getBatchThreshold() {
281290
state.ConsecutiveFailures = 0
@@ -347,6 +356,11 @@ func (s *FixedStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessin
347356
func (s *LinearStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
348357
var batchSize int
349358

359+
// Avoid divide by zero
360+
if totalNodes == 0 {
361+
return 0
362+
}
363+
350364
// Check if we should slow down due to last batch failure
351365
progressPercent := (state.ProcessedNodes * 100) / totalNodes
352366
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
@@ -367,16 +381,27 @@ func (s *LinearStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessi
367381
func (s *ExponentialStrategy) CalculateBatchSize(totalNodes int, state *BatchProcessingState) int {
368382
var batchSize int
369383

384+
// Avoid divide by zero
385+
if totalNodes == 0 {
386+
return 0
387+
}
388+
370389
// Check if we should slow down due to last batch failure
371390
progressPercent := (state.ProcessedNodes * 100) / totalNodes
372-
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 {
391+
if state.LastBatchFailed && progressPercent < *s.SafetyLimit && state.LastBatchSize > 0 && *s.GrowthFactor > 0 {
373392
// Slow down: divide last batch size by growth factor
374393
batchSize = max(1, state.LastBatchSize / *s.GrowthFactor)
375394
} else {
376395
// Normal growth: initialBatch * (growthFactor ^ (currentBatch - 1))
377-
batchSize = *s.InitialBatch
378-
for i := 1; i < state.CurrentBatch; i++ {
379-
batchSize *= *s.GrowthFactor
396+
// Use math.Pow for efficiency and to avoid overflow issues with large batch numbers
397+
exponent := state.CurrentBatch - 1
398+
growthMultiplier := math.Pow(float64(*s.GrowthFactor), float64(exponent))
399+
batchSize = int(float64(*s.InitialBatch) * growthMultiplier)
400+
401+
// Cap at remaining nodes to prevent unreasonably large batch sizes
402+
// and potential integer overflow
403+
if batchSize > totalNodes {
404+
batchSize = totalNodes
380405
}
381406
}
382407

operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,12 @@ spec:
519519
failedInBatch:
520520
description: Number of failed nodes in current batch
521521
type: integer
522+
lastBatchFailed:
523+
description: Whether the last batch failed (for slowdown logic)
524+
type: boolean
525+
lastBatchSize:
526+
description: Last successful batch size (for slowdown calculations)
527+
type: integer
522528
processedNodes:
523529
description: Total number of nodes processed so far
524530
type: integer

operator/internal/controller/cluster_state_v2.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,34 @@ func (np *NodePicker) selectNodesWithCompartments(s SkyhookNodes, compartments m
465465
return selectedNodes
466466
}
467467

468+
// PersistCompartmentBatchStates saves the current batch state for all compartments to the Skyhook status
469+
func PersistCompartmentBatchStates(skyhook SkyhookNodes) bool {
470+
compartments := skyhook.GetCompartments()
471+
if len(compartments) == 0 {
472+
return false // No compartments, nothing to persist
473+
}
474+
475+
// Initialize the batch states map if needed
476+
if skyhook.GetSkyhook().Status.CompartmentBatchStates == nil {
477+
skyhook.GetSkyhook().Status.CompartmentBatchStates = make(map[string]v1alpha1.BatchProcessingState)
478+
}
479+
480+
changed := false
481+
for _, compartment := range compartments {
482+
// Only persist if there are nodes in the current batch
483+
if len(compartment.GetBatchState().CurrentBatchNodes) > 0 {
484+
skyhook.GetSkyhook().Status.CompartmentBatchStates[compartment.GetName()] = compartment.GetBatchState()
485+
changed = true
486+
}
487+
}
488+
489+
if changed {
490+
skyhook.GetSkyhook().Updated = true
491+
}
492+
493+
return changed
494+
}
495+
468496
// selectNodesLegacy implements the original node selection logic for backward compatibility
469497
func (np *NodePicker) selectNodesLegacy(s SkyhookNodes, tolerations []corev1.Toleration) []wrapper.SkyhookNode {
470498
nodes := make([]wrapper.SkyhookNode, 0)

operator/internal/controller/skyhook_controller.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,10 @@ func (r *SkyhookReconciler) RunSkyhookPackages(ctx context.Context, clusterState
560560
}
561561

562562
selectedNode := nodePicker.SelectNodes(skyhook)
563+
564+
// Persist compartment batch states after node selection
565+
PersistCompartmentBatchStates(skyhook)
566+
563567
for _, node := range selectedNode {
564568

565569
if node.IsComplete() && !node.Changed() {

operator/internal/wrapper/compartment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ func (c *Compartment) createNewBatch() []SkyhookNode {
135135
}
136136

137137
selectedNodes := make([]SkyhookNode, 0)
138-
priority := []v1alpha1.Status{v1alpha1.StatusInProgress, v1alpha1.StatusUnknown, v1alpha1.StatusBlocked, v1alpha1.StatusErroring}
138+
priority := []v1alpha1.Status{v1alpha1.StatusInProgress, v1alpha1.StatusUnknown, v1alpha1.StatusErroring}
139139

140140
for _, status := range priority {
141141
for _, node := range c.Nodes {

0 commit comments

Comments
 (0)