Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ spec:
../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config
# Rollout metrics checks
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
- delete:
file: skyhook.yaml
- script:
Expand All @@ -55,6 +59,11 @@ spec:
../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config --not-found
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config --not-found
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config --not-found
# Rollout metrics should be cleaned up after deletion
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
- finally:
- delete:
file: skyhook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,12 @@ spec:
../metrics_test.py skyhook_package_state_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t state=complete
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=simple-skyhook -t stage=config
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t stage=config
# Rollout metrics checks
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_in_progress 0 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
- finally:
- delete:
file: limitrange.yaml
15 changes: 15 additions & 0 deletions operator/internal/controller/cluster_state_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,21 @@ func (skyhook *skyhookNodes) ReportState() {
}
}

// Set rollout metrics for each compartment (follows same pattern as other metrics)
if len(skyhook.compartments) > 0 {
policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
if policyName == "" {
policyName = LegacyPolicyName
}

for name, compartment := range skyhook.compartments {
if status, ok := skyhook.skyhook.Status.CompartmentStatuses[name]; ok {
strategy := getStrategyType(compartment)
SetRolloutMetrics(skyhookName, policyName, name, strategy, status)
}
}
}

// Set current count of completed nodes
completeNodes := fmt.Sprintf("%d/%d", nodeStatusCounts[v1alpha1.StatusComplete], nodeCount)
if completeNodes != skyhook.skyhook.GetCompleteNodes() {
Expand Down
178 changes: 178 additions & 0 deletions operator/internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@ package controller

import (
"github.com/NVIDIA/skyhook/operator/api/v1alpha1"
"github.com/NVIDIA/skyhook/operator/internal/wrapper"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// LegacyPolicyName is the policy name reported in rollout metrics for a
// Skyhook whose spec leaves DeploymentPolicy empty (backward compatibility).
const LegacyPolicyName = "legacy"

var (
// skyhook metrics
skyhook_status = prometheus.NewGaugeVec(
Expand Down Expand Up @@ -75,6 +81,71 @@ var (
},
[]string{"skyhook_name", "package_name", "package_version"},
)

// rollout metrics (per-compartment)
//
// Every gauge below carries the same four labels: skyhook_name, policy_name,
// compartment_name and strategy. SetRolloutMetrics writes all eight gauges from
// a v1alpha1.CompartmentStatus, and zeroOutRolloutMetricsForCompartment deletes
// the matching series from all eight.

// Set from CompartmentStatus.Matched.
skyhook_rollout_matched_nodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_matched_nodes",
Help: "Number of nodes matched by this compartment's selector",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.Ceiling.
skyhook_rollout_ceiling = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_ceiling",
Help: "Maximum number of nodes that can be in progress at once in this compartment",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.InProgress.
skyhook_rollout_in_progress = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_in_progress",
Help: "Number of nodes currently in progress in this compartment",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.Completed.
skyhook_rollout_completed = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_completed",
Help: "Number of nodes completed in this compartment",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.ProgressPercent.
skyhook_rollout_progress_percent = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_progress_percent",
Help: "Percentage of nodes completed in this compartment (0-100)",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.BatchState.CurrentBatch; written as 0 when
// BatchState is nil.
skyhook_rollout_current_batch = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_current_batch",
Help: "Current batch number in the rollout strategy (0 if no batch processing)",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Set from CompartmentStatus.BatchState.ConsecutiveFailures; written as 0 when
// BatchState is nil.
skyhook_rollout_consecutive_failures = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_consecutive_failures",
Help: "Number of consecutive batch failures in this compartment",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)

// Written as 1 when CompartmentStatus.BatchState.ShouldStop is true and 0
// otherwise, including when BatchState is nil.
skyhook_rollout_should_stop = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "skyhook_rollout_should_stop",
Help: "Binary metric indicating if rollout should be stopped due to failures (1 = stopped, 0 = continuing)",
},
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
)
)

func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
Expand All @@ -96,6 +167,9 @@ func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
for _, _package := range skyhook.GetSkyhook().Spec.Packages {
zeroOutSkyhookPackageMetrics(skyhook.GetSkyhook().Name, _package.Name, _package.Version)
}

// Clean up all rollout metrics for this skyhook
zeroOutSkyhookRolloutMetrics(skyhook)
}

func zeroOutSkyhookPackageMetrics(skyhookName, packageName, packageVersion string) {
Expand Down Expand Up @@ -129,6 +203,9 @@ func ResetSkyhookMetricsToZero(skyhook SkyhookNodes) {
SetPackageStageMetrics(skyhookName, pkg.Name, pkg.Version, stage, 0)
}
}

// Reset rollout metrics to zero
ResetRolloutMetricsToZero(skyhook)
}

func SetNodeStatusMetrics(skyhookName string, status v1alpha1.Status, count float64) {
Expand Down Expand Up @@ -159,6 +236,99 @@ func SetNodeTargetCountMetrics(skyhookName string, count float64) {
skyhook_node_target_count.WithLabelValues(skyhookName).Set(count)
}

// zeroOutRolloutMetricsForCompartment removes rollout metrics for a specific compartment
func zeroOutRolloutMetricsForCompartment(skyhookName, policyName, compartmentName, strategy string) {
skyhook_rollout_matched_nodes.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_ceiling.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_in_progress.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_completed.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_progress_percent.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_current_batch.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_consecutive_failures.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
skyhook_rollout_should_stop.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
}

// zeroOutSkyhookRolloutMetrics removes all rollout metrics for a skyhook.
// This is called when a Skyhook is deleted.
//
// Cleanup happens in two passes:
//  1. An exact-match delete for each compartment the skyhook currently has,
//     using the policy name from the spec (LegacyPolicyName when empty) and the
//     compartment's current strategy.
//  2. A sweep of every rollout gauge for any series still labelled with this
//     skyhook's name. This covers series whose compartment has been removed or
//     whose policy or strategy label no longer matches the current spec,
//     without guessing label values from a hard-coded list.
func zeroOutSkyhookRolloutMetrics(skyhook SkyhookNodes) {
	skyhookName := skyhook.GetSkyhook().Name

	policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
	if policyName == "" {
		policyName = LegacyPolicyName
	}

	// Pass 1: exact deletes for the compartments we know about right now.
	for compartmentName, compartment := range skyhook.GetCompartments() {
		zeroOutRolloutMetricsForCompartment(skyhookName, policyName, compartmentName, getStrategyType(compartment))
	}

	// Pass 2: anything else still attributed to this skyhook, whatever its
	// policy, compartment or strategy labels say.
	ownedBySkyhook := prometheus.Labels{"skyhook_name": skyhookName}
	rolloutGauges := []*prometheus.GaugeVec{
		skyhook_rollout_matched_nodes,
		skyhook_rollout_ceiling,
		skyhook_rollout_in_progress,
		skyhook_rollout_completed,
		skyhook_rollout_progress_percent,
		skyhook_rollout_current_batch,
		skyhook_rollout_consecutive_failures,
		skyhook_rollout_should_stop,
	}
	for _, gauge := range rolloutGauges {
		gauge.DeletePartialMatch(ownedBySkyhook)
	}
}

// getStrategyType converts the strategy type that wrapper.GetStrategyType
// reports for the compartment's Strategy into a plain string.
func getStrategyType(compartment *wrapper.Compartment) string {
	return string(wrapper.GetStrategyType(compartment.Strategy))
}

// ResetRolloutMetricsToZero writes zero to every rollout gauge for each of the
// skyhook's current compartments, mirroring ResetSkyhookMetricsToZero.
// The policy label falls back to LegacyPolicyName when the spec names no
// deployment policy.
func ResetRolloutMetricsToZero(skyhook SkyhookNodes) {
	policy := LegacyPolicyName
	if configured := skyhook.GetSkyhook().Spec.DeploymentPolicy; configured != "" {
		policy = configured
	}

	// The zero value has every counter at 0 and a nil BatchState, so
	// SetRolloutMetrics also zeroes the batch gauges.
	var cleared v1alpha1.CompartmentStatus

	for name, compartment := range skyhook.GetCompartments() {
		SetRolloutMetrics(skyhook.GetSkyhook().Name, policy, name, getStrategyType(compartment), cleared)
	}
}

// SetRolloutMetrics publishes one compartment's rollout status to the rollout
// gauges, labelled with the given skyhook, policy, compartment and strategy.
//
// The three batch gauges (current batch, consecutive failures, should-stop)
// are taken from status.BatchState and are written as 0 when it is nil.
func SetRolloutMetrics(skyhookName, policyName, compartmentName, strategy string, status v1alpha1.CompartmentStatus) {
	labels := []string{skyhookName, policyName, compartmentName, strategy}

	skyhook_rollout_matched_nodes.WithLabelValues(labels...).Set(float64(status.Matched))
	skyhook_rollout_ceiling.WithLabelValues(labels...).Set(float64(status.Ceiling))
	skyhook_rollout_in_progress.WithLabelValues(labels...).Set(float64(status.InProgress))
	skyhook_rollout_completed.WithLabelValues(labels...).Set(float64(status.Completed))
	skyhook_rollout_progress_percent.WithLabelValues(labels...).Set(float64(status.ProgressPercent))

	// Work out the batch values first; all three stay at 0 without batch state.
	var currentBatch, consecutiveFailures, shouldStop float64
	if batch := status.BatchState; batch != nil {
		currentBatch = float64(batch.CurrentBatch)
		consecutiveFailures = float64(batch.ConsecutiveFailures)
		if batch.ShouldStop {
			shouldStop = 1
		}
	}

	skyhook_rollout_current_batch.WithLabelValues(labels...).Set(currentBatch)
	skyhook_rollout_consecutive_failures.WithLabelValues(labels...).Set(consecutiveFailures)
	skyhook_rollout_should_stop.WithLabelValues(labels...).Set(shouldStop)
}

func init() {
metrics.Registry.MustRegister(
skyhook_status,
Expand All @@ -167,5 +337,13 @@ func init() {
skyhook_package_state_count,
skyhook_package_stage_count,
skyhook_package_restarts_count,
skyhook_rollout_matched_nodes,
skyhook_rollout_ceiling,
skyhook_rollout_in_progress,
skyhook_rollout_completed,
skyhook_rollout_progress_percent,
skyhook_rollout_current_batch,
skyhook_rollout_consecutive_failures,
skyhook_rollout_should_stop,
)
}