diff --git a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml
index 9cf9db62..012e7ecc 100644
--- a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml
+++ b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml
@@ -43,6 +43,10 @@ spec:
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config
+          # Rollout metrics checks
+          ../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
     - delete:
         file: skyhook.yaml
     - script:
@@ -55,6 +59,11 @@ spec:
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config --not-found
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config --not-found
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config --not-found
+          # Rollout metrics should be cleaned up after deletion
+          ../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
+          ../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
+          ../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
+          ../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
   - finally:
     - delete:
         file: skyhook.yaml
diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml
index 4567aab7..b15eaefe 100644
--- a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml
+++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml
@@ -57,6 +57,12 @@ spec:
           ../metrics_test.py skyhook_package_state_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t state=complete
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=simple-skyhook -t stage=config
           ../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t stage=config
+          # Rollout metrics checks
+          ../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_in_progress 0 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
+          ../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
   - finally:
     - delete:
         file: limitrange.yaml
diff --git a/operator/internal/controller/cluster_state_v2.go b/operator/internal/controller/cluster_state_v2.go
index 1f8518c2..ed328dbd 100644
--- a/operator/internal/controller/cluster_state_v2.go
+++ b/operator/internal/controller/cluster_state_v2.go
@@ -852,6 +852,21 @@ func (skyhook *skyhookNodes) ReportState() {
 		}
 	}
 
+	// Set rollout metrics for each compartment (follows same pattern as other metrics)
+	if len(skyhook.compartments) > 0 {
+		policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
+		if policyName == "" {
+			policyName = LegacyPolicyName
+		}
+
+		for name, compartment := range skyhook.compartments {
+			if status, ok := skyhook.skyhook.Status.CompartmentStatuses[name]; ok {
+				strategy := getStrategyType(compartment)
+				SetRolloutMetrics(skyhookName, policyName, name, strategy, status)
+			}
+		}
+	}
+
 	// Set current count of completed nodes
 	completeNodes := fmt.Sprintf("%d/%d", nodeStatusCounts[v1alpha1.StatusComplete], nodeCount)
 	if completeNodes != skyhook.skyhook.GetCompleteNodes() {
diff --git a/operator/internal/controller/metrics.go b/operator/internal/controller/metrics.go
index 85866ffe..1b2970ec 100644
--- a/operator/internal/controller/metrics.go
+++ b/operator/internal/controller/metrics.go
@@ -20,10 +20,16 @@ package controller
 
 import (
 	"github.com/NVIDIA/skyhook/operator/api/v1alpha1"
+	"github.com/NVIDIA/skyhook/operator/internal/wrapper"
 	"github.com/prometheus/client_golang/prometheus"
 	"sigs.k8s.io/controller-runtime/pkg/metrics"
 )
 
+const (
+	// LegacyPolicyName is used when no deployment policy is specified (backward compatibility)
+	LegacyPolicyName = "legacy"
+)
+
 var (
 	// skyhook metrics
 	skyhook_status = prometheus.NewGaugeVec(
@@ -75,6 +81,71 @@ var (
 		},
 		[]string{"skyhook_name", "package_name", "package_version"},
 	)
+
+	// rollout metrics (per-compartment)
+	skyhook_rollout_matched_nodes = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_matched_nodes",
+			Help: "Number of nodes matched by this compartment's selector",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_ceiling = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_ceiling",
+			Help: "Maximum number of nodes that can be in progress at once in this compartment",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_in_progress = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_in_progress",
+			Help: "Number of nodes currently in progress in this compartment",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_completed = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_completed",
+			Help: "Number of nodes completed in this compartment",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_progress_percent = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_progress_percent",
+			Help: "Percentage of nodes completed in this compartment (0-100)",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_current_batch = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_current_batch",
+			Help: "Current batch number in the rollout strategy (0 if no batch processing)",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_consecutive_failures = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_consecutive_failures",
+			Help: "Number of consecutive batch failures in this compartment",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
+
+	skyhook_rollout_should_stop = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Name: "skyhook_rollout_should_stop",
+			Help: "Binary metric indicating if rollout should be stopped due to failures (1 = stopped, 0 = continuing)",
+		},
+		[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
+	)
 )
 
 func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
@@ -96,6 +167,9 @@ func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
 	for _, _package := range skyhook.GetSkyhook().Spec.Packages {
 		zeroOutSkyhookPackageMetrics(skyhook.GetSkyhook().Name, _package.Name, _package.Version)
 	}
+
+	// Clean up all rollout metrics for this skyhook
+	zeroOutSkyhookRolloutMetrics(skyhook)
 }
 
 func zeroOutSkyhookPackageMetrics(skyhookName, packageName, packageVersion string) {
@@ -129,6 +203,9 @@ func ResetSkyhookMetricsToZero(skyhook SkyhookNodes) {
 			SetPackageStageMetrics(skyhookName, pkg.Name, pkg.Version, stage, 0)
 		}
 	}
+
+	// Reset rollout metrics to zero
+	ResetRolloutMetricsToZero(skyhook)
 }
 
 func SetNodeStatusMetrics(skyhookName string, status v1alpha1.Status, count float64) {
@@ -159,6 +236,85 @@ func SetNodeTargetCountMetrics(skyhookName string, count float64) {
 	skyhook_node_target_count.WithLabelValues(skyhookName).Set(count)
 }
 
+// rolloutGaugeVecs lists every per-compartment rollout gauge so that bulk
+// operations (such as cleanup) stay in sync with the set of rollout metrics.
+var rolloutGaugeVecs = []*prometheus.GaugeVec{
+	skyhook_rollout_matched_nodes,
+	skyhook_rollout_ceiling,
+	skyhook_rollout_in_progress,
+	skyhook_rollout_completed,
+	skyhook_rollout_progress_percent,
+	skyhook_rollout_current_batch,
+	skyhook_rollout_consecutive_failures,
+	skyhook_rollout_should_stop,
+}
+
+// zeroOutRolloutMetricsForCompartment removes the rollout series that carry
+// exactly the given skyhook, policy, compartment and strategy label values.
+func zeroOutRolloutMetricsForCompartment(skyhookName, policyName, compartmentName, strategy string) {
+	for _, vec := range rolloutGaugeVecs {
+		vec.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
+	}
+}
+
+// zeroOutSkyhookRolloutMetrics removes all rollout metrics for a skyhook.
+// It is called from zeroOutSkyhookMetrics.
+//
+// Series are matched on the skyhook_name label alone, so nothing is left
+// behind when the deployment policy, a compartment or its strategy changed
+// after the series was first written.
+func zeroOutSkyhookRolloutMetrics(skyhook SkyhookNodes) {
+	match := prometheus.Labels{"skyhook_name": skyhook.GetSkyhook().Name}
+	for _, vec := range rolloutGaugeVecs {
+		vec.DeletePartialMatch(match)
+	}
+}
+
+// getStrategyType returns the strategy type name for a compartment.
+func getStrategyType(compartment *wrapper.Compartment) string {
+	return string(wrapper.GetStrategyType(compartment.Strategy))
+}
+
+// ResetRolloutMetricsToZero sets every rollout gauge to zero for all current
+// compartments of the skyhook. It is called from ResetSkyhookMetricsToZero.
+func ResetRolloutMetricsToZero(skyhook SkyhookNodes) {
+	policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
+	if policyName == "" {
+		policyName = LegacyPolicyName
+	}
+
+	// The zero value has every counter at 0 and a nil BatchState.
+	var emptyStatus v1alpha1.CompartmentStatus
+	for compartmentName, compartment := range skyhook.GetCompartments() {
+		strategy := getStrategyType(compartment)
+		SetRolloutMetrics(skyhook.GetSkyhook().Name, policyName, compartmentName, strategy, emptyStatus)
+	}
+}
+
+// SetRolloutMetrics sets the rollout metrics for a specific compartment.
+// The batch gauges are always written; they are 0 when status.BatchState is nil.
+func SetRolloutMetrics(skyhookName, policyName, compartmentName, strategy string, status v1alpha1.CompartmentStatus) {
+	labels := []string{skyhookName, policyName, compartmentName, strategy}
+
+	skyhook_rollout_matched_nodes.WithLabelValues(labels...).Set(float64(status.Matched))
+	skyhook_rollout_ceiling.WithLabelValues(labels...).Set(float64(status.Ceiling))
+	skyhook_rollout_in_progress.WithLabelValues(labels...).Set(float64(status.InProgress))
+	skyhook_rollout_completed.WithLabelValues(labels...).Set(float64(status.Completed))
+	skyhook_rollout_progress_percent.WithLabelValues(labels...).Set(float64(status.ProgressPercent))
+
+	var currentBatch, consecutiveFailures, shouldStop float64
+	if batch := status.BatchState; batch != nil {
+		currentBatch = float64(batch.CurrentBatch)
+		consecutiveFailures = float64(batch.ConsecutiveFailures)
+		if batch.ShouldStop {
+			shouldStop = 1
+		}
+	}
+	skyhook_rollout_current_batch.WithLabelValues(labels...).Set(currentBatch)
+	skyhook_rollout_consecutive_failures.WithLabelValues(labels...).Set(consecutiveFailures)
+	skyhook_rollout_should_stop.WithLabelValues(labels...).Set(shouldStop)
+}
+
 func init() {
 	metrics.Registry.MustRegister(
 		skyhook_status,
@@ -167,5 +323,13 @@ func init() {
 		skyhook_package_state_count,
 		skyhook_package_stage_count,
 		skyhook_package_restarts_count,
+		skyhook_rollout_matched_nodes,
+		skyhook_rollout_ceiling,
+		skyhook_rollout_in_progress,
+		skyhook_rollout_completed,
+		skyhook_rollout_progress_percent,
+		skyhook_rollout_current_batch,
+		skyhook_rollout_consecutive_failures,
+		skyhook_rollout_should_stop,
 	)
 }