@@ -20,10 +20,16 @@ package controller
2020
2121import (
2222 "github.com/NVIDIA/skyhook/operator/api/v1alpha1"
23+ "github.com/NVIDIA/skyhook/operator/internal/wrapper"
2324 "github.com/prometheus/client_golang/prometheus"
2425 "sigs.k8s.io/controller-runtime/pkg/metrics"
2526)
2627
// LegacyPolicyName is the policy name reported when a Skyhook does not specify
// a deployment policy (kept for backward compatibility).
const LegacyPolicyName = "legacy"
32+
2733var (
2834 // skyhook metrics
2935 skyhook_status = prometheus .NewGaugeVec (
7581 },
7682 []string {"skyhook_name" , "package_name" , "package_version" },
7783 )
84+
85+ // rollout metrics (per-compartment)
86+ skyhook_rollout_matched_nodes = prometheus .NewGaugeVec (
87+ prometheus.GaugeOpts {
88+ Name : "skyhook_rollout_matched_nodes" ,
89+ Help : "Number of nodes matched by this compartment's selector" ,
90+ },
91+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
92+ )
93+
94+ skyhook_rollout_ceiling = prometheus .NewGaugeVec (
95+ prometheus.GaugeOpts {
96+ Name : "skyhook_rollout_ceiling" ,
97+ Help : "Maximum number of nodes that can be in progress at once in this compartment" ,
98+ },
99+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
100+ )
101+
102+ skyhook_rollout_in_progress = prometheus .NewGaugeVec (
103+ prometheus.GaugeOpts {
104+ Name : "skyhook_rollout_in_progress" ,
105+ Help : "Number of nodes currently in progress in this compartment" ,
106+ },
107+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
108+ )
109+
110+ skyhook_rollout_completed = prometheus .NewGaugeVec (
111+ prometheus.GaugeOpts {
112+ Name : "skyhook_rollout_completed" ,
113+ Help : "Number of nodes completed in this compartment" ,
114+ },
115+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
116+ )
117+
118+ skyhook_rollout_progress_percent = prometheus .NewGaugeVec (
119+ prometheus.GaugeOpts {
120+ Name : "skyhook_rollout_progress_percent" ,
121+ Help : "Percentage of nodes completed in this compartment (0-100)" ,
122+ },
123+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
124+ )
125+
126+ skyhook_rollout_current_batch = prometheus .NewGaugeVec (
127+ prometheus.GaugeOpts {
128+ Name : "skyhook_rollout_current_batch" ,
129+ Help : "Current batch number in the rollout strategy (0 if no batch processing)" ,
130+ },
131+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
132+ )
133+
134+ skyhook_rollout_consecutive_failures = prometheus .NewGaugeVec (
135+ prometheus.GaugeOpts {
136+ Name : "skyhook_rollout_consecutive_failures" ,
137+ Help : "Number of consecutive batch failures in this compartment" ,
138+ },
139+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
140+ )
141+
142+ skyhook_rollout_should_stop = prometheus .NewGaugeVec (
143+ prometheus.GaugeOpts {
144+ Name : "skyhook_rollout_should_stop" ,
145+ Help : "Binary metric indicating if rollout should be stopped due to failures (1 = stopped, 0 = continuing)" ,
146+ },
147+ []string {"skyhook_name" , "policy_name" , "compartment_name" , "strategy" },
148+ )
78149)
79150
80151func zeroOutSkyhookMetrics (skyhook SkyhookNodes ) {
@@ -96,6 +167,9 @@ func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
96167 for _ , _package := range skyhook .GetSkyhook ().Spec .Packages {
97168 zeroOutSkyhookPackageMetrics (skyhook .GetSkyhook ().Name , _package .Name , _package .Version )
98169 }
170+
171+ // Clean up all rollout metrics for this skyhook
172+ zeroOutSkyhookRolloutMetrics (skyhook )
99173}
100174
101175func zeroOutSkyhookPackageMetrics (skyhookName , packageName , packageVersion string ) {
@@ -129,6 +203,9 @@ func ResetSkyhookMetricsToZero(skyhook SkyhookNodes) {
129203 SetPackageStageMetrics (skyhookName , pkg .Name , pkg .Version , stage , 0 )
130204 }
131205 }
206+
207+ // Reset rollout metrics to zero
208+ ResetRolloutMetricsToZero (skyhook )
132209}
133210
134211func SetNodeStatusMetrics (skyhookName string , status v1alpha1.Status , count float64 ) {
@@ -159,6 +236,99 @@ func SetNodeTargetCountMetrics(skyhookName string, count float64) {
159236 skyhook_node_target_count .WithLabelValues (skyhookName ).Set (count )
160237}
161238
239+ // zeroOutRolloutMetricsForCompartment removes rollout metrics for a specific compartment
240+ func zeroOutRolloutMetricsForCompartment (skyhookName , policyName , compartmentName , strategy string ) {
241+ skyhook_rollout_matched_nodes .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
242+ skyhook_rollout_ceiling .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
243+ skyhook_rollout_in_progress .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
244+ skyhook_rollout_completed .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
245+ skyhook_rollout_progress_percent .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
246+ skyhook_rollout_current_batch .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
247+ skyhook_rollout_consecutive_failures .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
248+ skyhook_rollout_should_stop .DeleteLabelValues (skyhookName , policyName , compartmentName , strategy )
249+ }
250+
251+ // zeroOutSkyhookRolloutMetrics removes all rollout metrics for a skyhook
252+ // This is called when a Skyhook is deleted
253+ func zeroOutSkyhookRolloutMetrics (skyhook SkyhookNodes ) {
254+ // Get the policy name from the skyhook spec
255+ policyName := skyhook .GetSkyhook ().Spec .DeploymentPolicy
256+ if policyName == "" {
257+ policyName = LegacyPolicyName
258+ }
259+
260+ // Clean up metrics for all compartments
261+ for compartmentName , compartment := range skyhook .GetCompartments () {
262+ strategy := getStrategyType (compartment )
263+ zeroOutRolloutMetricsForCompartment (skyhook .GetSkyhook ().Name , policyName , compartmentName , strategy )
264+ }
265+
266+ // Also clean up metrics from CompartmentStatuses in case compartments were removed
267+ if skyhook .GetSkyhook ().Status .CompartmentStatuses != nil {
268+ for compartmentName := range skyhook .GetSkyhook ().Status .CompartmentStatuses {
269+ // We don't have the exact strategy here, so we'll need to try to delete with all possible strategy types
270+ for _ , strategyType := range []string {"fixed" , "linear" , "exponential" , "unknown" } {
271+ zeroOutRolloutMetricsForCompartment (skyhook .GetSkyhook ().Name , policyName , compartmentName , strategyType )
272+ }
273+ }
274+ }
275+ }
276+
277+ // getStrategyType returns the strategy type name for a compartment
278+ func getStrategyType (compartment * wrapper.Compartment ) string {
279+ strategyType := wrapper .GetStrategyType (compartment .Strategy )
280+ return string (strategyType )
281+ }
282+
283+ // ResetRolloutMetricsToZero resets rollout metrics to zero for all compartments in the skyhook
284+ // This follows the same pattern as ResetSkyhookMetricsToZero for consistency
285+ func ResetRolloutMetricsToZero (skyhook SkyhookNodes ) {
286+ policyName := skyhook .GetSkyhook ().Spec .DeploymentPolicy
287+ if policyName == "" {
288+ policyName = LegacyPolicyName
289+ }
290+
291+ // Reset metrics for all current compartments
292+ for compartmentName , compartment := range skyhook .GetCompartments () {
293+ strategy := getStrategyType (compartment )
294+ emptyStatus := v1alpha1.CompartmentStatus {
295+ Matched : 0 ,
296+ Ceiling : 0 ,
297+ InProgress : 0 ,
298+ Completed : 0 ,
299+ ProgressPercent : 0 ,
300+ BatchState : nil ,
301+ }
302+ SetRolloutMetrics (skyhook .GetSkyhook ().Name , policyName , compartmentName , strategy , emptyStatus )
303+ }
304+ }
305+
306+ // SetRolloutMetrics sets the rollout metrics for a specific compartment
307+ func SetRolloutMetrics (skyhookName , policyName , compartmentName , strategy string , status v1alpha1.CompartmentStatus ) {
308+ skyhook_rollout_matched_nodes .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .Matched ))
309+ skyhook_rollout_ceiling .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .Ceiling ))
310+ skyhook_rollout_in_progress .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .InProgress ))
311+ skyhook_rollout_completed .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .Completed ))
312+ skyhook_rollout_progress_percent .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .ProgressPercent ))
313+
314+ // Set batch state metrics if present
315+ if status .BatchState != nil {
316+ skyhook_rollout_current_batch .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .BatchState .CurrentBatch ))
317+ skyhook_rollout_consecutive_failures .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (float64 (status .BatchState .ConsecutiveFailures ))
318+
319+ shouldStop := float64 (0 )
320+ if status .BatchState .ShouldStop {
321+ shouldStop = 1
322+ }
323+ skyhook_rollout_should_stop .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (shouldStop )
324+ } else {
325+ // Set to 0 if no batch state
326+ skyhook_rollout_current_batch .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (0 )
327+ skyhook_rollout_consecutive_failures .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (0 )
328+ skyhook_rollout_should_stop .WithLabelValues (skyhookName , policyName , compartmentName , strategy ).Set (0 )
329+ }
330+ }
331+
162332func init () {
163333 metrics .Registry .MustRegister (
164334 skyhook_status ,
@@ -167,5 +337,13 @@ func init() {
167337 skyhook_package_state_count ,
168338 skyhook_package_stage_count ,
169339 skyhook_package_restarts_count ,
340+ skyhook_rollout_matched_nodes ,
341+ skyhook_rollout_ceiling ,
342+ skyhook_rollout_in_progress ,
343+ skyhook_rollout_completed ,
344+ skyhook_rollout_progress_percent ,
345+ skyhook_rollout_current_batch ,
346+ skyhook_rollout_consecutive_failures ,
347+ skyhook_rollout_should_stop ,
170348 )
171349}
0 commit comments