Skip to content

Commit 84caf87

Browse files
authored
feat: add metrics for compartments (#110)
* feat: add metrics for compartments * update e2e tests and naming
1 parent 4073db0 commit 84caf87

File tree

4 files changed

+208
-0
lines changed

4 files changed

+208
-0
lines changed

k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ spec:
4343
../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config
4444
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config
4545
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config
46+
# Rollout metrics checks
47+
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
48+
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
49+
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
4650
- delete:
4751
file: skyhook.yaml
4852
- script:
@@ -55,6 +59,11 @@ spec:
5559
../metrics_test.py skyhook_package_stage_count 1 -t package_name=dexter -t skyhook_name=delete-skyhook -t stage=config --not-found
5660
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=delete-skyhook -t stage=config --not-found
5761
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=delete-skyhook -t stage=config --not-found
62+
# Rollout metrics should be cleaned up after deletion
63+
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
64+
../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
65+
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
66+
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=delete-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed --not-found
5867
- finally:
5968
- delete:
6069
file: skyhook.yaml

k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ spec:
5757
../metrics_test.py skyhook_package_state_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t state=complete
5858
../metrics_test.py skyhook_package_stage_count 1 -t package_name=spencer -t skyhook_name=simple-skyhook -t stage=config
5959
../metrics_test.py skyhook_package_stage_count 1 -t package_name=foobar -t skyhook_name=simple-skyhook -t stage=config
60+
# Rollout metrics checks
61+
../metrics_test.py skyhook_rollout_matched_nodes 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
62+
../metrics_test.py skyhook_rollout_ceiling 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
63+
../metrics_test.py skyhook_rollout_completed 1 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
64+
../metrics_test.py skyhook_rollout_in_progress 0 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
65+
../metrics_test.py skyhook_rollout_progress_percent 100 -t skyhook_name=simple-skyhook -t policy_name=legacy -t compartment_name=__default__ -t strategy=fixed
6066
- finally:
6167
- delete:
6268
file: limitrange.yaml

operator/internal/controller/cluster_state_v2.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,6 +852,21 @@ func (skyhook *skyhookNodes) ReportState() {
852852
}
853853
}
854854

855+
// Set rollout metrics for each compartment (follows same pattern as other metrics)
856+
if len(skyhook.compartments) > 0 {
857+
policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
858+
if policyName == "" {
859+
policyName = LegacyPolicyName
860+
}
861+
862+
for name, compartment := range skyhook.compartments {
863+
if status, ok := skyhook.skyhook.Status.CompartmentStatuses[name]; ok {
864+
strategy := getStrategyType(compartment)
865+
SetRolloutMetrics(skyhookName, policyName, name, strategy, status)
866+
}
867+
}
868+
}
869+
855870
// Set current count of completed nodes
856871
completeNodes := fmt.Sprintf("%d/%d", nodeStatusCounts[v1alpha1.StatusComplete], nodeCount)
857872
if completeNodes != skyhook.skyhook.GetCompleteNodes() {

operator/internal/controller/metrics.go

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,16 @@ package controller
2020

2121
import (
2222
"github.com/NVIDIA/skyhook/operator/api/v1alpha1"
23+
"github.com/NVIDIA/skyhook/operator/internal/wrapper"
2324
"github.com/prometheus/client_golang/prometheus"
2425
"sigs.k8s.io/controller-runtime/pkg/metrics"
2526
)
2627

28+
// LegacyPolicyName is the policy name reported when a Skyhook does not specify
// a deployment policy (kept for backward compatibility).
const LegacyPolicyName = "legacy"
32+
2733
var (
2834
// skyhook metrics
2935
skyhook_status = prometheus.NewGaugeVec(
@@ -75,6 +81,71 @@ var (
7581
},
7682
[]string{"skyhook_name", "package_name", "package_version"},
7783
)
84+
85+
// rollout metrics (per-compartment)
86+
skyhook_rollout_matched_nodes = prometheus.NewGaugeVec(
87+
prometheus.GaugeOpts{
88+
Name: "skyhook_rollout_matched_nodes",
89+
Help: "Number of nodes matched by this compartment's selector",
90+
},
91+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
92+
)
93+
94+
skyhook_rollout_ceiling = prometheus.NewGaugeVec(
95+
prometheus.GaugeOpts{
96+
Name: "skyhook_rollout_ceiling",
97+
Help: "Maximum number of nodes that can be in progress at once in this compartment",
98+
},
99+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
100+
)
101+
102+
skyhook_rollout_in_progress = prometheus.NewGaugeVec(
103+
prometheus.GaugeOpts{
104+
Name: "skyhook_rollout_in_progress",
105+
Help: "Number of nodes currently in progress in this compartment",
106+
},
107+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
108+
)
109+
110+
skyhook_rollout_completed = prometheus.NewGaugeVec(
111+
prometheus.GaugeOpts{
112+
Name: "skyhook_rollout_completed",
113+
Help: "Number of nodes completed in this compartment",
114+
},
115+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
116+
)
117+
118+
skyhook_rollout_progress_percent = prometheus.NewGaugeVec(
119+
prometheus.GaugeOpts{
120+
Name: "skyhook_rollout_progress_percent",
121+
Help: "Percentage of nodes completed in this compartment (0-100)",
122+
},
123+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
124+
)
125+
126+
skyhook_rollout_current_batch = prometheus.NewGaugeVec(
127+
prometheus.GaugeOpts{
128+
Name: "skyhook_rollout_current_batch",
129+
Help: "Current batch number in the rollout strategy (0 if no batch processing)",
130+
},
131+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
132+
)
133+
134+
skyhook_rollout_consecutive_failures = prometheus.NewGaugeVec(
135+
prometheus.GaugeOpts{
136+
Name: "skyhook_rollout_consecutive_failures",
137+
Help: "Number of consecutive batch failures in this compartment",
138+
},
139+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
140+
)
141+
142+
skyhook_rollout_should_stop = prometheus.NewGaugeVec(
143+
prometheus.GaugeOpts{
144+
Name: "skyhook_rollout_should_stop",
145+
Help: "Binary metric indicating if rollout should be stopped due to failures (1 = stopped, 0 = continuing)",
146+
},
147+
[]string{"skyhook_name", "policy_name", "compartment_name", "strategy"},
148+
)
78149
)
79150

80151
func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
@@ -96,6 +167,9 @@ func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
96167
for _, _package := range skyhook.GetSkyhook().Spec.Packages {
97168
zeroOutSkyhookPackageMetrics(skyhook.GetSkyhook().Name, _package.Name, _package.Version)
98169
}
170+
171+
// Clean up all rollout metrics for this skyhook
172+
zeroOutSkyhookRolloutMetrics(skyhook)
99173
}
100174

101175
func zeroOutSkyhookPackageMetrics(skyhookName, packageName, packageVersion string) {
@@ -129,6 +203,9 @@ func ResetSkyhookMetricsToZero(skyhook SkyhookNodes) {
129203
SetPackageStageMetrics(skyhookName, pkg.Name, pkg.Version, stage, 0)
130204
}
131205
}
206+
207+
// Reset rollout metrics to zero
208+
ResetRolloutMetricsToZero(skyhook)
132209
}
133210

134211
func SetNodeStatusMetrics(skyhookName string, status v1alpha1.Status, count float64) {
@@ -159,6 +236,99 @@ func SetNodeTargetCountMetrics(skyhookName string, count float64) {
159236
skyhook_node_target_count.WithLabelValues(skyhookName).Set(count)
160237
}
161238

239+
// zeroOutRolloutMetricsForCompartment removes rollout metrics for a specific compartment
240+
func zeroOutRolloutMetricsForCompartment(skyhookName, policyName, compartmentName, strategy string) {
241+
skyhook_rollout_matched_nodes.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
242+
skyhook_rollout_ceiling.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
243+
skyhook_rollout_in_progress.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
244+
skyhook_rollout_completed.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
245+
skyhook_rollout_progress_percent.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
246+
skyhook_rollout_current_batch.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
247+
skyhook_rollout_consecutive_failures.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
248+
skyhook_rollout_should_stop.DeleteLabelValues(skyhookName, policyName, compartmentName, strategy)
249+
}
250+
251+
// zeroOutSkyhookRolloutMetrics removes all rollout metrics for a skyhook
252+
// This is called when a Skyhook is deleted
253+
func zeroOutSkyhookRolloutMetrics(skyhook SkyhookNodes) {
254+
// Get the policy name from the skyhook spec
255+
policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
256+
if policyName == "" {
257+
policyName = LegacyPolicyName
258+
}
259+
260+
// Clean up metrics for all compartments
261+
for compartmentName, compartment := range skyhook.GetCompartments() {
262+
strategy := getStrategyType(compartment)
263+
zeroOutRolloutMetricsForCompartment(skyhook.GetSkyhook().Name, policyName, compartmentName, strategy)
264+
}
265+
266+
// Also clean up metrics from CompartmentStatuses in case compartments were removed
267+
if skyhook.GetSkyhook().Status.CompartmentStatuses != nil {
268+
for compartmentName := range skyhook.GetSkyhook().Status.CompartmentStatuses {
269+
// We don't have the exact strategy here, so we'll need to try to delete with all possible strategy types
270+
for _, strategyType := range []string{"fixed", "linear", "exponential", "unknown"} {
271+
zeroOutRolloutMetricsForCompartment(skyhook.GetSkyhook().Name, policyName, compartmentName, strategyType)
272+
}
273+
}
274+
}
275+
}
276+
277+
// getStrategyType returns the strategy type name for a compartment
278+
func getStrategyType(compartment *wrapper.Compartment) string {
279+
strategyType := wrapper.GetStrategyType(compartment.Strategy)
280+
return string(strategyType)
281+
}
282+
283+
// ResetRolloutMetricsToZero resets rollout metrics to zero for all compartments in the skyhook
284+
// This follows the same pattern as ResetSkyhookMetricsToZero for consistency
285+
func ResetRolloutMetricsToZero(skyhook SkyhookNodes) {
286+
policyName := skyhook.GetSkyhook().Spec.DeploymentPolicy
287+
if policyName == "" {
288+
policyName = LegacyPolicyName
289+
}
290+
291+
// Reset metrics for all current compartments
292+
for compartmentName, compartment := range skyhook.GetCompartments() {
293+
strategy := getStrategyType(compartment)
294+
emptyStatus := v1alpha1.CompartmentStatus{
295+
Matched: 0,
296+
Ceiling: 0,
297+
InProgress: 0,
298+
Completed: 0,
299+
ProgressPercent: 0,
300+
BatchState: nil,
301+
}
302+
SetRolloutMetrics(skyhook.GetSkyhook().Name, policyName, compartmentName, strategy, emptyStatus)
303+
}
304+
}
305+
306+
// SetRolloutMetrics sets the rollout metrics for a specific compartment
307+
func SetRolloutMetrics(skyhookName, policyName, compartmentName, strategy string, status v1alpha1.CompartmentStatus) {
308+
skyhook_rollout_matched_nodes.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.Matched))
309+
skyhook_rollout_ceiling.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.Ceiling))
310+
skyhook_rollout_in_progress.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.InProgress))
311+
skyhook_rollout_completed.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.Completed))
312+
skyhook_rollout_progress_percent.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.ProgressPercent))
313+
314+
// Set batch state metrics if present
315+
if status.BatchState != nil {
316+
skyhook_rollout_current_batch.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.BatchState.CurrentBatch))
317+
skyhook_rollout_consecutive_failures.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(float64(status.BatchState.ConsecutiveFailures))
318+
319+
shouldStop := float64(0)
320+
if status.BatchState.ShouldStop {
321+
shouldStop = 1
322+
}
323+
skyhook_rollout_should_stop.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(shouldStop)
324+
} else {
325+
// Set to 0 if no batch state
326+
skyhook_rollout_current_batch.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(0)
327+
skyhook_rollout_consecutive_failures.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(0)
328+
skyhook_rollout_should_stop.WithLabelValues(skyhookName, policyName, compartmentName, strategy).Set(0)
329+
}
330+
}
331+
162332
func init() {
163333
metrics.Registry.MustRegister(
164334
skyhook_status,
@@ -167,5 +337,13 @@ func init() {
167337
skyhook_package_state_count,
168338
skyhook_package_stage_count,
169339
skyhook_package_restarts_count,
340+
skyhook_rollout_matched_nodes,
341+
skyhook_rollout_ceiling,
342+
skyhook_rollout_in_progress,
343+
skyhook_rollout_completed,
344+
skyhook_rollout_progress_percent,
345+
skyhook_rollout_current_batch,
346+
skyhook_rollout_consecutive_failures,
347+
skyhook_rollout_should_stop,
170348
)
171349
}

0 commit comments

Comments
 (0)