Skip to content

Commit 75f73b3

Browse files
committed
feat(operator): add a metric for taint scheduling
1 parent e4d8eb5 commit 75f73b3

File tree

5 files changed

+58
-2
lines changed

5 files changed

+58
-2
lines changed

docs/metrics/grafana_values.yaml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,27 @@
1+
#
2+
# LICENSE START
3+
#
4+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# LICENSE END
19+
#
20+
121
datasources:
222
datasources.yaml:
323
apiVersion: 1
424
datasources:
525
- name: Prometheus
626
type: prometheus
7-
url: http://prometheus-server.default.svc.cluster.local
27+
url: http://prometheus-server.default.svc.cluster.local

docs/metrics/prometheus_values.yaml

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
#
2+
# LICENSE START
3+
#
4+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
# LICENSE END
19+
#
20+
121
extraScrapeConfigs: |
222
- job_name: 'skyhook'
323
metrics_path: /metrics
@@ -7,4 +27,3 @@ extraScrapeConfigs: |
727
static_configs:
828
- targets:
929
- skyhook-operator-controller-manager-metrics-service.skyhook.svc.cluster.local:8443
10-

k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,16 @@ spec:
4949
file: skyhook.yaml
5050
- assert:
5151
file: assert.yaml
52+
- script:
53+
content: |
54+
../metrics_test.py skyhook_node_taint_tolerance_issue_count 1 -t skyhook_name=taint-scheduling
5255
- patch:
5356
file: update-skyhook.yaml
5457
- assert:
5558
file: assert-update.yaml
59+
- script:
60+
content: |
61+
../metrics_test.py skyhook_node_taint_tolerance_issue_count 0 -t skyhook_name=taint-scheduling
5662
finally:
5763
- script:
5864
content: |

operator/internal/controller/cluster_state_v2.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,7 @@ func (np *NodePicker) SelectNodes(s SkyhookNodes) []wrapper.SkyhookNode {
448448
nodesWithTaintTolerationIssue = append(nodesWithTaintTolerationIssue, node.GetNode().Name)
449449
}
450450
}
451+
skyhook_node_taint_tolerance_issue_count.WithLabelValues(s.GetSkyhook().Name).Set(float64(len(nodesWithTaintTolerationIssue)))
451452

452453
// if any nodes have taint toleration issues, we need to add a condition to the skyhook
453454
if len(nodesWithTaintTolerationIssue) > 0 {

operator/internal/controller/metrics.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,14 @@ var (
131131
},
132132
[]string{"skyhook_name", "package_name", "package_version", "stage"},
133133
)
134+
135+
skyhook_node_taint_tolerance_issue_count = prometheus.NewGaugeVec(
136+
prometheus.GaugeOpts{
137+
Name: "skyhook_node_taint_tolerance_issue_count",
138+
Help: "Number of nodes in the cluster that have taint tolerance issues",
139+
},
140+
[]string{"skyhook_name"},
141+
)
134142
)
135143

136144
func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
@@ -142,6 +150,7 @@ func zeroOutSkyhookMetrics(skyhook SkyhookNodes) {
142150
skyhook_node_complete_count.DeleteLabelValues(skyhook.GetSkyhook().Name)
143151
skyhook_node_error_count.DeleteLabelValues(skyhook.GetSkyhook().Name)
144152
skyhook_node_blocked_count.DeleteLabelValues(skyhook.GetSkyhook().Name)
153+
skyhook_node_taint_tolerance_issue_count.DeleteLabelValues(skyhook.GetSkyhook().Name)
145154
for _, _package := range skyhook.GetSkyhook().Spec.Packages {
146155
zeroOutSkyhookPackageMetrics(skyhook.GetSkyhook().Name, _package.Name, _package.Version)
147156
}
@@ -172,5 +181,6 @@ func init() {
172181
skyhook_package_complete_count,
173182
skyhook_package_stage_count,
174183
skyhook_package_restarts_count,
184+
skyhook_node_taint_tolerance_issue_count,
175185
)
176186
}

0 commit comments

Comments
 (0)