Skip to content

Commit 847bc71

Browse files
authored
Time aware fairness usage: prometheus client (#435)
* Added prometheus client for queue usage * Added query latency metric * Added unstructured, plugin-specific config for usage db clients
1 parent 68a87a2 commit 847bc71

File tree

21 files changed

+797
-339
lines changed

21 files changed

+797
-339
lines changed

docs/metrics/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Kube-Prometheus-Stack
2+
Install the Prometheus operator and enable a Prometheus instance (and, optionally, Grafana):
3+
```sh
4+
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
5+
helm repo update prometheus-community
6+
helm upgrade -i --create-namespace -n monitoring kube-prometheus-stack prometheus-community/kube-prometheus-stack --values kube-prometheus-values.yaml
7+
```
8+
9+
# Service Monitors for kai services
10+
11+
Install a Prometheus instance and the relevant service monitors in the kai-scheduler namespace:
12+
13+
```sh
14+
kubectl apply -f prometheus.yaml
15+
kubectl apply -f service-monitors.yaml
16+
```
17+
18+
To add this Prometheus instance as a Grafana datasource (optional), apply grafana-datasource.yaml:
19+
20+
```sh
21+
kubectl apply -f grafana-datasource.yaml
22+
```
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: v1
5+
kind: ConfigMap
6+
metadata:
7+
name: grafana-datasource-kai-prom
8+
namespace: monitoring # Modify according to your Grafana namespace
9+
labels:
10+
grafana_datasource: "1"
11+
data:
12+
kai-prometheus.yaml: |
13+
apiVersion: 1
14+
datasources:
15+
- name: kai-prometheus
16+
type: prometheus
17+
access: proxy
18+
url: http://prometheus-operated.kai-scheduler.svc:9090 # Modify according to your Prometheus URL
19+
isDefault: false
20+
editable: true
21+
jsonData:
22+
httpMethod: POST
23+
timeInterval: 30s
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
prometheus:
5+
enabled: true
6+
grafana:
7+
enabled: true
8+
persistence:
9+
enabled: true
10+
type: pvc
11+
accessModes:
12+
- ReadWriteOnce
13+
size: 10Gi
14+
# storageClassName: "" # uncomment and set to your StorageClass if not using the default
15+
finalizers:
16+
- kubernetes.io/pvc-protection
17+
# Keep sidecars enabled so ConfigMaps like `docs/metrics/grafana-datasource.yaml`
18+
# (labeled with `grafana_datasource: "1"`) are automatically loaded.
19+
sidecar:
20+
datasources:
21+
enabled: true
22+
dashboards:
23+
enabled: true

docs/metrics/prometheus.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: v1
5+
kind: ServiceAccount
6+
metadata:
7+
name: prometheus
8+
namespace: kai-scheduler
9+
---
10+
apiVersion: rbac.authorization.k8s.io/v1
11+
kind: ClusterRole
12+
metadata:
13+
name: prometheus
14+
rules:
15+
- apiGroups: [""]
16+
resources:
17+
- nodes
18+
- nodes/proxy
19+
- services
20+
- endpoints
21+
- pods
22+
verbs: ["get", "list", "watch"]
23+
- apiGroups: [""]
24+
resources:
25+
- configmaps
26+
verbs: ["get"]
27+
---
28+
apiVersion: rbac.authorization.k8s.io/v1
29+
kind: ClusterRoleBinding
30+
metadata:
31+
name: prometheus
32+
roleRef:
33+
apiGroup: rbac.authorization.k8s.io
34+
kind: ClusterRole
35+
name: prometheus
36+
subjects:
37+
- kind: ServiceAccount
38+
name: prometheus
39+
namespace: kai-scheduler
40+
---
41+
apiVersion: monitoring.coreos.com/v1
42+
kind: Prometheus
43+
metadata:
44+
name: kai
45+
namespace: kai-scheduler
46+
spec:
47+
replicas: 1
48+
serviceAccountName: prometheus
49+
enableFeatures:
50+
- promql-experimental-functions
51+
scrapeInterval: 1m
52+
storage:
53+
volumeClaimTemplate:
54+
spec:
55+
accessModes:
56+
- ReadWriteOnce
57+
resources:
58+
requests:
59+
storage: 20Gi
60+
serviceMonitorSelector:
61+
matchLabels:
62+
accounting: kai-scheduler
63+
serviceMonitorNamespaceSelector: {}
64+
podMonitorNamespaceSelector: {}
65+
probeNamespaceSelector: {}

docs/metrics/service-monitors.yaml

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright 2025 NVIDIA CORPORATION
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: monitoring.coreos.com/v1
5+
kind: ServiceMonitor
6+
metadata:
7+
name: binder
8+
namespace: kai-scheduler
9+
labels:
10+
accounting: kai-scheduler
11+
spec:
12+
jobLabel: binder
13+
namespaceSelector:
14+
matchNames:
15+
- kai-scheduler
16+
selector:
17+
matchLabels:
18+
app: binder
19+
endpoints:
20+
- port: http-metrics
21+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
22+
---
23+
apiVersion: monitoring.coreos.com/v1
24+
kind: ServiceMonitor
25+
metadata:
26+
name: scheduler
27+
namespace: kai-scheduler
28+
labels:
29+
accounting: kai-scheduler
30+
spec:
31+
jobLabel: scheduler
32+
namespaceSelector:
33+
matchNames:
34+
- kai-scheduler
35+
selector:
36+
matchLabels:
37+
app: scheduler
38+
endpoints:
39+
- port: http-metrics
40+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
41+
---
42+
apiVersion: monitoring.coreos.com/v1
43+
kind: ServiceMonitor
44+
metadata:
45+
name: queuecontroller
46+
namespace: kai-scheduler
47+
labels:
48+
accounting: kai-scheduler
49+
spec:
50+
jobLabel: queuecontroller
51+
namespaceSelector:
52+
matchNames:
53+
- kai-scheduler
54+
selector:
55+
matchLabels:
56+
app: queuecontroller
57+
endpoints:
58+
- port: metrics
59+
---
60+
apiVersion: monitoring.coreos.com/v1
61+
kind: ServiceMonitor
62+
metadata:
63+
name: dcgm-exporter
64+
namespace: kai-scheduler
65+
labels:
66+
accounting: kai-scheduler
67+
spec:
68+
jobLabel: dcgm-exporter
69+
namespaceSelector:
70+
matchNames:
71+
- gpu-operator
72+
selector:
73+
matchLabels:
74+
app: nvidia-dcgm-exporter
75+
endpoints:
76+
- port: gpu-metrics
77+
---
78+
apiVersion: monitoring.coreos.com/v1
79+
kind: ServiceMonitor
80+
metadata:
81+
name: kube-state-metrics
82+
namespace: kai-scheduler
83+
labels:
84+
accounting: kai-scheduler
85+
spec:
86+
jobLabel: kube-state-metrics
87+
namespaceSelector:
88+
matchNames:
89+
- monitoring
90+
selector:
91+
matchLabels:
92+
app.kubernetes.io/name: kube-state-metrics
93+
endpoints:
94+
- port: http
95+
interval: 30s
96+
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ require (
1818
github.com/onsi/gomega v1.37.0
1919
github.com/pkg/errors v0.9.1
2020
github.com/prometheus/client_golang v1.22.0
21+
github.com/prometheus/common v0.63.0
2122
github.com/ray-project/kuberay/ray-operator v1.3.1
2223
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
2324
github.com/spf13/pflag v1.0.6
@@ -142,7 +143,6 @@ require (
142143
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
143144
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
144145
github.com/prometheus/client_model v0.6.2 // indirect
145-
github.com/prometheus/common v0.63.0 // indirect
146146
github.com/prometheus/procfs v0.16.0 // indirect
147147
github.com/robfig/cron/v3 v3.0.1 // indirect
148148
github.com/samber/lo v1.47.0 // indirect

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ github.com/jonboulle/clockwork v0.4.0 h1:p4Cf1aMWXnXAUh8lVfewRBx1zaTSYKrKMF2g3ST
192192
github.com/jonboulle/clockwork v0.4.0/go.mod h1:xgRqUGwRcjKCO1vbZUEtSLrqKoPSsUpK7fnezOII0kc=
193193
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
194194
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
195+
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
196+
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
195197
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
196198
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
197199
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
@@ -233,6 +235,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
233235
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
234236
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
235237
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
238+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
239+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
236240
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
237241
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
238242
github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32 h1:W6apQkHrMkS0Muv8G/TipAy/FJl/rCYT0+EuS8+Z0z4=

pkg/scheduler/api/queue_info/quota_info.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33

44
package queue_info
55

6-
import "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
6+
import (
7+
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
8+
v1 "k8s.io/api/core/v1"
9+
)
710

811
type QueueQuota struct {
912
GPU ResourceQuota `json:"gpu,omitempty"`
@@ -20,18 +23,14 @@ type ResourceQuota struct {
2023
Limit float64 `json:"limit"`
2124
}
2225

23-
type QueueUsage struct {
24-
GPU float64 `json:"gpu,omitempty"`
25-
CPU float64 `json:"cpu,omitempty"`
26-
Memory float64 `json:"memory,omitempty"`
27-
}
26+
type QueueUsage map[v1.ResourceName]float64
2827

2928
type ClusterUsage struct {
30-
Queues map[common_info.QueueID]*QueueUsage `json:"queues"`
29+
Queues map[common_info.QueueID]QueueUsage `json:"queues"`
3130
}
3231

3332
func NewClusterUsage() *ClusterUsage {
3433
return &ClusterUsage{
35-
Queues: make(map[common_info.QueueID]*QueueUsage),
34+
Queues: make(map[common_info.QueueID]QueueUsage),
3635
}
3736
}

0 commit comments

Comments
 (0)