Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions docs/metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Kube-Prometheus-Stack
Install the Prometheus Operator and enable a Prometheus instance (and, optionally, Grafana):
```sh
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update prometheus-community
helm upgrade -i --create-namespace -n monitoring kube-prometheus-stack prometheus-community/kube-prometheus-stack --values kube-prometheus-values.yaml
```

# Service Monitors for KAI services

Install a Prometheus instance and the relevant ServiceMonitors in the `kai-scheduler` namespace:

```sh
kubectl apply -f prometheus.yaml
kubectl apply -f service-monitors.yaml
```

To register this Prometheus instance as a Grafana datasource, if desired, apply `grafana-datasource.yaml`:

```sh
kubectl apply -f grafana-datasource.yaml
```
23 changes: 23 additions & 0 deletions docs/metrics/grafana-datasource.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasource-kai-prom
namespace: monitoring # Modify according to your Grafana namespace
labels:
grafana_datasource: "1"
data:
kai-prometheus.yaml: |
apiVersion: 1
datasources:
- name: kai-prometheus
type: prometheus
access: proxy
url: http://prometheus-operated.kai-scheduler.svc:9090 # Modify according to your Prometheus URL
isDefault: false
editable: true
jsonData:
httpMethod: POST
timeInterval: 30s
23 changes: 23 additions & 0 deletions docs/metrics/kube-prometheus-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

prometheus:
enabled: true
grafana:
enabled: true
persistence:
enabled: true
type: pvc
accessModes:
- ReadWriteOnce
size: 10Gi
# storageClassName: "" # uncomment and set to your StorageClass if not using the default
finalizers:
- kubernetes.io/pvc-protection
# Keep sidecars enabled so ConfigMaps like `docs/metrics/grafana-datasource.yaml`
# (labeled with `grafana_datasource: "1"`) are automatically loaded.
sidecar:
datasources:
enabled: true
dashboards:
enabled: true
65 changes: 65 additions & 0 deletions docs/metrics/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: kai-scheduler
---
# Cluster-wide, read-only permissions for the `prometheus` ServiceAccount
# (bound by the ClusterRoleBinding of the same name in this manifest).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
# get/list/watch on core-group objects; presumably what Prometheus needs to
# discover and scrape targets in any namespace.
- apiGroups: [""]
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
# ConfigMaps are readable by name only (no list/watch).
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: kai-scheduler
---
# Prometheus instance for KAI Scheduler metrics, declared as a Prometheus
# Operator `Prometheus` custom resource in the kai-scheduler namespace.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: kai
namespace: kai-scheduler
spec:
replicas: 1
# Runs as the `prometheus` ServiceAccount declared at the top of this manifest.
serviceAccountName: prometheus
enableFeatures:
- promql-experimental-functions
# Default scrape interval; individual ServiceMonitor endpoints may set their own `interval`.
scrapeInterval: 1m
# Persistent storage: a 20Gi ReadWriteOnce volume claim. No storageClassName is
# set, so the cluster's default StorageClass presumably applies.
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi
# Only ServiceMonitors labeled `accounting: kai-scheduler` are picked up.
serviceMonitorSelector:
matchLabels:
accounting: kai-scheduler
# Empty (`{}`) namespace selectors: per the Prometheus Operator API an empty
# selector matches all namespaces, whereas an omitted one matches only this namespace.
# NOTE(review): no podMonitorSelector / probeSelector is set here - confirm whether
# PodMonitors and Probes are expected to be discovered at all.
serviceMonitorNamespaceSelector: {}
podMonitorNamespaceSelector: {}
probeNamespaceSelector: {}
96 changes: 96 additions & 0 deletions docs/metrics/service-monitors.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# Scrapes Services labeled `app: binder` in the kai-scheduler namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: binder
namespace: kai-scheduler
labels:
# Label matched by the `serviceMonitorSelector` of the Prometheus resource in prometheus.yaml.
accounting: kai-scheduler
spec:
# NOTE(review): per the Prometheus Operator API, `jobLabel` is the NAME of a
# Service label whose value becomes the `job` label; if the Service has no
# `binder` label, the job falls back to the Service name - confirm this is intended.
jobLabel: binder
namespaceSelector:
matchNames:
- kai-scheduler
selector:
matchLabels:
app: binder
endpoints:
# `port` refers to a named port on the selected Service.
- port: http-metrics
# Standard in-pod ServiceAccount token path; presumably read from the Prometheus pod.
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
---
# Scrapes Services labeled `app: scheduler` in the kai-scheduler namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: scheduler
namespace: kai-scheduler
labels:
# Label matched by the `serviceMonitorSelector` of the Prometheus resource in prometheus.yaml.
accounting: kai-scheduler
spec:
# NOTE(review): `jobLabel` names a Service label to read the job name from; if the
# Service has no `scheduler` label, the job falls back to the Service name - confirm.
jobLabel: scheduler
namespaceSelector:
matchNames:
- kai-scheduler
selector:
matchLabels:
app: scheduler
endpoints:
# `port` refers to a named port on the selected Service.
- port: http-metrics
# Standard in-pod ServiceAccount token path; presumably read from the Prometheus pod.
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
---
# Scrapes Services labeled `app: queuecontroller` in the kai-scheduler namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: queuecontroller
namespace: kai-scheduler
labels:
# Label matched by the `serviceMonitorSelector` of the Prometheus resource in prometheus.yaml.
accounting: kai-scheduler
spec:
# NOTE(review): `jobLabel` names a Service label to read the job name from; if the
# Service has no `queuecontroller` label, the job falls back to the Service name - confirm.
jobLabel: queuecontroller
namespaceSelector:
matchNames:
- kai-scheduler
selector:
matchLabels:
app: queuecontroller
endpoints:
# Unlike binder/scheduler, this endpoint uses the `metrics` port name and no bearer token.
- port: metrics
---
# Scrapes the DCGM exporter Service (`app: nvidia-dcgm-exporter`). The
# ServiceMonitor lives in kai-scheduler but targets another namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: dcgm-exporter
namespace: kai-scheduler
labels:
# Label matched by the `serviceMonitorSelector` of the Prometheus resource in prometheus.yaml.
accounting: kai-scheduler
spec:
# NOTE(review): `jobLabel` names a Service label to read the job name from; if the
# Service has no `dcgm-exporter` label, the job falls back to the Service name - confirm.
jobLabel: dcgm-exporter
namespaceSelector:
matchNames:
# Namespace of the exporter Service; presumably where the NVIDIA GPU Operator is
# installed - adjust if your installation uses a different namespace.
- gpu-operator
selector:
matchLabels:
app: nvidia-dcgm-exporter
endpoints:
# `port` refers to a named port on the selected Service.
- port: gpu-metrics
---
# Scrapes kube-state-metrics (`app.kubernetes.io/name: kube-state-metrics`). The
# ServiceMonitor lives in kai-scheduler but targets the `monitoring` namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-state-metrics
namespace: kai-scheduler
labels:
# Label matched by the `serviceMonitorSelector` of the Prometheus resource in prometheus.yaml.
accounting: kai-scheduler
spec:
# NOTE(review): `jobLabel` names a Service label to read the job name from; if the
# Service has no `kube-state-metrics` label, the job falls back to the Service name - confirm.
jobLabel: kube-state-metrics
namespaceSelector:
matchNames:
# Must match the namespace kube-prometheus-stack was installed into
# (the README's helm command uses `-n monitoring`).
- monitoring
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
endpoints:
- port: http
# Overrides the Prometheus-wide `scrapeInterval` (1m in prometheus.yaml) for this endpoint.
interval: 30s
# Standard in-pod ServiceAccount token path; presumably read from the Prometheus pod.
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ require (
github.com/onsi/gomega v1.37.0
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.22.0
github.com/prometheus/common v0.63.0
github.com/ray-project/kuberay/ray-operator v1.3.1
github.com/run-ai/kwok-operator v0.0.0-20240926063032-05b6364bc7c7
github.com/spf13/pflag v1.0.6
Expand Down Expand Up @@ -142,7 +143,6 @@ require (
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.63.0 // indirect
github.com/prometheus/procfs v0.16.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/samber/lo v1.47.0 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ github.com/jonboulle/clockwork v0.4.0 h1:p4Cf1aMWXnXAUh8lVfewRBx1zaTSYKrKMF2g3ST
github.com/jonboulle/clockwork v0.4.0/go.mod h1:xgRqUGwRcjKCO1vbZUEtSLrqKoPSsUpK7fnezOII0kc=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
Expand Down Expand Up @@ -233,6 +235,8 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/nbio/st v0.0.0-20140626010706-e9e8d9816f32 h1:W6apQkHrMkS0Muv8G/TipAy/FJl/rCYT0+EuS8+Z0z4=
Expand Down
15 changes: 7 additions & 8 deletions pkg/scheduler/api/queue_info/quota_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

package queue_info

import "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
import (
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/common_info"
v1 "k8s.io/api/core/v1"
)

type QueueQuota struct {
GPU ResourceQuota `json:"gpu,omitempty"`
Expand All @@ -20,18 +23,14 @@ type ResourceQuota struct {
Limit float64 `json:"limit"`
}

type QueueUsage struct {
GPU float64 `json:"gpu,omitempty"`
CPU float64 `json:"cpu,omitempty"`
Memory float64 `json:"memory,omitempty"`
}
type QueueUsage map[v1.ResourceName]float64

type ClusterUsage struct {
Queues map[common_info.QueueID]*QueueUsage `json:"queues"`
Queues map[common_info.QueueID]QueueUsage `json:"queues"`
}

func NewClusterUsage() *ClusterUsage {
return &ClusterUsage{
Queues: make(map[common_info.QueueID]*QueueUsage),
Queues: make(map[common_info.QueueID]QueueUsage),
}
}
Loading
Loading