Skip to content

Commit 5bbe5b9

Browse files
committed
Add query latency metric
1 parent 011de52 commit 5bbe5b9

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

pkg/scheduler/cache/usagedb/usagedb.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/queue_info"
1212
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
1313
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log"
14+
"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/metrics"
1415
)
1516

1617
var defaultFetchInterval = 1 * time.Minute
@@ -135,14 +136,13 @@ func (l *UsageLister) WaitForCacheSync(stopCh <-chan struct{}) bool {
135136
}
136137

137138
func (l *UsageLister) fetchAndUpdateUsage() {
138-
// TODO: Add metrics for fetch times
139+
now := time.Now()
139140
usage, err := l.client.GetResourceUsage()
140141
if err != nil {
141142
log.InfraLogger.V(1).Errorf("failed to fetch usage data: %v", err)
142143
return
143144
}
144-
145-
now := time.Now()
145+
metrics.UpdateUsageQueryLatency(time.Since(now))
146146

147147
l.lastUsageDataMutex.Lock()
148148
defer l.lastUsageDataMutex.Unlock()

pkg/scheduler/metrics/metrics.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ var (
5454
queueCPUUsage *prometheus.GaugeVec
5555
queueMemoryUsage *prometheus.GaugeVec
5656
queueGPUUsage *prometheus.GaugeVec
57+
usageQueryLatency *prometheus.HistogramVec
5758
)
5859

5960
func init() {
@@ -190,6 +191,14 @@ func InitMetrics(namespace string) {
190191
Name: "queue_gpu_usage_devices",
191192
Help: "GPU usage of queue, as a gauge. Value is proportional to gpu*hours usage with time decay applied",
192193
}, []string{"queue_name"})
194+
195+
usageQueryLatency = promauto.NewHistogramVec(
196+
prometheus.HistogramOpts{
197+
Namespace: namespace,
198+
Name: "usage_query_latency_milliseconds",
199+
Help: "Usage database query latency histogram in milliseconds",
200+
Buckets: prometheus.ExponentialBuckets(5, 2, 10),
201+
}, []string{})
193202
}
194203

195204
// UpdateOpenSessionDuration updates latency for open session, including all plugins
@@ -277,6 +286,10 @@ func ResetQueueUsage() {
277286
queueGPUUsage.Reset()
278287
}
279288

289+
// UpdateUsageQueryLatency records a single usage database query duration in
// the usageQueryLatency histogram, expressed in milliseconds.
// The value comes from latency.Milliseconds(), which is a whole-number count,
// so any sub-millisecond remainder is dropped before the sample is observed.
func UpdateUsageQueryLatency(latency time.Duration) {
290+
usageQueryLatency.WithLabelValues().Observe(float64(latency.Milliseconds()))
291+
}
292+
280293
// RegisterPreemptionAttempts records number of attempts for preemption
281294
func RegisterPreemptionAttempts() {
282295
preemptionAttempts.Inc()

0 commit comments

Comments
 (0)