2 files changed: +16 -3 lines changed
lines changed Original file line number Diff line number Diff line change @@ -11,6 +11,7 @@ import (
1111 "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/api/queue_info"
1212 "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/cache/usagedb/api"
1313 "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log"
14+ "github.com/NVIDIA/KAI-scheduler/pkg/scheduler/metrics"
1415)
1516
1617var defaultFetchInterval = 1 * time .Minute
@@ -135,14 +136,13 @@ func (l *UsageLister) WaitForCacheSync(stopCh <-chan struct{}) bool {
135136}
136137
137138func (l * UsageLister ) fetchAndUpdateUsage () {
138- // TODO: Add metrics for fetch times
139+ now := time . Now ()
139140 usage , err := l .client .GetResourceUsage ()
140141 if err != nil {
141142 log .InfraLogger .V (1 ).Errorf ("failed to fetch usage data: %v" , err )
142143 return
143144 }
144-
145- now := time .Now ()
145+ metrics .UpdateUsageQueryLatency (time .Since (now ))
146146
147147 l .lastUsageDataMutex .Lock ()
148148 defer l .lastUsageDataMutex .Unlock ()
Original file line number Diff line number Diff line change 5454 queueCPUUsage * prometheus.GaugeVec
5555 queueMemoryUsage * prometheus.GaugeVec
5656 queueGPUUsage * prometheus.GaugeVec
57+ usageQueryLatency * prometheus.HistogramVec
5758)
5859
5960func init () {
@@ -190,6 +191,14 @@ func InitMetrics(namespace string) {
190191 Name : "queue_gpu_usage_devices" ,
191192 Help : "GPU usage of queue, as a gauge. Value is proportional to gpu*hours usage with time decay applied" ,
192193 }, []string {"queue_name" })
194+
195+ usageQueryLatency = promauto .NewHistogramVec (
196+ prometheus.HistogramOpts {
197+ Namespace : namespace ,
198+ Name : "usage_query_latency_milliseconds" ,
199+ Help : "Usage database query latency histogram in milliseconds" ,
200+ Buckets : prometheus .ExponentialBuckets (5 , 2 , 10 ),
201+ }, []string {})
193202}
194203
195204// UpdateOpenSessionDuration updates latency for open session, including all plugins
@@ -277,6 +286,10 @@ func ResetQueueUsage() {
277286 queueGPUUsage .Reset ()
278287}
279288
289+ func UpdateUsageQueryLatency (latency time.Duration ) {
290+ usageQueryLatency .WithLabelValues ().Observe (float64 (latency .Milliseconds ()))
291+ }
292+
280293// RegisterPreemptionAttempts records number of attempts for preemption
281294func RegisterPreemptionAttempts () {
282295 preemptionAttempts .Inc ()
You can’t perform that action at this time.
0 commit comments