44package metrics
55
66import (
7+ "math"
78 "sort"
9+ "strings"
810
911 v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
1012
1113 "github.com/prometheus/client_golang/prometheus"
1214 "github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
15+ v1 "k8s.io/api/core/v1"
16+ "k8s.io/apimachinery/pkg/api/resource"
1317 "sigs.k8s.io/controller-runtime/pkg/metrics"
1418)
1519
@@ -19,13 +23,18 @@ const (
1923 unlimitedQuota = float64 (- 1 )
2024
2125 queueNameLabel = "queue_name"
26+
27+ gpuResourceNameSuffix = "/gpu"
2228)
2329
2430var (
25- queueInfo * prometheus.GaugeVec
26- queueDeservedGPUs * prometheus.GaugeVec
27- queueQuotaCPU * prometheus.GaugeVec
28- queueQuotaMemory * prometheus.GaugeVec
31+ queueInfo * prometheus.GaugeVec
32+ queueDeservedGPUs * prometheus.GaugeVec
33+ queueQuotaCPU * prometheus.GaugeVec
34+ queueQuotaMemory * prometheus.GaugeVec
35+ queueAllocatedGpus * prometheus.GaugeVec
36+ queueAllocatedCpus * prometheus.GaugeVec
37+ queueAllocatedMemory * prometheus.GaugeVec
2938
3039 additionalQueueLabelKeys []string
3140 queueLabelToDefaultMetricValue map [string ]string
@@ -95,7 +104,32 @@ func InitMetrics(namespace string, queueLabelToMetricLabelMap, queueLabelToDefau
95104 }, queueMetricsLabels ,
96105 )
97106
98- metrics .Registry .MustRegister (queueInfo , queueDeservedGPUs , queueQuotaCPU , queueQuotaMemory )
107+ queueAllocatedGpus = promauto .NewGaugeVec (
108+ prometheus.GaugeOpts {
109+ Namespace : namespace ,
110+ Name : "queue_allocated_gpus" ,
111+ Help : "Queue allocated GPUs" ,
112+ }, queueMetricsLabels ,
113+ )
114+
115+ queueAllocatedCpus = promauto .NewGaugeVec (
116+ prometheus.GaugeOpts {
117+ Namespace : namespace ,
118+ Name : "queue_allocated_cpu_cores" ,
119+ Help : "Queue allocated CPUs" ,
120+ }, queueMetricsLabels ,
121+ )
122+
123+ queueAllocatedMemory = promauto .NewGaugeVec (
124+ prometheus.GaugeOpts {
125+ Namespace : namespace ,
126+ Name : "queue_allocated_memory_bytes" ,
127+ Help : "Queue allocated memory" ,
128+ }, queueMetricsLabels ,
129+ )
130+
131+ metrics .Registry .MustRegister (queueInfo , queueDeservedGPUs , queueQuotaCPU , queueQuotaMemory ,
132+ queueAllocatedGpus , queueAllocatedCpus , queueAllocatedMemory )
99133}
100134
101135func SetQueueMetrics (queue * v2.Queue ) {
@@ -109,15 +143,21 @@ func SetQueueMetrics(queue *v2.Queue) {
109143
110144 queueName := queue .Name
111145 gpuQuota := getGpuQuota (queue .Spec .Resources )
112- cpuQuota := getCpuQuota (queue .Spec .Resources )
113- memoryQuota := getMemoryQuota (queue .Spec .Resources )
146+ cpuQuota := getCpuQuotaCores (queue .Spec .Resources )
147+ memoryQuota := getMemoryQuotaBytes (queue .Spec .Resources )
148+ allocatedGpus := getAllocatedGpus (queue .Status )
149+ allocatedCpus := getAllocatedCpuCores (queue .Status )
150+ allocatedMemory := getAllocatedMemoryBytes (queue .Status )
114151
115152 queueQuotaMetricValues := append ([]string {queueName }, additionalMetricLabelValues ... )
116153
117154 queueInfo .WithLabelValues (queueQuotaMetricValues ... ).Set (1 )
118155 queueDeservedGPUs .WithLabelValues (queueQuotaMetricValues ... ).Set (gpuQuota )
119156 queueQuotaCPU .WithLabelValues (queueQuotaMetricValues ... ).Set (cpuQuota )
120157 queueQuotaMemory .WithLabelValues (queueQuotaMetricValues ... ).Set (memoryQuota )
158+ queueAllocatedGpus .WithLabelValues (queueQuotaMetricValues ... ).Set (allocatedGpus )
159+ queueAllocatedCpus .WithLabelValues (queueQuotaMetricValues ... ).Set (allocatedCpus )
160+ queueAllocatedMemory .WithLabelValues (queueQuotaMetricValues ... ).Set (allocatedMemory )
121161}
122162
123163func ResetQueueMetrics (queueName string ) {
@@ -126,6 +166,9 @@ func ResetQueueMetrics(queueName string) {
126166 queueDeservedGPUs .DeletePartialMatch (queueLabelIdentifier )
127167 queueQuotaCPU .DeletePartialMatch (queueLabelIdentifier )
128168 queueQuotaMemory .DeletePartialMatch (queueLabelIdentifier )
169+ queueAllocatedGpus .DeletePartialMatch (queueLabelIdentifier )
170+ queueAllocatedCpus .DeletePartialMatch (queueLabelIdentifier )
171+ queueAllocatedMemory .DeletePartialMatch (queueLabelIdentifier )
129172}
130173
131174func getGpuQuota (queueSpecResources * v2.QueueResources ) float64 {
@@ -135,7 +178,7 @@ func getGpuQuota(queueSpecResources *v2.QueueResources) float64 {
135178 return queueSpecResources .GPU .Quota
136179}
137180
138- func getCpuQuota (queueSpecResources * v2.QueueResources ) float64 {
181+ func getCpuQuotaCores (queueSpecResources * v2.QueueResources ) float64 {
139182 if queueSpecResources == nil {
140183 return float64 (0 )
141184 }
@@ -146,7 +189,7 @@ func getCpuQuota(queueSpecResources *v2.QueueResources) float64 {
146189 return queueSpecResources .CPU .Quota / milliCpuToCpuDivider
147190}
148191
149- func getMemoryQuota (queueSpecResources * v2.QueueResources ) float64 {
192+ func getMemoryQuotaBytes (queueSpecResources * v2.QueueResources ) float64 {
150193 if queueSpecResources == nil {
151194 return float64 (0 )
152195 }
@@ -157,6 +200,35 @@ func getMemoryQuota(queueSpecResources *v2.QueueResources) float64 {
157200 return memoryQuota * megabytesToBytesMultiplier
158201}
159202
203+ func getAllocatedGpus (queueStatus v2.QueueStatus ) float64 {
204+ for resourceName , quantity := range queueStatus .Allocated {
205+ if strings .HasSuffix (string (resourceName ), gpuResourceNameSuffix ) {
206+ return roundResourceQuantity (quantity )
207+ }
208+ }
209+ return 0
210+ }
211+
212+ func getAllocatedCpuCores (queueStatus v2.QueueStatus ) float64 {
213+ allocated , ok := queueStatus .Allocated [v1 .ResourceCPU ]
214+ if ! ok {
215+ return 0
216+ }
217+ return roundResourceQuantity (allocated )
218+ }
219+
220+ func getAllocatedMemoryBytes (queueStatus v2.QueueStatus ) float64 {
221+ allocated , ok := queueStatus .Allocated [v1 .ResourceMemory ]
222+ if ! ok {
223+ return 0
224+ }
225+ return roundResourceQuantity (allocated )
226+ }
227+
228+ func roundResourceQuantity (quantity resource.Quantity ) float64 {
229+ return math .Round (quantity .AsApproximateFloat64 ()* 10000 ) / 10000
230+ }
231+
160232func getAdditionalMetricLabelValues (queueLabels map [string ]string ) []string {
161233 labelValues := make ([]string , len (additionalQueueLabelKeys ))
162234
@@ -191,3 +263,15 @@ func GetQueueQuotaCPUMetric() *prometheus.GaugeVec {
191263func GetQueueQuotaMemoryMetric () * prometheus.GaugeVec {
192264 return queueQuotaMemory
193265}
266+
267+ func GetQueueAllocatedGPUsMetric () * prometheus.GaugeVec {
268+ return queueAllocatedGpus
269+ }
270+
271+ func GetQueueAllocatedCPUMetric () * prometheus.GaugeVec {
272+ return queueAllocatedCpus
273+ }
274+
275+ func GetQueueAllocatedMemoryMetric () * prometheus.GaugeVec {
276+ return queueAllocatedMemory
277+ }
0 commit comments