Skip to content

Commit 5702f4e

Browse files
authored
Natasha/add queue allocation metrics (#338)
* added allocation queue metrics + tests * fixed gpu resource constant * small fix in logs
1 parent d0d8cc3 commit 5702f4e

File tree

7 files changed

+157
-20
lines changed

7 files changed

+157
-20
lines changed

pkg/podgroupcontroller/controllers/resources/received.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package resources
66
import (
77
"context"
88

9+
"github.com/NVIDIA/KAI-scheduler/pkg/common/constants"
10+
911
v1 "k8s.io/api/core/v1"
1012
"sigs.k8s.io/controller-runtime/pkg/client"
1113
)
@@ -27,6 +29,6 @@ func ExtractGPUSharingReceivedResources(ctx context.Context, pod *v1.Pod, kubeCl
2729
}
2830

2931
fractionResource, err := calculateAllocatedFraction(ctx, pod, kubeClient)
30-
resources[gpuFractionResourceName] = fractionResource
32+
resources[constants.GpuResource] = fractionResource
3133
return resources, err
3234
}

pkg/podgroupcontroller/controllers/resources/received_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func Test_extractReceivedResources(t *testing.T) {
6767
},
6868
&v1.Node{},
6969
},
70-
v1.ResourceList{gpuFractionResourceName: resource.MustParse("0.4")},
70+
v1.ResourceList{constants.GpuResource: resource.MustParse("0.4")},
7171
false,
7272
},
7373
}

pkg/podgroupcontroller/controllers/resources/requested.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ import (
1313
)
1414

1515
const (
16-
gpuMemoryResourceName = "run.ai/gpu.memory"
17-
gpuFractionResourceName = "nvidia.com/gpu"
16+
gpuMemoryResourceName = "run.ai/gpu.memory"
1817
)
1918

2019
func ExtractGPUSharingRequestedResources(pod *v1.Pod) (v1.ResourceList, error) {
@@ -53,7 +52,7 @@ func ExtractGPUSharingRequestedResources(pod *v1.Pod) (v1.ResourceList, error) {
5352
"Please check resource.Quantity restrictions. fraction <%s>, count: %d",
5453
gpuFractionStr, fractionsCount)
5554
}
56-
resources[v1.ResourceName(gpuFractionResourceName)] = quantity
55+
resources[v1.ResourceName(constants.GpuResource)] = quantity
5756
}
5857

5958
gpuMemoryStr, hasAnnotation := pod.Annotations[constants.GpuMemory]

pkg/podgrouper/podgrouper/plugins/defaultgrouper/default_grouper.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ func (dg *DefaultGrouper) CalcPodGroupPriorityClass(topOwner *unstructured.Unstr
161161
}
162162

163163
if priorityClassName != "" {
164-
logger.V(2).Info("priorityClassName from pod or owner labels is not valid, falling back to default",
164+
logger.V(1).Info("priorityClassName from pod or owner labels is not valid, falling back to default",
165165
"priorityClassName", priorityClassName, "topOwner", topOwner.GetName(), "pod", pod.GetName())
166166
}
167167

@@ -171,7 +171,7 @@ func (dg *DefaultGrouper) CalcPodGroupPriorityClass(topOwner *unstructured.Unstr
171171
return priorityClassName
172172
}
173173

174-
logger.V(2).Info("No default priority class found for group kind, using default fallback",
174+
logger.V(1).Info("No default priority class found for group kind, using default fallback",
175175
"groupKind", groupKind.String(), "defaultFallback", defaultPriorityClassForJob)
176176
return defaultPriorityClassForJob
177177
}
@@ -195,7 +195,7 @@ func (dg *DefaultGrouper) validatePriorityClassExists(priorityClassName string)
195195
priorityClass := &schedulingv1.PriorityClass{}
196196
err := dg.kubeReader.Get(context.Background(), client.ObjectKey{Name: priorityClassName}, priorityClass)
197197
if err != nil {
198-
logger.V(1).Error(err, "Failed to get priority class", "priorityClassName", priorityClassName)
198+
logger.V(1).Info("Failed to get priority class", "priorityClassName", priorityClassName, "error", err.Error())
199199
return false
200200
}
201201
return true

pkg/queuecontroller/controllers/suite_test.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ var _ = Describe("QueueController", Ordered, func() {
298298
q.Expect(updatedQueue.Status.Requested["cpu"]).To(Equal(resource.MustParse("5")))
299299
q.Expect(updatedQueue.Status.Requested["memory"]).To(Equal(resource.MustParse("10Gi")))
300300
q.Expect(updatedQueue.Status.Requested["nvidia.com/gpu"]).To(Equal(resource.MustParse("2")))
301+
302+
labels := []string{"resource-queue", "normal", ""}
303+
expectMetricValue(q, metrics.GetQueueAllocatedGPUsMetric(), labels, 2)
304+
expectMetricValue(q, metrics.GetQueueAllocatedCPUMetric(), labels, 5)
305+
expectMetricValue(q, metrics.GetQueueAllocatedMemoryMetric(), labels, 10737418240)
301306
return true
302307
}, timeout, interval).Should(BeTrue())
303308
})
@@ -345,16 +350,22 @@ var _ = Describe("QueueController", Ordered, func() {
345350
q.Expect(gathered).To(Equal(0))
346351
gathered = testutil.CollectAndCount(metrics.GetQueueQuotaMemoryMetric())
347352
q.Expect(gathered).To(Equal(0))
353+
gathered = testutil.CollectAndCount(metrics.GetQueueAllocatedGPUsMetric())
354+
q.Expect(gathered).To(Equal(0))
355+
gathered = testutil.CollectAndCount(metrics.GetQueueAllocatedCPUMetric())
356+
q.Expect(gathered).To(Equal(0))
357+
gathered = testutil.CollectAndCount(metrics.GetQueueAllocatedMemoryMetric())
358+
q.Expect(gathered).To(Equal(0))
348359
}, timeout, interval).Should(Succeed())
349360
})
350361
})
351362
})
352363

353364
func expectMetricValue(q gomega.Gomega, gauge *prometheus.GaugeVec, labels []string, expected float64) {
354365
metricGauge, err := gauge.GetMetricWithLabelValues(labels...)
355-
q.Expect(err).To(BeNil())
356-
q.Expect(metricGauge).ToNot(BeNil())
357-
q.Expect(testutil.ToFloat64(metricGauge)).To(BeEquivalentTo(expected))
366+
q.ExpectWithOffset(1, err).To(BeNil())
367+
q.ExpectWithOffset(1, metricGauge).ToNot(BeNil())
368+
q.ExpectWithOffset(1, testutil.ToFloat64(metricGauge)).To(BeEquivalentTo(expected))
358369
}
359370

360371
func deleteQueue(ctx context.Context, k8sClient client.Client, queueName string) {

pkg/queuecontroller/metrics/metrics.go

Lines changed: 93 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@
44
package metrics
55

66
import (
7+
"math"
78
"sort"
9+
"strings"
810

911
v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
1012

1113
"github.com/prometheus/client_golang/prometheus"
1214
"github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
15+
v1 "k8s.io/api/core/v1"
16+
"k8s.io/apimachinery/pkg/api/resource"
1317
"sigs.k8s.io/controller-runtime/pkg/metrics"
1418
)
1519

@@ -19,13 +23,18 @@ const (
1923
unlimitedQuota = float64(-1)
2024

2125
queueNameLabel = "queue_name"
26+
27+
gpuResourceNameSuffix = "/gpu"
2228
)
2329

2430
var (
25-
queueInfo *prometheus.GaugeVec
26-
queueDeservedGPUs *prometheus.GaugeVec
27-
queueQuotaCPU *prometheus.GaugeVec
28-
queueQuotaMemory *prometheus.GaugeVec
31+
queueInfo *prometheus.GaugeVec
32+
queueDeservedGPUs *prometheus.GaugeVec
33+
queueQuotaCPU *prometheus.GaugeVec
34+
queueQuotaMemory *prometheus.GaugeVec
35+
queueAllocatedGpus *prometheus.GaugeVec
36+
queueAllocatedCpus *prometheus.GaugeVec
37+
queueAllocatedMemory *prometheus.GaugeVec
2938

3039
additionalQueueLabelKeys []string
3140
queueLabelToDefaultMetricValue map[string]string
@@ -95,7 +104,32 @@ func InitMetrics(namespace string, queueLabelToMetricLabelMap, queueLabelToDefau
95104
}, queueMetricsLabels,
96105
)
97106

98-
metrics.Registry.MustRegister(queueInfo, queueDeservedGPUs, queueQuotaCPU, queueQuotaMemory)
107+
queueAllocatedGpus = promauto.NewGaugeVec(
108+
prometheus.GaugeOpts{
109+
Namespace: namespace,
110+
Name: "queue_allocated_gpus",
111+
Help: "Queue allocated GPUs",
112+
}, queueMetricsLabels,
113+
)
114+
115+
queueAllocatedCpus = promauto.NewGaugeVec(
116+
prometheus.GaugeOpts{
117+
Namespace: namespace,
118+
Name: "queue_allocated_cpu_cores",
119+
Help: "Queue allocated CPUs",
120+
}, queueMetricsLabels,
121+
)
122+
123+
queueAllocatedMemory = promauto.NewGaugeVec(
124+
prometheus.GaugeOpts{
125+
Namespace: namespace,
126+
Name: "queue_allocated_memory_bytes",
127+
Help: "Queue allocated memory",
128+
}, queueMetricsLabels,
129+
)
130+
131+
metrics.Registry.MustRegister(queueInfo, queueDeservedGPUs, queueQuotaCPU, queueQuotaMemory,
132+
queueAllocatedGpus, queueAllocatedCpus, queueAllocatedMemory)
99133
}
100134

101135
func SetQueueMetrics(queue *v2.Queue) {
@@ -109,15 +143,21 @@ func SetQueueMetrics(queue *v2.Queue) {
109143

110144
queueName := queue.Name
111145
gpuQuota := getGpuQuota(queue.Spec.Resources)
112-
cpuQuota := getCpuQuota(queue.Spec.Resources)
113-
memoryQuota := getMemoryQuota(queue.Spec.Resources)
146+
cpuQuota := getCpuQuotaCores(queue.Spec.Resources)
147+
memoryQuota := getMemoryQuotaBytes(queue.Spec.Resources)
148+
allocatedGpus := getAllocatedGpus(queue.Status)
149+
allocatedCpus := getAllocatedCpuCores(queue.Status)
150+
allocatedMemory := getAllocatedMemoryBytes(queue.Status)
114151

115152
queueQuotaMetricValues := append([]string{queueName}, additionalMetricLabelValues...)
116153

117154
queueInfo.WithLabelValues(queueQuotaMetricValues...).Set(1)
118155
queueDeservedGPUs.WithLabelValues(queueQuotaMetricValues...).Set(gpuQuota)
119156
queueQuotaCPU.WithLabelValues(queueQuotaMetricValues...).Set(cpuQuota)
120157
queueQuotaMemory.WithLabelValues(queueQuotaMetricValues...).Set(memoryQuota)
158+
queueAllocatedGpus.WithLabelValues(queueQuotaMetricValues...).Set(allocatedGpus)
159+
queueAllocatedCpus.WithLabelValues(queueQuotaMetricValues...).Set(allocatedCpus)
160+
queueAllocatedMemory.WithLabelValues(queueQuotaMetricValues...).Set(allocatedMemory)
121161
}
122162

123163
func ResetQueueMetrics(queueName string) {
@@ -126,6 +166,9 @@ func ResetQueueMetrics(queueName string) {
126166
queueDeservedGPUs.DeletePartialMatch(queueLabelIdentifier)
127167
queueQuotaCPU.DeletePartialMatch(queueLabelIdentifier)
128168
queueQuotaMemory.DeletePartialMatch(queueLabelIdentifier)
169+
queueAllocatedGpus.DeletePartialMatch(queueLabelIdentifier)
170+
queueAllocatedCpus.DeletePartialMatch(queueLabelIdentifier)
171+
queueAllocatedMemory.DeletePartialMatch(queueLabelIdentifier)
129172
}
130173

131174
func getGpuQuota(queueSpecResources *v2.QueueResources) float64 {
@@ -135,7 +178,7 @@ func getGpuQuota(queueSpecResources *v2.QueueResources) float64 {
135178
return queueSpecResources.GPU.Quota
136179
}
137180

138-
func getCpuQuota(queueSpecResources *v2.QueueResources) float64 {
181+
func getCpuQuotaCores(queueSpecResources *v2.QueueResources) float64 {
139182
if queueSpecResources == nil {
140183
return float64(0)
141184
}
@@ -146,7 +189,7 @@ func getCpuQuota(queueSpecResources *v2.QueueResources) float64 {
146189
return queueSpecResources.CPU.Quota / milliCpuToCpuDivider
147190
}
148191

149-
func getMemoryQuota(queueSpecResources *v2.QueueResources) float64 {
192+
func getMemoryQuotaBytes(queueSpecResources *v2.QueueResources) float64 {
150193
if queueSpecResources == nil {
151194
return float64(0)
152195
}
@@ -157,6 +200,35 @@ func getMemoryQuota(queueSpecResources *v2.QueueResources) float64 {
157200
return memoryQuota * megabytesToBytesMultiplier
158201
}
159202

203+
func getAllocatedGpus(queueStatus v2.QueueStatus) float64 {
204+
for resourceName, quantity := range queueStatus.Allocated {
205+
if strings.HasSuffix(string(resourceName), gpuResourceNameSuffix) {
206+
return roundResourceQuantity(quantity)
207+
}
208+
}
209+
return 0
210+
}
211+
212+
func getAllocatedCpuCores(queueStatus v2.QueueStatus) float64 {
213+
allocated, ok := queueStatus.Allocated[v1.ResourceCPU]
214+
if !ok {
215+
return 0
216+
}
217+
return roundResourceQuantity(allocated)
218+
}
219+
220+
func getAllocatedMemoryBytes(queueStatus v2.QueueStatus) float64 {
221+
allocated, ok := queueStatus.Allocated[v1.ResourceMemory]
222+
if !ok {
223+
return 0
224+
}
225+
return roundResourceQuantity(allocated)
226+
}
227+
228+
func roundResourceQuantity(quantity resource.Quantity) float64 {
229+
return math.Round(quantity.AsApproximateFloat64()*10000) / 10000
230+
}
231+
160232
func getAdditionalMetricLabelValues(queueLabels map[string]string) []string {
161233
labelValues := make([]string, len(additionalQueueLabelKeys))
162234

@@ -191,3 +263,15 @@ func GetQueueQuotaCPUMetric() *prometheus.GaugeVec {
191263
func GetQueueQuotaMemoryMetric() *prometheus.GaugeVec {
192264
return queueQuotaMemory
193265
}
266+
267+
func GetQueueAllocatedGPUsMetric() *prometheus.GaugeVec {
268+
return queueAllocatedGpus
269+
}
270+
271+
func GetQueueAllocatedCPUMetric() *prometheus.GaugeVec {
272+
return queueAllocatedCpus
273+
}
274+
275+
func GetQueueAllocatedMemoryMetric() *prometheus.GaugeVec {
276+
return queueAllocatedMemory
277+
}

pkg/queuecontroller/metrics/metrics_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
. "github.com/onsi/gomega"
1212
"github.com/prometheus/client_golang/prometheus"
1313
"github.com/prometheus/client_golang/prometheus/testutil"
14+
v1 "k8s.io/api/core/v1"
15+
"k8s.io/apimachinery/pkg/api/resource"
1416
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1517
)
1618

@@ -46,6 +48,13 @@ var _ = Describe("Queue Metrics", Ordered, func() {
4648
Memory: v2.QueueResource{Quota: 4},
4749
},
4850
},
51+
Status: v2.QueueStatus{
52+
Allocated: map[v1.ResourceName]resource.Quantity{
53+
"nvidia.com/gpu": resource.MustParse("1.5"),
54+
v1.ResourceCPU: resource.MustParse("250m"),
55+
v1.ResourceMemory: resource.MustParse("2Gi"),
56+
},
57+
},
4958
}
5059
SetQueueMetrics(queue)
5160

@@ -56,6 +65,9 @@ var _ = Describe("Queue Metrics", Ordered, func() {
5665
expectMetricValue(queueDeservedGPUs, labels, 2)
5766
expectMetricValue(queueQuotaCPU, labels, 0.5)
5867
expectMetricValue(queueQuotaMemory, labels, 4000000)
68+
expectMetricValue(queueAllocatedGpus, labels, 1.5)
69+
expectMetricValue(queueAllocatedCpus, labels, 0.25)
70+
expectMetricValue(queueAllocatedMemory, labels, 2147483648)
5971
})
6072

6173
It("should use default label value if label is missing", func() {
@@ -70,6 +82,13 @@ var _ = Describe("Queue Metrics", Ordered, func() {
7082
Memory: v2.QueueResource{Quota: 2},
7183
},
7284
},
85+
Status: v2.QueueStatus{
86+
Allocated: map[v1.ResourceName]resource.Quantity{
87+
"somethingelse/gpu": resource.MustParse("0.3"),
88+
v1.ResourceCPU: resource.MustParse("500m"),
89+
v1.ResourceMemory: resource.MustParse("1Gi"),
90+
},
91+
},
7392
}
7493
SetQueueMetrics(queue)
7594

@@ -79,6 +98,9 @@ var _ = Describe("Queue Metrics", Ordered, func() {
7998
expectMetricValue(queueDeservedGPUs, labels, 0.7)
8099
expectMetricValue(queueQuotaCPU, labels, 1)
81100
expectMetricValue(queueQuotaMemory, labels, 2000000)
101+
expectMetricValue(queueAllocatedGpus, labels, 0.3)
102+
expectMetricValue(queueAllocatedCpus, labels, 0.5)
103+
expectMetricValue(queueAllocatedMemory, labels, 1073741824)
82104
})
83105

84106
It("should use empty string if label and default are missing", func() {
@@ -93,6 +115,9 @@ var _ = Describe("Queue Metrics", Ordered, func() {
93115
Memory: v2.QueueResource{Quota: 2},
94116
},
95117
},
118+
Status: v2.QueueStatus{
119+
Allocated: map[v1.ResourceName]resource.Quantity{},
120+
},
96121
}
97122
SetQueueMetrics(queue)
98123

@@ -102,6 +127,9 @@ var _ = Describe("Queue Metrics", Ordered, func() {
102127
expectMetricValue(queueDeservedGPUs, labels, 1)
103128
expectMetricValue(queueQuotaCPU, labels, 1)
104129
expectMetricValue(queueQuotaMemory, labels, 2000000)
130+
expectMetricValue(queueAllocatedGpus, labels, 0)
131+
expectMetricValue(queueAllocatedCpus, labels, 0)
132+
expectMetricValue(queueAllocatedMemory, labels, 0)
105133
})
106134

107135
It("should delete metrics when queue is deleted", func() {
@@ -117,6 +145,13 @@ var _ = Describe("Queue Metrics", Ordered, func() {
117145
Memory: v2.QueueResource{Quota: 4},
118146
},
119147
},
148+
Status: v2.QueueStatus{
149+
Allocated: map[v1.ResourceName]resource.Quantity{
150+
"nvidia.com/gpu": resource.MustParse("1"),
151+
v1.ResourceCPU: resource.MustParse("1000m"),
152+
v1.ResourceMemory: resource.MustParse("2Gi"),
153+
},
154+
},
120155
}
121156
SetQueueMetrics(queue)
122157
ResetQueueMetrics("test-queue")
@@ -130,6 +165,12 @@ var _ = Describe("Queue Metrics", Ordered, func() {
130165
Expect(gathered).To(Equal(0))
131166
gathered = testutil.CollectAndCount(queueQuotaMemory)
132167
Expect(gathered).To(Equal(0))
168+
gathered = testutil.CollectAndCount(queueAllocatedGpus)
169+
Expect(gathered).To(Equal(0))
170+
gathered = testutil.CollectAndCount(queueAllocatedCpus)
171+
Expect(gathered).To(Equal(0))
172+
gathered = testutil.CollectAndCount(queueAllocatedMemory)
173+
Expect(gathered).To(Equal(0))
133174
})
134175
})
135176

0 commit comments

Comments
 (0)