Skip to content

Commit f9bacb0

Browse files
authored
Natasha/queue metrics fixes (#344)
* added MustRegister to metrics * added queuecontroller ServiceMonitor for metrics * little fix * removed ServiceMonitor…
1 parent 23c7a44 commit f9bacb0

File tree

3 files changed

+13
-3
lines changed

3 files changed

+13
-3
lines changed

cmd/queuecontroller/app/app.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ package app
55

66
import (
77
"flag"
8+
"fmt"
89

910
"go.uber.org/zap/zapcore"
1011

@@ -50,6 +51,7 @@ func Run() error {
5051
initLogger()
5152

5253
metrics.InitMetrics(opts.MetricsNamespace, opts.QueueLabelToMetricLabel.Get(), opts.QueueLabelToDefaultMetricValue.Get())
54+
setupLog.Info(fmt.Sprintf("Queue metrics initialized and registered with namespace: %s", opts.MetricsNamespace))
5355

5456
var err error
5557
options := ctrl.Options{

deployments/kai-scheduler/templates/services/queuecontroller-service.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,9 @@ spec:
1313
targetPort: 9443
1414
protocol: TCP
1515
name: webhook
16+
- port: 8080
17+
targetPort: 8080
18+
protocol: TCP
19+
name: metrics
1620
selector:
17-
app: queuecontroller
21+
app: queuecontroller

pkg/queuecontroller/metrics/metrics.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
package metrics
55

66
import (
7-
"github.com/prometheus/client_golang/prometheus"
8-
"github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
97
"sort"
108

119
v2 "github.com/NVIDIA/KAI-scheduler/pkg/apis/scheduling/v2"
10+
11+
"github.com/prometheus/client_golang/prometheus"
12+
"github.com/prometheus/client_golang/prometheus/promauto" // auto-registers collectors in the default registry
13+
"sigs.k8s.io/controller-runtime/pkg/metrics"
1214
)
1315

1416
const (
@@ -92,6 +94,8 @@ func InitMetrics(namespace string, queueLabelToMetricLabelMap, queueLabelToDefau
9294
Help: "Queue quota memory",
9395
}, queueMetricsLabels,
9496
)
97+
98+
metrics.Registry.MustRegister(queueInfo, queueDeservedGPUs, queueQuotaCPU, queueQuotaMemory)
9599
}
96100

97101
func SetQueueMetrics(queue *v2.Queue) {

0 commit comments

Comments
 (0)