
Commit b4cf379

Making scheduler metrics subsystem name configurable (#86)
1 parent 30acc1f commit b4cf379

File tree

5 files changed: +34 -26 lines

cmd/scheduler/app/options/options.go
cmd/scheduler/app/options/options_test.go
cmd/scheduler/app/server.go
pkg/binder/binding/resourcereservation/resource_reservation_test.go
pkg/scheduler/metrics/metrics.go

cmd/scheduler/app/options/options.go

Lines changed: 4 additions & 1 deletion
@@ -14,6 +14,7 @@ import (
 
 const (
 	defaultSchedulerName        = "kai-scheduler"
+	defaultMetricsSubSystemName = "kai"
 	defaultSchedulerPeriod      = time.Second
 	defaultStalenessGracePeriod = 60 * time.Second
 	defaultListenAddress        = ":8080"
@@ -34,6 +35,7 @@ type ServerOption struct {
 	SchedulePeriod          time.Duration
 	EnableLeaderElection    bool
 	PrintVersion            bool
+	MetricsSubSystemName    string
 	RestrictSchedulingNodes bool
 	NodePoolLabelKey        string
 	NodePoolLabelValue      string
@@ -69,7 +71,7 @@ func NewServerOption() *ServerOption {
 // AddFlags adds flags for a specific CMServer to the specified FlagSet
 func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
 	// kai-scheduler will ignore pods with scheduler names other than specified with the option
-	fs.StringVar(&s.SchedulerName, "scheduler-name", defaultSchedulerName, "kai-scheduler will handle pods with the scheduler-name")
+	fs.StringVar(&s.SchedulerName, "scheduler-name", defaultSchedulerName, "The scheduler name in pod spec that handled by this scheduler")
 	fs.BoolVar(&s.RestrictSchedulingNodes, "restrict-node-scheduling", false, "kai-scheduler will allocate jobs only to restricted nodes")
 	fs.StringVar(&s.NodePoolLabelKey, "nodepool-label-key", defaultNodePoolLabelKey, "The label key by which to filter scheduling nodepool")
 	fs.StringVar(&s.NodePoolLabelValue, "partition-label-value", "", "The label value by which to filter scheduling partition")
@@ -79,6 +81,7 @@ func (s *ServerOption) AddFlags(fs *pflag.FlagSet) {
 		"Start a leader election client and gain leadership before "+
 		"executing the main loop. Enable this when running replicated kai-scheduler for high availability")
 	fs.BoolVar(&s.PrintVersion, "version", true, "Show version")
+	fs.StringVar(&s.MetricsSubSystemName, "metrics-subsystem-name", defaultMetricsSubSystemName, "The name of the metrics subsystem")
 	fs.StringVar(&s.ListenAddress, "listen-address", defaultListenAddress, "The address to listen on for HTTP requests")
 	fs.BoolVar(&s.EnableProfiler, "enable-profiler", false, "Enable profiler")
 	fs.StringVar(&s.ProfilerApiPort, "profiler-port", defaultProfilerApiPort, "The port to listen for profiler api requests")
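
With this change the metrics prefix becomes a startup option alongside the existing flags. Below is a minimal, self-contained sketch of how the new flag is parsed into the option struct; the package main wiring and flag-set name are illustrative, and only the flag names and defaults come from the diff above:

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

// ServerOption is trimmed to the two fields relevant to this commit.
type ServerOption struct {
	SchedulerName        string
	MetricsSubSystemName string
}

func main() {
	opt := &ServerOption{}
	fs := pflag.NewFlagSet("scheduler", pflag.ExitOnError)
	// Same registration pattern as AddFlags in options.go.
	fs.StringVar(&opt.SchedulerName, "scheduler-name", "kai-scheduler", "The scheduler name in pod spec that handled by this scheduler")
	fs.StringVar(&opt.MetricsSubSystemName, "metrics-subsystem-name", "kai", "The name of the metrics subsystem")

	// For example, overriding the subsystem on the command line:
	_ = fs.Parse([]string{"--metrics-subsystem-name=my-metrics"})
	fmt.Println(opt.MetricsSubSystemName) // prints: my-metrics
}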

cmd/scheduler/app/options/options_test.go

Lines changed: 1 addition & 0 deletions
@@ -30,6 +30,7 @@ func TestAddFlags(t *testing.T) {
 		SchedulerName:        defaultSchedulerName,
 		SchedulePeriod:       5 * time.Minute,
 		PrintVersion:         true,
+		MetricsSubSystemName: defaultMetricsSubSystemName,
 		ListenAddress:        defaultListenAddress,
 		ProfilerApiPort:      defaultProfilerApiPort,
 		Verbosity:            defaultVerbosityLevel,

cmd/scheduler/app/server.go

Lines changed: 3 additions & 1 deletion
@@ -30,9 +30,10 @@ import (
 
 	"github.com/NVIDIA/KAI-scheduler/cmd/scheduler/app/options"
 	"github.com/NVIDIA/KAI-scheduler/cmd/scheduler/profiling"
-	scheduler "github.com/NVIDIA/KAI-scheduler/pkg/scheduler"
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/conf"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/log"
+	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/metrics"
 	"github.com/NVIDIA/KAI-scheduler/pkg/scheduler/version"
 )
 
@@ -131,6 +132,7 @@ func Run(opt *options.ServerOption, config *restclient.Config, mux *http.ServeMu
 	if opt.PrintVersion {
 		version.PrintVersion()
 	}
+	metrics.SetSubSystemName(opt.MetricsSubSystemName)
 
 	scheduler, err := scheduler.NewScheduler(config,
 		opt.SchedulerConf,
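
For context, in Prometheus client_golang the Namespace, Subsystem, and Name fields are joined with underscores to form a collector's fully qualified name, so the subsystem configured here becomes the metric-name prefix. A small standalone illustration of that joining (not scheduler code):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// BuildFQName joins the non-empty parts with "_".
	// With the default subsystem "kai" this prints:
	// kai_e2e_scheduling_latency_milliseconds
	fmt.Println(prometheus.BuildFQName("", "kai", "e2e_scheduling_latency_milliseconds"))
}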

pkg/binder/binding/resourcereservation/resource_reservation_test.go

Lines changed: 6 additions & 7 deletions
@@ -82,13 +82,12 @@ var _ = Describe("ResourceReservationService", func() {
 	)
 	Context("ReserveGpuDevice", func() {
 		for testName, testData := range map[string]struct {
-			reservationPod                 *v1.Pod
-			groupName                      string
-			clientInterceptFuncs           interceptor.Funcs
-			getGPUIndexFromMetricsAttempts int
-			numReservationPods             int
-			expectedGPUIndex               string
-			expectedErrorContains          string
+			reservationPod        *v1.Pod
+			groupName             string
+			clientInterceptFuncs  interceptor.Funcs
+			numReservationPods    int
+			expectedGPUIndex      string
+			expectedErrorContains string
 		}{
 			"reservation pod exists": {
 				groupName: existingGroup,

pkg/scheduler/metrics/metrics.go

Lines changed: 20 additions & 17 deletions
@@ -11,8 +11,6 @@ import (
 )
 
 const (
-	RunaiNamespace = "runai"
-
 	// OnSessionOpen label
 	OnSessionOpen = "OnSessionOpen"
 
@@ -22,48 +20,49 @@ const (
 
 var (
 	currentAction string
+	subsystemName string
 
 	e2eSchedulingLatency = promauto.NewGauge(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "e2e_scheduling_latency_milliseconds",
 			Help:      "E2e scheduling latency in milliseconds (scheduling algorithm + binding), as a gauge",
 		},
 	)
 
 	openSessionLatency = promauto.NewGauge(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "open_session_latency_milliseconds",
 			Help:      "Open session latency in milliseconds, including all plugins, as a gauge",
 		},
 	)
 
 	closeSessionLatency = promauto.NewGauge(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "close_session_latency_milliseconds",
 			Help:      "Close session latency in milliseconds, including all plugins, as a gauge",
 		},
 	)
 
 	pluginSchedulingLatency = promauto.NewGaugeVec(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "plugin_scheduling_latency_milliseconds",
 			Help:      "Plugin scheduling latency in milliseconds, as a gauge",
 		}, []string{"plugin", "OnSession"})
 
 	actionSchedulingLatency = promauto.NewGaugeVec(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "action_scheduling_latency_milliseconds",
 			Help:      "Action scheduling latency in milliseconds, as a gauge",
 		}, []string{"action"})
 
 	taskSchedulingLatency = promauto.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "task_scheduling_latency_milliseconds",
 			Help:      "Task scheduling latency in milliseconds",
 			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
@@ -72,7 +71,7 @@ var (
 
 	taskBindLatency = promauto.NewHistogram(
 		prometheus.HistogramOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "task_bind_latency_milliseconds",
 			Help:      "Task bind latency histogram in milliseconds",
 			Buckets:   prometheus.ExponentialBuckets(5, 2, 10),
@@ -81,55 +80,55 @@ var (
 
 	podgroupsScheduledByAction = promauto.NewCounterVec(
 		prometheus.CounterOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "podgroups_scheduled_by_action",
 			Help:      "Count of podgroups scheduled per action",
 		}, []string{"action"})
 
 	podgroupsConsideredByAction = promauto.NewCounterVec(
 		prometheus.CounterOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "podgroups_acted_on_by_action",
 			Help:      "Count of podgroups tried per action",
 		}, []string{"action"})
 
 	scenariosSimulatedByAction = promauto.NewCounterVec(
 		prometheus.CounterOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "scenarios_simulation_by_action",
 			Help:      "Count of scenarios simulated per action",
 		}, []string{"action"})
 
 	scenariosFilteredByAction = promauto.NewCounterVec(
 		prometheus.CounterOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "scenarios_filtered_by_action",
 			Help:      "Count of scenarios filtered per action",
 		}, []string{"action"})
 
 	preemptionAttempts = promauto.NewCounter(
 		prometheus.CounterOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "total_preemption_attempts",
 			Help:      "Total preemption attempts in the cluster till now",
 		},
 	)
 
 	queueFairShareCPU = promauto.NewGaugeVec(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "queue_fair_share_cpu_cores",
 			Help:      "CPU Fair share of queue, as a gauge. Value is in Cores",
 		}, []string{"queue_name"})
 	queueFairShareMemory = promauto.NewGaugeVec(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "queue_fair_share_memory_gb",
 			Help:      "Memory Fair share of queue, as a gauge. Value is in GB",
 		}, []string{"queue_name"})
 	queueFairShareGPU = promauto.NewGaugeVec(
 		prometheus.GaugeOpts{
-			Subsystem: RunaiNamespace,
+			Subsystem: subsystemName,
 			Name:      "queue_fair_share_gpu",
 			Help:      "GPU Fair share of queue, as a gauge. Values in GPU devices",
 		}, []string{"queue_name"})
@@ -168,6 +167,10 @@ func UpdateTaskScheduleDuration(duration time.Duration) {
 	taskSchedulingLatency.Observe(float64(duration.Milliseconds()))
 }
 
+func SetSubSystemName(name string) {
+	subsystemName = name
+}
+
 func SetCurrentAction(action string) {
 	currentAction = action
 }
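
A self-contained sketch of the setter pattern added above. In metrics.go the promauto.New* collectors are package-level variable initializers, which are evaluated at package init time; this sketch instead creates the gauge inside a function and assumes SetSubSystemName has already been called. The constructor name is illustrative and not part of the commit:

package metrics

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

var subsystemName string

// SetSubSystemName configures the subsystem prefix used for metrics created afterwards.
func SetSubSystemName(name string) {
	subsystemName = name
}

// newE2ELatencyGauge is illustrative: it creates the gauge only after the
// subsystem has been configured, so the registered name becomes
// "<subsystem>_e2e_scheduling_latency_milliseconds".
func newE2ELatencyGauge() prometheus.Gauge {
	return promauto.NewGauge(prometheus.GaugeOpts{
		Subsystem: subsystemName,
		Name:      "e2e_scheduling_latency_milliseconds",
		Help:      "E2e scheduling latency in milliseconds (scheduling algorithm + binding), as a gauge",
	})
}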
