
Commit 29e95b6

kaichiachen authored and wilfred-s committed
[YUNIKORN-2930] Add time consumption metrics for scheduling/tryNode cycle (#1017)
Add two new metrics that observe:

* each scheduling cycle, regardless of whether a pod was scheduled.
* total time taken to find a node for a given pod.

Add metrics to test

Closes: #1017

Signed-off-by: Wilfred Spiegelenburg <[email protected]>
1 parent 1a07ad7 commit 29e95b6
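
Both new metrics follow the timing pattern the scheduler's existing latency histograms already use: capture a start timestamp, run the code path, then hand that start time to an observer which records the elapsed wall-clock time in seconds. The standalone Go sketch below only illustrates that pattern; it is not code from this patch, and the doOneSchedulingPass placeholder plus the literal "yunikorn"/"scheduler" strings (inferred from the metric names used in the tests) are assumptions.

package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// schedulingCycle mirrors the histogram added in pkg/metrics/scheduler.go;
// the namespace and subsystem strings are inferred from the test's
// "yunikorn_scheduler_..." metric names, not taken from this diff.
var schedulingCycle = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: "yunikorn",
	Subsystem: "scheduler",
	Name:      "scheduling_cycle_milliseconds",
	Help:      "Time taken for a scheduling cycle, in seconds.",
	Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8), // 0.1ms up to 1000s
})

// doOneSchedulingPass stands in for one pass over all partitions.
func doOneSchedulingPass() { time.Sleep(2 * time.Millisecond) }

func main() {
	prometheus.MustRegister(schedulingCycle)

	start := time.Now()
	doOneSchedulingPass()
	// The patch records this via ObserveSchedulingCycle(start), which observes
	// the elapsed time in seconds.
	schedulingCycle.Observe(time.Since(start).Seconds())
}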

File tree

4 files changed: +55 -0 lines changed

pkg/metrics/scheduler.go
pkg/metrics/scheduler_test.go
pkg/scheduler/context.go
pkg/scheduler/objects/application.go

pkg/metrics/scheduler.go

Lines changed: 33 additions & 0 deletions
@@ -62,9 +62,11 @@ type SchedulerMetrics struct {
 	node                 *prometheus.GaugeVec
 	nodeResourceUsage    map[string]*prometheus.GaugeVec
 	schedulingLatency    prometheus.Histogram
+	schedulingCycle      prometheus.Histogram
 	sortingLatency       *prometheus.HistogramVec
 	tryNodeLatency       prometheus.Histogram
 	tryPreemptionLatency prometheus.Histogram
+	tryNodeEvaluation    prometheus.Histogram
 	lock                 locking.RWMutex
 }

@@ -117,6 +119,17 @@ func InitSchedulerMetrics() *SchedulerMetrics {
 			Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8), // start from 0.1ms
 		},
 	)
+
+	s.schedulingCycle = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: SchedulerSubsystem,
+			Name:      "scheduling_cycle_milliseconds",
+			Help:      "Time taken for a scheduling cycle, in seconds.",
+			Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8),
+		},
+	)
+
 	s.sortingLatency = prometheus.NewHistogramVec(
 		prometheus.HistogramOpts{
 			Namespace: Namespace,

@@ -136,6 +149,16 @@ func InitSchedulerMetrics() *SchedulerMetrics {
 		},
 	)

+	s.tryNodeEvaluation = prometheus.NewHistogram(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: SchedulerSubsystem,
+			Name:      "trynode_evaluation_milliseconds",
+			Help:      "Time taken to evaluate nodes for a pod, in seconds.",
+			Buckets:   prometheus.ExponentialBuckets(0.0001, 10, 8),
+		},
+	)
+
 	s.tryPreemptionLatency = prometheus.NewHistogram(
 		prometheus.HistogramOpts{
 			Namespace: Namespace,

@@ -155,6 +178,8 @@ func InitSchedulerMetrics() *SchedulerMetrics {
 		s.schedulingLatency,
 		s.sortingLatency,
 		s.tryNodeLatency,
+		s.schedulingCycle,
+		s.tryNodeEvaluation,
 		s.tryPreemptionLatency,
 	}
 	for _, metric := range metricsList {

@@ -182,6 +207,10 @@ func (m *SchedulerMetrics) ObserveSchedulingLatency(start time.Time) {
 	m.schedulingLatency.Observe(SinceInSeconds(start))
 }

+func (m *SchedulerMetrics) ObserveSchedulingCycle(start time.Time) {
+	m.schedulingCycle.Observe(SinceInSeconds(start))
+}
+
 func (m *SchedulerMetrics) ObserveAppSortingLatency(start time.Time) {
 	m.sortingLatency.WithLabelValues(SortingApp).Observe(SinceInSeconds(start))
 }

@@ -194,6 +223,10 @@ func (m *SchedulerMetrics) ObserveTryNodeLatency(start time.Time) {
 	m.tryNodeLatency.Observe(SinceInSeconds(start))
 }

+func (m *SchedulerMetrics) ObserveTryNodeEvaluation(start time.Time) {
+	m.tryNodeEvaluation.Observe(SinceInSeconds(start))
+}
+
 func (m *SchedulerMetrics) ObserveTryPreemptionLatency(start time.Time) {
 	m.tryPreemptionLatency.Observe(SinceInSeconds(start))
 }
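
Both new histograms reuse the bucket layout of the existing schedulingLatency metric. Because the observers record elapsed seconds, prometheus.ExponentialBuckets(0.0001, 10, 8) produces eight upper bounds spanning roughly 0.1 ms to 1000 s, which is what the "start from 0.1ms" comment refers to. The standalone snippet below (not part of the patch) simply prints those bounds for reference.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Same arguments as InitSchedulerMetrics uses above: start at 0.0001s
	// (0.1ms), grow by a factor of 10, eight buckets in total.
	for _, upper := range prometheus.ExponentialBuckets(0.0001, 10, 8) {
		fmt.Printf("le <= %gs\n", upper)
	}
}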

pkg/metrics/scheduler_test.go

Lines changed: 18 additions & 0 deletions
@@ -168,6 +168,22 @@ func TestSchedulerApplicationsFailed(t *testing.T) {
 	verifyMetric(t, 1, "failed", "yunikorn_scheduler_application_total", dto.MetricType_GAUGE, "state")
 }

+func TestSchedulingCycle(t *testing.T) {
+	sm = getSchedulerMetrics(t)
+	defer unregisterMetrics()
+
+	sm.ObserveSchedulingCycle(time.Now().Add(-1 * time.Minute))
+	verifyHistogram(t, "scheduling_cycle_milliseconds", 60, 1)
+}
+
+func TestTryNodeEvaluation(t *testing.T) {
+	sm = getSchedulerMetrics(t)
+	defer unregisterMetrics()
+
+	sm.ObserveTryNodeEvaluation(time.Now().Add(-1 * time.Minute))
+	verifyHistogram(t, "trynode_evaluation_milliseconds", 60, 1)
+}
+
 func getSchedulerMetrics(t *testing.T) *SchedulerMetrics {
 	unregisterMetrics()
 	return InitSchedulerMetrics()

@@ -223,7 +239,9 @@ func unregisterMetrics() {
 	prometheus.Unregister(sm.application)
 	prometheus.Unregister(sm.node)
 	prometheus.Unregister(sm.schedulingLatency)
+	prometheus.Unregister(sm.schedulingCycle)
 	prometheus.Unregister(sm.sortingLatency)
 	prometheus.Unregister(sm.tryNodeLatency)
+	prometheus.Unregister(sm.tryNodeEvaluation)
 	prometheus.Unregister(sm.tryPreemptionLatency)
 }
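
The verifyHistogram helper called by the two new tests lives in the existing test file and is not part of this diff. As a rough sketch of what such a check can look like with the standard client_golang types, the hypothetical checkHistogram below reads a histogram's protobuf state and asserts on the sample count and sum; the name, signature, and tolerance handling are illustrative assumptions, not the project's implementation.

package metrics // assuming the same package as the test file

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// checkHistogram is a hypothetical stand-in for verifyHistogram: it dumps the
// histogram into its protobuf representation and checks count and sum.
func checkHistogram(t *testing.T, h prometheus.Histogram, wantCount uint64, minSum float64) {
	t.Helper()
	m := &dto.Metric{}
	// prometheus.Histogram implements prometheus.Metric, so it can write
	// its current state into a dto.Metric.
	if err := h.Write(m); err != nil {
		t.Fatalf("failed to read histogram state: %v", err)
	}
	if got := m.GetHistogram().GetSampleCount(); got != wantCount {
		t.Errorf("sample count: got %d, want %d", got, wantCount)
	}
	if got := m.GetHistogram().GetSampleSum(); got < minSum {
		t.Errorf("sample sum: got %g, want at least %g", got, minSum)
	}
}

With the tests above, a call such as checkHistogram(t, sm.schedulingCycle, 1, 60) would assert a single observation of roughly sixty seconds, which is the shape of check the new tests perform via verifyHistogram.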

pkg/scheduler/context.go

Lines changed: 2 additions & 0 deletions
@@ -120,6 +120,7 @@ func (cc *ClusterContext) setEventHandler(rmHandler handler.EventHandler) {
 func (cc *ClusterContext) schedule() bool {
 	// schedule each partition defined in the cluster
 	activity := false
+	scheduleCycleStart := time.Now()
 	for _, psc := range cc.GetPartitionMapClone() {
 		// if there are no resources in the partition just skip
 		if psc.root.GetMaxResource() == nil {

@@ -151,6 +152,7 @@ func (cc *ClusterContext) schedule() bool {
 			activity = true
 		}
 	}
+	metrics.GetSchedulerMetrics().ObserveSchedulingCycle(scheduleCycleStart)
 	return activity
 }

pkg/scheduler/objects/application.go

Lines changed: 2 additions & 0 deletions
@@ -1446,6 +1446,7 @@ func (sa *Application) tryNodes(ask *Allocation, iterator NodeIterator) *AllocationResult {
 	reserved := sa.reservations[allocKey]
 	var allocResult *AllocationResult
 	var predicateErrors map[string]int
+	tryNodeCycleStart := time.Now()
 	iterator.ForEachNode(func(node *Node) bool {
 		// skip the node if the node is not schedulable
 		if !node.IsSchedulable() {

@@ -1510,6 +1511,7 @@
 		}
 		return true
 	})
+	metrics.GetSchedulerMetrics().ObserveTryNodeEvaluation(tryNodeCycleStart)

 	if allocResult != nil {
 		return allocResult
