Skip to content

Commit bf5c6e3

Browse files
committed
CD daemon: implement jitter for exponential RateLimiter
The client-go rate limiters such as `ExponentialFailureRateLimiter` do not implement jitter. In a user's environment, formation of a CD across 144 nodes has shown that the absence of jitter results in significant retry attempt correlation across nodes -- even after ~10 retries -- resulting in otherwise preventable conflicts (and hence increased convergence time). That effect can be diminished by adding jitter, which should allow for faster convergence. The JitterRL implementation provided by this patch is a simple, custom implementation that I validated with simulated errors and careful placement of log messages. Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent df6c093 commit bf5c6e3

File tree

2 files changed

+68
-1
lines changed

2 files changed

+68
-1
lines changed

pkg/workqueue/jitterlimiter.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package workqueue
19+
20+
import (
21+
"math/rand"
22+
"time"
23+
24+
"k8s.io/client-go/util/workqueue"
25+
"k8s.io/klog/v2"
26+
)
27+
28+
// JitterRL is a rate limiter that wraps an inner rate limiter and adds random
// jitter to every delay the inner limiter yields; Forget and NumRequeues are
// passed through to the inner limiter unchanged.
//
// Jitter relative to the delay yielded by the inner limiter. Example: a factor
29+
// of 0.1 translates to a jitter interval with a width of 10 % compared to the
30+
// inner delay, and centered around the inner delay time (resulting in +/- 5 %
31+
// deviation compared to the inner delay time).
32+
type JitterRL[T comparable] struct {
33+
// inner is the wrapped limiter that produces the un-jittered delay and
// tracks per-item requeue state.
inner workqueue.TypedRateLimiter[T]
34+
// factor is the width of the jitter interval, expressed as a fraction of
// the delay returned by inner.
factor float64
35+
}
36+
37+
func NewJitterRateLimiter[T comparable](inner workqueue.TypedRateLimiter[T], factor float64) workqueue.TypedRateLimiter[T] {
38+
if factor >= 1.0 {
39+
panic("factor must be < 1.0")
40+
}
41+
return &JitterRL[T]{inner: inner, factor: factor}
42+
}
43+
44+
func (j *JitterRL[T]) When(item T) time.Duration {
45+
// Get inner limiter's delay.
46+
d := j.inner.When(item)
47+
48+
// Calculate jitter interval width W_j relative to the delay time given by
49+
// the inner limiter.
50+
jitterWidthSeconds := d.Seconds() * j.factor
51+
52+
// Get random number in the interval [-W_j/2, W_j/2).
53+
jitterSeconds := jitterWidthSeconds * (rand.Float64() - 0.5)
54+
55+
delay := d + time.Duration(jitterSeconds*float64(time.Second))
56+
klog.V(7).Infof("inner: %.5f s, jittered: %.5f s", d.Seconds(), delay.Seconds())
57+
58+
return delay
59+
}
60+
61+
func (j *JitterRL[T]) Forget(item T) {
62+
j.inner.Forget(item)
63+
}
64+
65+
func (j *JitterRL[T]) NumRequeues(item T) int {
66+
return j.inner.NumRequeues(item)
67+
}

pkg/workqueue/workqueue.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ func DefaultPrepUnprepRateLimiter() workqueue.TypedRateLimiter[any] {
5959
}
6060

6161
func DefaultCDDaemonRateLimiter() workqueue.TypedRateLimiter[any] {
62-
return workqueue.NewTypedItemExponentialFailureRateLimiter[any](5*time.Millisecond, 6000*time.Millisecond)
62+
return NewJitterRateLimiter(workqueue.NewTypedItemExponentialFailureRateLimiter[any](5*time.Millisecond, 6000*time.Millisecond), 0.5)
6363
}
6464

6565
func DefaultControllerRateLimiter() workqueue.TypedRateLimiter[any] {

0 commit comments

Comments
 (0)