Skip to content

Commit b36df94

Browse files
authored
Improve stale IP recycling in AntreaIPAM controller (#7538)
When a Node is deleted, the CNI DEL command may not be invoked for Pods running on that Node. As a result, antrea-agent cannot release their IPs, leaving stale entries in the IPPool. To mitigate this, increase the IP recycle interval from 10 minutes to 1 minute. Additionally, add a benchmark test for the IP recycle function and refine its implementation based on benchmark results to reduce memory allocations and improve cleanup performance. Signed-off-by: Lan Luo <[email protected]>
1 parent 6393ac1 commit b36df94

File tree

3 files changed

+117
-21
lines changed

3 files changed

+117
-21
lines changed

pkg/controller/ipam/antrea_ipam_controller.go

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ const (
5454
minRetryDelay = 5 * time.Second
5555
maxRetryDelay = 300 * time.Second
5656

57-
garbageCollectionInterval = 10 * time.Minute
57+
garbageCollectionInterval = 1 * time.Minute
5858

5959
// Default number of workers processing an IPPool change.
6060
defaultWorkers = 4
@@ -171,32 +171,37 @@ func (c *AntreaIPAMController) enqueueStatefulSetDeleteEvent(obj interface{}) {
171171
}
172172

173173
// Inspect all IPPools for stale IP Address entries.
174-
// This may happen if controller was down during StatefulSet delete event.
175-
// If such entry is found, enqueue cleanup event for this StatefulSet.
176-
func (c *AntreaIPAMController) cleanupStaleAddresses() {
177-
pools, _ := c.ipPoolLister.List(labels.Everything())
178-
lister := c.statefulSetInformer.Lister()
179-
statefulSets, _ := lister.List(labels.Everything())
180-
statefulSetMap := make(map[string]bool)
181-
174+
// This may happen if controller was down during StatefulSet/Pod delete event.
175+
// If such entry is found, enqueue cleanup event for this StatefulSet/Pod.
176+
func (c *AntreaIPAMController) cleanUpStaleIPAddresses() {
182177
klog.InfoS("Cleanup job for IP Pools started")
183178

179+
pools, _ := c.ipPoolLister.List(labels.Everything())
180+
statefulSets, _ := c.statefulSetInformer.Lister().List(labels.Everything())
181+
statefulSetMap := make(map[string]struct{}, len(statefulSets))
182+
184183
for _, ss := range statefulSets {
185184
// Prepare map of existing StatefulSets for quick reference below
186-
statefulSetMap[k8s.NamespacedName(ss.Namespace, ss.Name)] = true
185+
statefulSetMap[k8s.NamespacedName(ss.Namespace, ss.Name)] = struct{}{}
186+
}
187+
188+
pods, _ := c.podLister.List(labels.Everything())
189+
podsMap := make(map[string]struct{}, len(pods))
190+
for _, p := range pods {
191+
podsMap[k8s.NamespacedName(p.Namespace, p.Name)] = struct{}{}
187192
}
188193

189194
poolsUpdated := 0
190195
for _, ipPool := range pools {
191196
updateNeeded := false
192-
ipPoolCopy := ipPool.DeepCopy()
193-
var newList []crdv1b1.IPAddressState
194-
for _, address := range ipPoolCopy.Status.IPAddresses {
197+
ipAddrs := ipPool.Status.IPAddresses
198+
newList := make([]crdv1b1.IPAddressState, 0, len(ipAddrs))
199+
200+
for _, address := range ipAddrs {
195201
// Cleanup reserved addresses
196202
if address.Owner.Pod != nil {
197-
_, err := c.podLister.Pods(address.Owner.Pod.Namespace).Get(address.Owner.Pod.Name)
198-
if err != nil && errors.IsNotFound(err) {
199-
klog.InfoS("IPPool contains stale IPAddress for Pod that no longer exists", "IPPool", ipPool.Name, "Namespace", address.Owner.Pod.Namespace, "Pod", address.Owner.Pod.Name)
203+
if _, ok := podsMap[k8s.NamespacedName(address.Owner.Pod.Namespace, address.Owner.Pod.Name)]; !ok {
204+
klog.V(2).InfoS("IPPool contains stale IP address for Pod that no longer exists", "IPPool", ipPool.Name, "Namespace", address.Owner.Pod.Namespace, "Pod", address.Owner.Pod.Name)
200205
address.Owner.Pod = nil
201206
if address.Owner.StatefulSet != nil {
202207
address.Phase = crdv1b1.IPAddressPhaseReserved
@@ -208,10 +213,9 @@ func (c *AntreaIPAMController) cleanupStaleAddresses() {
208213
key := k8s.NamespacedName(address.Owner.StatefulSet.Namespace, address.Owner.StatefulSet.Name)
209214
if _, ok := statefulSetMap[key]; !ok {
210215
// This entry refers to StatefulSet that no longer exists
211-
klog.InfoS("IPPool contains stale IPAddress for StatefulSet that no longer exists", "IPPool", ipPool.Name, "Namespace", address.Owner.StatefulSet.Namespace, "StatefulSet", address.Owner.StatefulSet.Name)
216+
klog.V(2).InfoS("IPPool contains stale IP address for StatefulSet that no longer exists", "IPPool", ipPool.Name, "Namespace", address.Owner.StatefulSet.Namespace, "StatefulSet", address.Owner.StatefulSet.Name)
212217
address.Owner.StatefulSet = nil
213218
updateNeeded = true
214-
215219
}
216220
}
217221

@@ -221,6 +225,7 @@ func (c *AntreaIPAMController) cleanupStaleAddresses() {
221225
}
222226

223227
if updateNeeded {
228+
ipPoolCopy := ipPool.DeepCopy()
224229
ipPoolCopy.Status.IPAddresses = newList
225230
_, err := c.crdClient.CrdV1beta1().IPPools().UpdateStatus(context.TODO(), ipPoolCopy, metav1.UpdateOptions{})
226231
if err != nil {
@@ -229,7 +234,6 @@ func (c *AntreaIPAMController) cleanupStaleAddresses() {
229234
} else {
230235
poolsUpdated += 1
231236
}
232-
233237
}
234238
}
235239

@@ -242,6 +246,8 @@ func (c *AntreaIPAMController) cleanIPPoolForStatefulSet(namespacedName string)
242246
klog.InfoS("Processing delete notification", "StatefulSet", namespacedName)
243247
ipPools, _ := c.ipPoolInformer.Informer().GetIndexer().ByIndex(statefulSetIndex, namespacedName)
244248

249+
var retry bool
250+
namespace, name := k8s.SplitNamespacedName(namespacedName)
245251
for _, item := range ipPools {
246252
ipPool := item.(*crdv1b1.IPPool)
247253
allocator, err := poolallocator.NewIPPoolAllocator(ipPool.Name, c.crdClient, c.ipPoolLister)
@@ -251,15 +257,18 @@ func (c *AntreaIPAMController) cleanIPPoolForStatefulSet(namespacedName string)
251257
continue
252258
}
253259

254-
namespace, name := k8s.SplitNamespacedName(namespacedName)
255260
err = allocator.ReleaseStatefulSet(namespace, name)
256261
if err != nil {
257262
// This can be a transient error - worker will retry
258263
klog.ErrorS(err, "Failed to clean IP allocations", "StatefulSet", namespacedName, "IPPool", ipPool.Name)
264+
retry = true
259265
continue
260266
}
261267
}
262268

269+
if retry {
270+
return fmt.Errorf("stale IP allocations cleanup failed for at least one IPPool")
271+
}
263272
return nil
264273
}
265274

@@ -491,7 +500,7 @@ func (c *AntreaIPAMController) Run(stopCh <-chan struct{}) {
491500
}
492501

493502
// Periodic cleanup IP Pools of stale IP addresses
494-
go wait.NonSlidingUntil(c.cleanupStaleAddresses, garbageCollectionInterval, stopCh)
503+
go wait.NonSlidingUntil(c.cleanUpStaleIPAddresses, garbageCollectionInterval, stopCh)
495504

496505
go wait.Until(c.statefulSetWorker, time.Second, stopCh)
497506

pkg/controller/ipam/antrea_ipam_controller_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@ import (
2828
corev1 "k8s.io/api/core/v1"
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3030
"k8s.io/apimachinery/pkg/util/wait"
31+
"k8s.io/apimachinery/pkg/watch"
3132
"k8s.io/client-go/informers"
3233
"k8s.io/client-go/kubernetes/fake"
34+
k8stesting "k8s.io/client-go/testing"
35+
"k8s.io/client-go/tools/cache"
3336

3437
crdv1b1 "antrea.io/antrea/pkg/apis/crd/v1beta1"
3538
fakecrd "antrea.io/antrea/pkg/client/clientset/versioned/fake"
@@ -334,3 +337,84 @@ func TestAntreaIPAMController_getIPPoolsForStatefulSet(t *testing.T) {
334337
})
335338
}
336339
}
340+
341+
// Creates fake IPPools with N addresses each
342+
func makeFakeIPPools(poolCount, ipsPerPool int) []*crdv1b1.IPPool {
343+
var pools []*crdv1b1.IPPool
344+
for i := 0; i < poolCount; i++ {
345+
pool := &crdv1b1.IPPool{
346+
ObjectMeta: metav1.ObjectMeta{
347+
Name: fmt.Sprintf("pool-%d", i),
348+
},
349+
Status: crdv1b1.IPPoolStatus{
350+
IPAddresses: []crdv1b1.IPAddressState{},
351+
},
352+
}
353+
354+
for j := 0; j < ipsPerPool; j++ {
355+
ip := crdv1b1.IPAddressState{
356+
IPAddress: fmt.Sprintf("10.0.%d.%d", i, j),
357+
Phase: crdv1b1.IPAddressPhaseAllocated,
358+
}
359+
// Randomly assign Pod/StatefulSet owner
360+
ip.Owner.Pod = &crdv1b1.PodOwner{
361+
Namespace: "default",
362+
Name: fmt.Sprintf("pod-%d-%d", i, j),
363+
ContainerID: fmt.Sprintf("container-%d-%d", i, j),
364+
}
365+
ip.Owner.StatefulSet = &crdv1b1.StatefulSetOwner{
366+
Namespace: "default",
367+
Name: fmt.Sprintf("ss-%d", i),
368+
Index: 1,
369+
}
370+
pool.Status.IPAddresses = append(pool.Status.IPAddresses, ip)
371+
}
372+
pools = append(pools, pool)
373+
}
374+
return pools
375+
}
376+
377+
func BenchmarkCleanUpStaleIPAddresses(b *testing.B) {
378+
benchmarks := []struct {
379+
name string
380+
poolCount int
381+
ipsPerPool int
382+
}{
383+
// Sample result for small:
384+
// 1515 764513 ns/op 292771 B/op 5255 allocs/op
385+
{"Small", 10, 50},
386+
// Sample result for medium:
387+
// 39 27798675 ns/op 10321643 B/op 202300 allocs/op
388+
{"Medium", 100, 200},
389+
// Sample result for large:
390+
// 5 205046442 ns/op 126322694 B/op 2510655 allocs/op
391+
{"Large", 500, 500},
392+
}
393+
394+
for _, bm := range benchmarks {
395+
b.Run(bm.name, func(b *testing.B) {
396+
stopCh := make(chan struct{})
397+
defer close(stopCh)
398+
399+
c := newFakeAntreaIPAMController(&crdv1b1.IPPool{}, &corev1.Namespace{}, &appsv1.StatefulSet{})
400+
watcher := watch.NewFake()
401+
c.fakeCRDClient.PrependWatchReactor("ippools", k8stesting.DefaultWatchReactor(watcher, nil))
402+
c.informerFactory.Start(stopCh)
403+
c.crdInformerFactory.Start(stopCh)
404+
c.informerFactory.WaitForCacheSync(stopCh)
405+
c.crdInformerFactory.WaitForCacheSync(stopCh)
406+
407+
pools := makeFakeIPPools(bm.poolCount, bm.ipsPerPool)
408+
for _, p := range pools {
409+
_, _ = c.fakeCRDClient.CrdV1beta1().IPPools().Create(context.TODO(), p, metav1.CreateOptions{})
410+
watcher.Add(p)
411+
}
412+
cache.WaitForCacheSync(stopCh, c.ipPoolInformer.Informer().HasSynced)
413+
414+
b.ResetTimer()
415+
for i := 0; i < b.N; i++ {
416+
c.cleanUpStaleIPAddresses()
417+
}
418+
})
419+
}
420+
}

test/performance/benchmark.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,6 @@ benchmarks:
7575
package: "./pkg/apis/controlplane"
7676
- name: "BenchmarkSymmetricDifferenceString"
7777
package: "./pkg/util/sets"
78+
- name: "BenchmarkCleanUpStaleIPAddresses"
79+
package: "./pkg/controller/ipam"
80+
threshold: 0.3

0 commit comments

Comments
 (0)