Skip to content

Commit 02005a2

Browse files
authored
IPSets: re-create ipsets failed during re-sync (#11464)
1 parent 09f8fbf commit 02005a2

File tree

3 files changed

+126
-41
lines changed

3 files changed

+126
-41
lines changed

felix/ipsets/ipsets.go

Lines changed: 66 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2017-2024 Tigera, Inc. All rights reserved.
1+
// Copyright (c) 2017-2025 Tigera, Inc. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@ import (
3636

3737
const (
3838
MaxIPSetDeletionsPerIteration = 1
39+
MaxRetryAttempt = 10
3940
)
4041

4142
type dataplaneMetadata struct {
@@ -44,6 +45,7 @@ type dataplaneMetadata struct {
4445
RangeMin int
4546
RangeMax int
4647
DeleteFailed bool
48+
ListFailed bool
4749
}
4850

4951
// IPSets manages a whole "plane" of IP sets, i.e. all the IPv4 sets, or all the IPv6 IP sets.
@@ -353,22 +355,31 @@ func (s *IPSets) ApplyUpdates(listener UpdateListener) {
353355
retryDelay *= 2
354356
}
355357

356-
for attempt := 0; attempt < 10; attempt++ {
358+
var resyncErr, updateErr error
359+
for attempt := 0; attempt < MaxRetryAttempt; attempt++ {
357360
if attempt > 0 {
358361
s.logCxt.Info("Retrying after an ipsets update failure...")
359362
}
363+
treatFailureAsTransient := attempt < MaxRetryAttempt/2
360364
if s.fullResyncRequired || s.ipSetsRequiringResync.Len() > 0 {
361365
// Compare our in-memory state against the dataplane and queue up
362366
// modifications to fix any inconsistencies.
363367
s.logCxt.Debug("Resyncing ipsets with dataplane.")
364368
s.opReporter.RecordOperation(fmt.Sprint("resync-ipsets-v", s.IPVersionConfig.Family.Version()))
365369

366-
if err := s.tryResync(); err != nil {
367-
s.logCxt.WithError(err).Warning("Failed to resync with dataplane")
368-
backOff()
369-
continue
370+
if resyncErr = s.tryResync(); resyncErr != nil {
371+
s.logCxt.WithError(resyncErr).Warning("Failed to resync with dataplane")
372+
373+
// After a few attempts, most likely, we are dealing with a persistent failure.
374+
// This could be due to different failures, like userspace and kernel incompatibility.
375+
// The incompatibility failure can be fixed by swapping the one Felix understand (created from
376+
// desired state) and the one (with higher revision) in dataplane. As such, we should stop re-trying
377+
// and instead fall through to tryUpdates() below, which will do the swap.
378+
if treatFailureAsTransient {
379+
backOff()
380+
continue
381+
}
370382
}
371-
s.fullResyncRequired = false
372383
}
373384

374385
// Opportunistically delete some temporary IP sets. It's possible
@@ -377,22 +388,26 @@ func (s *IPSets) ApplyUpdates(listener UpdateListener) {
377388
s.tryTempIPSetDeletions()
378389

379390
dirtyIPSets := s.dirtyIPSetsForUpdate()
380-
if err := s.tryUpdates(dirtyIPSets, listener); err != nil {
381-
if attempt >= 5 {
391+
if updateErr = s.tryUpdates(dirtyIPSets, listener); updateErr != nil {
392+
if treatFailureAsTransient {
393+
// Transient failure, resync the IP sets that we failed to update.
394+
s.logCxt.WithError(updateErr).WithField("attempt", attempt).Warning(
395+
"Failed to update IP sets. Will do partial resync.")
396+
} else {
382397
// Persistent failures, try a full resync.
383-
s.logCxt.WithError(err).WithField("attempt", attempt).Warning(
398+
s.logCxt.WithError(updateErr).WithField("attempt", attempt).Warning(
384399
"Persistently failed to update IP sets. Will do full resync.")
385400
s.QueueResync()
386-
} else {
387-
// More than one failure, resync the IP sets that we failed to update.
388-
s.logCxt.WithError(err).WithField("attempt", attempt).Warning(
389-
"Failed to update IP sets. Will do partial resync.")
390401
}
391402
countNumIPSetErrors.Inc()
403+
}
404+
405+
if resyncErr != nil || updateErr != nil {
392406
backOff()
393407
continue
394408
}
395409

410+
s.fullResyncRequired = false
396411
success = true
397412
break
398413
}
@@ -440,20 +455,18 @@ func (s *IPSets) tryResync() (err error) {
440455
// are known to exist.
441456
ipSets, err := s.CalicoIPSets()
442457
if err != nil {
443-
s.logCxt.WithError(err).Error("Failed to get the list of ipsets")
458+
s.logCxt.WithError(err).Error("Failed to get the list of Calico ipsets")
444459
return
445460
}
446461
if debug {
447-
s.logCxt.Debugf("List of ipsets: %v", ipSets)
462+
s.logCxt.Debugf("List of calico ipsets: %v", ipSets)
448463
}
449464

450465
ipSetPartOfSync := func(name string) bool {
451-
if s.fullResyncRequired {
452-
return true
453-
}
454-
return s.ipSetsRequiringResync.Contains(name)
466+
return s.fullResyncRequired || s.ipSetsRequiringResync.Contains(name)
455467
}
456468

469+
var failedIPSets []string
457470
for _, name := range ipSets {
458471
if !ipSetPartOfSync(name) {
459472
// Skipping this IP set on this pass.
@@ -462,16 +475,29 @@ func (s *IPSets) tryResync() (err error) {
462475
if debug {
463476
s.logCxt.Debugf("Parsing IP set %v.", name)
464477
}
465-
err = s.resyncIPSet(name)
466-
if err != nil {
467-
s.logCxt.WithError(err).Errorf("Failed to parse ipset %v", name)
468-
return
478+
if err = s.resyncIPSet(name); err != nil {
479+
// Ignore failures of IP sets not in desired state, as those will be cleaned up later.
480+
_, desired := s.setNameToProgrammedMetadata.Desired().Get(name)
481+
if desired {
482+
failedIPSets = append(failedIPSets, name)
483+
}
484+
if desired {
485+
s.logCxt.WithError(err).WithField("name", name).
486+
Warn("Failed to parse required Calico-owned ipset that is needed, will try recreating it.")
487+
} else {
488+
s.logCxt.WithError(err).WithField("name", name).
489+
Warn("Failed to parse Calico-owned ipset that is no longer needed, will queue it for deletion.")
490+
}
469491
} else {
470492
// Successful resync of this IP set, clear any pending partial resync.
471493
s.ipSetsRequiringResync.Discard(name)
472494
}
473495
}
474496

497+
if len(failedIPSets) > 0 {
498+
return fmt.Errorf("failed to parse IPSets %v", strings.Join(failedIPSets, ","))
499+
}
500+
475501
// Mark any IP sets that we didn't see as empty.
476502
for name, members := range s.mainSetNameToMembers {
477503
if !ipSetPartOfSync(name) {
@@ -551,8 +577,9 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
551577
//
552578
// As we stream through the data, we extract the name of the IP set and its members. We
553579
// use the IP set's metadata to convert each member to its canonical form for comparison.
580+
meta := dataplaneMetadata{}
581+
debug := log.GetLevel() >= log.DebugLevel
554582
err := s.runIPSetList(ipSetName, func(scanner *bufio.Scanner) error {
555-
debug := log.GetLevel() >= log.DebugLevel
556583
ipSetName := ""
557584
var ipSetType IPSetType
558585
for scanner.Scan() {
@@ -576,9 +603,7 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
576603
// When we hit the Header line we should know the name, and type of the IP set, which lets
577604
// us update the tracker.
578605
parts := strings.Split(line, " ")
579-
meta := dataplaneMetadata{
580-
Type: ipSetType,
581-
}
606+
meta.Type = ipSetType
582607
for idx, p := range parts {
583608
if p == "maxelem" {
584609
if idx+1 >= len(parts) {
@@ -609,7 +634,6 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
609634
break
610635
}
611636
}
612-
s.setNameToProgrammedMetadata.Dataplane().Set(ipSetName, meta)
613637
}
614638
if strings.HasPrefix(line, "Members:") {
615639
// Start of a Members entry, following this, there'll be one member per
@@ -681,9 +705,16 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
681705
return scanner.Err()
682706
})
683707
if err != nil {
684-
return err
708+
// This can occur if we have version skew with the version of IP set
709+
// used to create the IP set. Mark the metadata as invalid in order
710+
// to trigger the IP set to be recreated.
711+
meta.ListFailed = true
685712
}
686-
return nil
713+
if debug {
714+
s.logCxt.WithField("setName", ipSetName).Debugf("Parsed metadata from dataplane %+v", meta)
715+
}
716+
s.setNameToProgrammedMetadata.Dataplane().Set(ipSetName, meta)
717+
return err
687718
}
688719

689720
func (s *IPSets) runIPSetList(arg string, parsingFunc func(*bufio.Scanner) error) error {
@@ -1014,7 +1045,8 @@ func (s *IPSets) ApplyDeletions() bool {
10141045
if numDeletions >= MaxIPSetDeletionsPerIteration {
10151046
// Deleting IP sets is slow (40ms) and serialised in the kernel. Avoid holding up the main loop
10161047
// for too long. We'll leave the remaining sets pending deletion and mop them up next time.
1017-
log.Debugf("Deleted batch of %d IP sets, rate limiting further IP set deletions.", MaxIPSetDeletionsPerIteration)
1048+
log.Debugf("Deleted batch of %d IP sets, rate limiting further IP set deletions.",
1049+
MaxIPSetDeletionsPerIteration)
10181050
// Leave the item in the set, so we'll do another batch of deletions next time around the loop.
10191051
return deltatracker.IterActionNoOpStopIteration
10201052
}
@@ -1070,7 +1102,8 @@ func (s *IPSets) tryTempIPSetDeletions() {
10701102
if numDeletions >= MaxIPSetDeletionsPerIteration {
10711103
// Deleting IP sets is slow (40ms) and serialised in the kernel. Avoid holding up the main loop
10721104
// for too long. We'll leave the remaining sets pending deletion and mop them up next time.
1073-
log.Debugf("Deleted batch of 20 temp IP sets, rate limiting further IP set deletions.")
1105+
log.Debugf("Deleted batch of %d IP sets, rate limiting further IP set deletions.",
1106+
MaxIPSetDeletionsPerIteration)
10741107
// Leave the item in the set, so we'll do another batch of deletions next time around the loop.
10751108
return deltatracker.IterActionNoOpStopIteration
10761109
}

felix/ipsets/ipsets_test.go

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2016-2024 Tigera, Inc. All rights reserved.
1+
// Copyright (c) 2016-2025 Tigera, Inc. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -464,6 +464,45 @@ var _ = Describe("IP sets dataplane", func() {
464464
})
465465
})
466466

467+
It("Calico IP sets with unsupported revision and in desired sets should be re-created", func() {
468+
dataplane.IPSetMetadata = map[string]setMetadata{
469+
v4MainIPSetName: {
470+
Name: v4MainIPSetName,
471+
Family: "inet",
472+
Type: IPSetTypeHashIP,
473+
MaxSize: 1234,
474+
Revision: supportedMockRevision + 1,
475+
},
476+
v4MainIPSetName2: {
477+
Name: v4MainIPSetName2,
478+
Family: "inet",
479+
Type: IPSetTypeHashIP,
480+
MaxSize: 1234,
481+
Revision: supportedMockRevision + 1,
482+
},
483+
v4MainIPSetName3: {
484+
Name: v4MainIPSetName3,
485+
Family: "inet",
486+
Type: IPSetTypeHashIP,
487+
MaxSize: 1234,
488+
Revision: supportedMockRevision,
489+
},
490+
}
491+
dataplane.IPSetMembers[v4MainIPSetName] = set.From("10.0.0.1", "10.0.0.3")
492+
dataplane.IPSetMembers[v4MainIPSetName2] = set.From("10.0.0.1", "10.0.0.4")
493+
dataplane.IPSetMembers[v4MainIPSetName3] = set.From("10.0.0.5", "10.0.0.6")
494+
495+
ipsets.AddOrReplaceIPSet(meta, []string{"10.0.0.1", "10.0.0.2"})
496+
ipsets.AddOrReplaceIPSet(meta3, []string{"10.0.0.5", "10.0.0.6"})
497+
498+
apply()
499+
dataplane.ExpectMembers(map[string][]string{
500+
v4MainIPSetName: []string{"10.0.0.1", "10.0.0.2"},
501+
// v4MainIPSetName2 should be destroyed since it's not in the desired state.
502+
v4MainIPSetName3: []string{"10.0.0.5", "10.0.0.6"}, // This IPSet should not be touched.
503+
})
504+
})
505+
467506
Describe("with many left-over IP sets in place", func() {
468507
BeforeEach(func() {
469508
for i := 0; i < MaxIPSetDeletionsPerIteration*3; i++ {
@@ -795,7 +834,7 @@ var _ = Describe("IP sets dataplane", func() {
795834

796835
It("should be detected after many transient errors", func() {
797836
// Simulate lots of transient failures in a row, followed by success.
798-
dataplane.RestoreOpFailures = slices.Repeat([]string{"write-ip"}, 6)
837+
dataplane.RestoreOpFailures = slices.Repeat([]string{"write-ip"}, (MaxRetryAttempt/2)+1)
799838
// Trigger an update to only one IP set.
800839
ipsets.AddMembers(ipSetID2, []string{"10.0.0.4"})
801840
apply()

felix/ipsets/utils_for_test.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2016-2024 Tigera, Inc. All rights reserved.
1+
// Copyright (c) 2016-2025 Tigera, Inc. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -36,6 +36,10 @@ import (
3636

3737
// This file contains shared test infrastructure for testing the ipsets package.
3838

39+
const (
40+
supportedMockRevision = 5
41+
)
42+
3943
var (
4044
transientFailure = errors.New("Simulated transient failure")
4145
permanentFailure = errors.New("Simulated permanent failure")
@@ -454,6 +458,7 @@ type setMetadata struct {
454458
Name string
455459
Family IPFamily
456460
Type IPSetType
461+
Revision int
457462
MaxSize int
458463
RangeMin int
459464
RangeMax int
@@ -726,18 +731,26 @@ func (c *listCmd) main() {
726731
result = fmt.Errorf("ipset %v does not exists", c.SetName)
727732
return
728733
}
729-
writef("Name: %s\n", c.SetName)
730734
meta, ok := c.Dataplane.IPSetMetadata[c.SetName]
731735
if !ok {
732736
// Default metadata for IP sets created by tests.
733737
meta = setMetadata{
734-
Name: v4MainIPSetName,
735-
Family: IPFamilyV4,
736-
Type: IPSetTypeHashIP,
737-
MaxSize: 1234,
738+
Name: v4MainIPSetName,
739+
Family: IPFamilyV4,
740+
Type: IPSetTypeHashIP,
741+
Revision: supportedMockRevision,
742+
MaxSize: 1234,
738743
}
739744
}
745+
746+
if meta.Revision > supportedMockRevision {
747+
result = fmt.Errorf("revision %v not supported", meta.Revision)
748+
return
749+
}
750+
751+
writef("Name: %s\n", c.SetName)
740752
writef("Type: %s\n", meta.Type)
753+
writef("Revision: %d\n", meta.Revision)
741754
if meta.Type == IPSetTypeBitmapPort {
742755
writef("Header: family %s range %d-%d\n", meta.Family, meta.RangeMin, meta.RangeMax)
743756
} else if meta.Type == "unknown:type" {

0 commit comments

Comments
 (0)