1- // Copyright (c) 2017-2024 Tigera, Inc. All rights reserved.
1+ // Copyright (c) 2017-2025 Tigera, Inc. All rights reserved.
22//
33// Licensed under the Apache License, Version 2.0 (the "License");
44// you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@ import (
3636
3737const (
3838 MaxIPSetDeletionsPerIteration = 1
39+ MaxRetryAttempt = 10
3940)
4041
4142type dataplaneMetadata struct {
@@ -44,6 +45,7 @@ type dataplaneMetadata struct {
4445 RangeMin int
4546 RangeMax int
4647 DeleteFailed bool
48+ ListFailed bool
4749}
4850
4951// IPSets manages a whole "plane" of IP sets, i.e. all the IPv4 sets, or all the IPv6 IP sets.
@@ -353,22 +355,31 @@ func (s *IPSets) ApplyUpdates(listener UpdateListener) {
353355 retryDelay *= 2
354356 }
355357
356- for attempt := 0 ; attempt < 10 ; attempt ++ {
358+ var resyncErr , updateErr error
359+ for attempt := 0 ; attempt < MaxRetryAttempt ; attempt ++ {
357360 if attempt > 0 {
358361 s .logCxt .Info ("Retrying after an ipsets update failure..." )
359362 }
363+ treatFailureAsTransient := attempt < MaxRetryAttempt / 2
360364 if s .fullResyncRequired || s .ipSetsRequiringResync .Len () > 0 {
361365 // Compare our in-memory state against the dataplane and queue up
362366 // modifications to fix any inconsistencies.
363367 s .logCxt .Debug ("Resyncing ipsets with dataplane." )
364368 s .opReporter .RecordOperation (fmt .Sprint ("resync-ipsets-v" , s .IPVersionConfig .Family .Version ()))
365369
366- if err := s .tryResync (); err != nil {
367- s .logCxt .WithError (err ).Warning ("Failed to resync with dataplane" )
368- backOff ()
369- continue
370+ if resyncErr = s .tryResync (); resyncErr != nil {
371+ s .logCxt .WithError (resyncErr ).Warning ("Failed to resync with dataplane" )
372+
373+ // After a few attempts, most likely, we are dealing with a persistent failure.
374+ // This could be due to different failures, like userspace and kernel incompatibility.
375+ // The incompatibility failure can be fixed by swapping the one Felix understand (created from
376+ // desired state) and the one (with higher revision) in dataplane. As such, we should stop re-trying
377+ // and instead fall through to tryUpdates() below, which will do the swap.
378+ if treatFailureAsTransient {
379+ backOff ()
380+ continue
381+ }
370382 }
371- s .fullResyncRequired = false
372383 }
373384
374385 // Opportunistically delete some temporary IP sets. It's possible
@@ -377,22 +388,26 @@ func (s *IPSets) ApplyUpdates(listener UpdateListener) {
377388 s .tryTempIPSetDeletions ()
378389
379390 dirtyIPSets := s .dirtyIPSetsForUpdate ()
380- if err := s .tryUpdates (dirtyIPSets , listener ); err != nil {
381- if attempt >= 5 {
391+ if updateErr = s .tryUpdates (dirtyIPSets , listener ); updateErr != nil {
392+ if treatFailureAsTransient {
393+ // Transient failure, resync the IP sets that we failed to update.
394+ s .logCxt .WithError (updateErr ).WithField ("attempt" , attempt ).Warning (
395+ "Failed to update IP sets. Will do partial resync." )
396+ } else {
382397 // Persistent failures, try a full resync.
383- s .logCxt .WithError (err ).WithField ("attempt" , attempt ).Warning (
398+ s .logCxt .WithError (updateErr ).WithField ("attempt" , attempt ).Warning (
384399 "Persistently failed to update IP sets. Will do full resync." )
385400 s .QueueResync ()
386- } else {
387- // More than one failure, resync the IP sets that we failed to update.
388- s .logCxt .WithError (err ).WithField ("attempt" , attempt ).Warning (
389- "Failed to update IP sets. Will do partial resync." )
390401 }
391402 countNumIPSetErrors .Inc ()
403+ }
404+
405+ if resyncErr != nil || updateErr != nil {
392406 backOff ()
393407 continue
394408 }
395409
410+ s .fullResyncRequired = false
396411 success = true
397412 break
398413 }
@@ -440,20 +455,18 @@ func (s *IPSets) tryResync() (err error) {
440455 // are known to exist.
441456 ipSets , err := s .CalicoIPSets ()
442457 if err != nil {
443- s .logCxt .WithError (err ).Error ("Failed to get the list of ipsets" )
458+ s .logCxt .WithError (err ).Error ("Failed to get the list of Calico ipsets" )
444459 return
445460 }
446461 if debug {
447- s .logCxt .Debugf ("List of ipsets: %v" , ipSets )
462+ s .logCxt .Debugf ("List of calico ipsets: %v" , ipSets )
448463 }
449464
450465 ipSetPartOfSync := func (name string ) bool {
451- if s .fullResyncRequired {
452- return true
453- }
454- return s .ipSetsRequiringResync .Contains (name )
466+ return s .fullResyncRequired || s .ipSetsRequiringResync .Contains (name )
455467 }
456468
469+ var failedIPSets []string
457470 for _ , name := range ipSets {
458471 if ! ipSetPartOfSync (name ) {
459472 // Skipping this IP set on this pass.
@@ -462,16 +475,29 @@ func (s *IPSets) tryResync() (err error) {
462475 if debug {
463476 s .logCxt .Debugf ("Parsing IP set %v." , name )
464477 }
465- err = s .resyncIPSet (name )
466- if err != nil {
467- s .logCxt .WithError (err ).Errorf ("Failed to parse ipset %v" , name )
468- return
478+ if err = s .resyncIPSet (name ); err != nil {
479+ // Ignore failures of IP sets not in desired state, as those will be cleaned up later.
480+ _ , desired := s .setNameToProgrammedMetadata .Desired ().Get (name )
481+ if desired {
482+ failedIPSets = append (failedIPSets , name )
483+ }
484+ if desired {
485+ s .logCxt .WithError (err ).WithField ("name" , name ).
486+ Warn ("Failed to parse required Calico-owned ipset that is needed, will try recreating it." )
487+ } else {
488+ s .logCxt .WithError (err ).WithField ("name" , name ).
489+ Warn ("Failed to parse Calico-owned ipset that is no longer needed, will queue it for deletion." )
490+ }
469491 } else {
470492 // Successful resync of this IP set, clear any pending partial resync.
471493 s .ipSetsRequiringResync .Discard (name )
472494 }
473495 }
474496
497+ if len (failedIPSets ) > 0 {
498+ return fmt .Errorf ("failed to parse IPSets %v" , strings .Join (failedIPSets , "," ))
499+ }
500+
475501 // Mark any IP sets that we didn't see as empty.
476502 for name , members := range s .mainSetNameToMembers {
477503 if ! ipSetPartOfSync (name ) {
@@ -551,8 +577,9 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
551577 //
552578 // As we stream through the data, we extract the name of the IP set and its members. We
553579 // use the IP set's metadata to convert each member to its canonical form for comparison.
580+ meta := dataplaneMetadata {}
581+ debug := log .GetLevel () >= log .DebugLevel
554582 err := s .runIPSetList (ipSetName , func (scanner * bufio.Scanner ) error {
555- debug := log .GetLevel () >= log .DebugLevel
556583 ipSetName := ""
557584 var ipSetType IPSetType
558585 for scanner .Scan () {
@@ -576,9 +603,7 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
576603 // When we hit the Header line we should know the name, and type of the IP set, which lets
577604 // us update the tracker.
578605 parts := strings .Split (line , " " )
579- meta := dataplaneMetadata {
580- Type : ipSetType ,
581- }
606+ meta .Type = ipSetType
582607 for idx , p := range parts {
583608 if p == "maxelem" {
584609 if idx + 1 >= len (parts ) {
@@ -609,7 +634,6 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
609634 break
610635 }
611636 }
612- s .setNameToProgrammedMetadata .Dataplane ().Set (ipSetName , meta )
613637 }
614638 if strings .HasPrefix (line , "Members:" ) {
615639 // Start of a Members entry, following this, there'll be one member per
@@ -681,9 +705,16 @@ func (s *IPSets) resyncIPSet(ipSetName string) error {
681705 return scanner .Err ()
682706 })
683707 if err != nil {
684- return err
708+ // This can occur if we have version skew with the version of IP set
709+ // used to create the IP set. Mark the metadata as invalid in order
710+ // to trigger the IP set to be recreated.
711+ meta .ListFailed = true
685712 }
686- return nil
713+ if debug {
714+ s .logCxt .WithField ("setName" , ipSetName ).Debugf ("Parsed metadata from dataplane %+v" , meta )
715+ }
716+ s .setNameToProgrammedMetadata .Dataplane ().Set (ipSetName , meta )
717+ return err
687718}
688719
689720func (s * IPSets ) runIPSetList (arg string , parsingFunc func (* bufio.Scanner ) error ) error {
@@ -1014,7 +1045,8 @@ func (s *IPSets) ApplyDeletions() bool {
10141045 if numDeletions >= MaxIPSetDeletionsPerIteration {
10151046 // Deleting IP sets is slow (40ms) and serialised in the kernel. Avoid holding up the main loop
10161047 // for too long. We'll leave the remaining sets pending deletion and mop them up next time.
1017- log .Debugf ("Deleted batch of %d IP sets, rate limiting further IP set deletions." , MaxIPSetDeletionsPerIteration )
1048+ log .Debugf ("Deleted batch of %d IP sets, rate limiting further IP set deletions." ,
1049+ MaxIPSetDeletionsPerIteration )
10181050 // Leave the item in the set, so we'll do another batch of deletions next time around the loop.
10191051 return deltatracker .IterActionNoOpStopIteration
10201052 }
@@ -1070,7 +1102,8 @@ func (s *IPSets) tryTempIPSetDeletions() {
10701102 if numDeletions >= MaxIPSetDeletionsPerIteration {
10711103 // Deleting IP sets is slow (40ms) and serialised in the kernel. Avoid holding up the main loop
10721104 // for too long. We'll leave the remaining sets pending deletion and mop them up next time.
1073- log .Debugf ("Deleted batch of 20 temp IP sets, rate limiting further IP set deletions." )
1105+ log .Debugf ("Deleted batch of %d IP sets, rate limiting further IP set deletions." ,
1106+ MaxIPSetDeletionsPerIteration )
10741107 // Leave the item in the set, so we'll do another batch of deletions next time around the loop.
10751108 return deltatracker .IterActionNoOpStopIteration
10761109 }
0 commit comments