11package manager
22
33import (
4+ "context"
45 "fmt"
56 "sync"
67
78 networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned"
89 nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned"
10+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
911 "k8s.io/apimachinery/pkg/types"
1012 "k8s.io/client-go/kubernetes"
1113 providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1"
@@ -21,9 +23,17 @@ import (
2123 "k8s.io/klog/v2"
2224)
2325
26+ // startNEGController is a package-level variable to allow tests to stub the
27+ // actual NEG controller startup routine.
28+ var startNEGController = neg .StartNEGController
29+
30+ // ProviderConfigControllersManager coordinates lifecycle of controllers scoped to
31+ // a single ProviderConfig. It ensures per-ProviderConfig controller startup is
32+ // idempotent, adds/removes the NEG cleanup finalizer, and wires a stop channel
33+ // for clean shutdown.
2434type ProviderConfigControllersManager struct {
2535 mu sync.Mutex
26- controllers map [string ]* ControllerSet
36+ controllers map [string ]* controllerSet
2737
2838 logger klog.Logger
2939 providerConfigClient providerconfigclient.Interface
@@ -43,10 +53,14 @@ type ProviderConfigControllersManager struct {
4353 syncerMetrics * syncMetrics.SyncerMetrics
4454}
4555
46- type ControllerSet struct {
56+ // controllerSet holds controller-specific resources for a ProviderConfig.
57+ // Unexported because it is an internal implementation detail.
58+ type controllerSet struct {
4759 stopCh chan <- struct {}
4860}
4961
62+ // NewProviderConfigControllerManager constructs a new ProviderConfigControllersManager.
63+ // It does not start any controllers until StartControllersForProviderConfig is invoked.
5064func NewProviderConfigControllerManager (
5165 kubeClient kubernetes.Interface ,
5266 informers * multiprojectinformers.InformerSet ,
@@ -65,7 +79,7 @@ func NewProviderConfigControllerManager(
6579 syncerMetrics * syncMetrics.SyncerMetrics ,
6680) * ProviderConfigControllersManager {
6781 return & ProviderConfigControllersManager {
68- controllers : make (map [string ]* ControllerSet ),
82+ controllers : make (map [string ]* controllerSet ),
6983 logger : logger ,
7084 providerConfigClient : providerConfigClient ,
7185 informers : informers ,
@@ -88,6 +102,10 @@ func providerConfigKey(pc *providerconfig.ProviderConfig) string {
88102 return pc .Name
89103}
90104
105+ // StartControllersForProviderConfig ensures finalizers are present and starts
106+ // the controllers associated with the given ProviderConfig. The call is
107+ // idempotent per ProviderConfig: concurrent or repeated calls for the same
108+ // ProviderConfig will only start controllers once.
91109func (pccm * ProviderConfigControllersManager ) StartControllersForProviderConfig (pc * providerconfig.ProviderConfig ) error {
92110 pccm .mu .Lock ()
93111 defer pccm .mu .Unlock ()
@@ -102,17 +120,23 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(
102120 return nil
103121 }
104122
123+ // Add the cleanup finalizer up front to avoid a window where controllers
124+ // may create external resources without a finalizer present. If deletion
125+ // happens in that window, cleanup could be skipped. We roll this back on
126+ // any subsequent startup error.
105127 err := finalizer .EnsureProviderConfigNEGCleanupFinalizer (pc , pccm .providerConfigClient , logger )
106128 if err != nil {
107- return fmt .Errorf ("failed to ensure NEG cleanup finalizer for project %s: %v " , pcKey , err )
129+ return fmt .Errorf ("failed to ensure NEG cleanup finalizer for project %s: %w " , pcKey , err )
108130 }
109131
110132 cloud , err := pccm .gceCreator .GCEForProviderConfig (pc , logger )
111133 if err != nil {
112- return fmt .Errorf ("failed to create GCE client for provider config %+v: %v" , pc , err )
134+ // If GCE client creation fails after finalizer was added, roll it back.
135+ pccm .rollbackFinalizerOnStartFailure (pc , logger , err )
136+ return fmt .Errorf ("failed to create GCE client for provider config %+v: %w" , pc , err )
113137 }
114138
115- negControllerStopCh , err := neg . StartNEGController (
139+ negControllerStopCh , err := startNEGController (
116140 pccm .informers ,
117141 pccm .kubeClient ,
118142 pccm .eventRecorderClient ,
@@ -130,14 +154,12 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(
130154 pccm .syncerMetrics ,
131155 )
132156 if err != nil {
133- cleanupErr := finalizer .DeleteProviderConfigNEGCleanupFinalizer (pc , pccm .providerConfigClient , logger )
134- if cleanupErr != nil {
135- logger .Error (cleanupErr , "failed to clean up NEG finalizer after controller start failure" , "originalError" , err )
136- }
137- return fmt .Errorf ("failed to start NEG controller for project %s: %v" , pcKey , err )
157+ // If startup fails, make a best-effort to roll back the finalizer.
158+ pccm .rollbackFinalizerOnStartFailure (pc , logger , err )
159+ return fmt .Errorf ("failed to start NEG controller for project %s: %w" , pcKey , err )
138160 }
139161
140- pccm .controllers [pcKey ] = & ControllerSet {
162+ pccm .controllers [pcKey ] = & controllerSet {
141163 stopCh : negControllerStopCh ,
142164 }
143165
@@ -161,9 +183,34 @@ func (pccm *ProviderConfigControllersManager) StopControllersForProviderConfig(p
161183 close (pccm .controllers [csKey ].stopCh )
162184
163185 delete (pccm .controllers , csKey )
164- err := finalizer .DeleteProviderConfigNEGCleanupFinalizer (pc , pccm .providerConfigClient , logger )
186+
187+ // Ensure we remove the finalizer from the latest object state.
188+ pcLatest := pccm .latestPCWithCleanupFinalizer (pc )
189+ err := finalizer .DeleteProviderConfigNEGCleanupFinalizer (pcLatest , pccm .providerConfigClient , logger )
165190 if err != nil {
166191 logger .Error (err , "failed to delete NEG cleanup finalizer for project" )
167192 }
168193 logger .Info ("Stopped controllers for provider config" )
169194}
195+
196+ // rollbackFinalizerOnStartFailure removes the NEG cleanup finalizer after a
197+ // start failure so that ProviderConfig deletion is not blocked.
198+ func (pccm * ProviderConfigControllersManager ) rollbackFinalizerOnStartFailure (pc * providerconfig.ProviderConfig , logger klog.Logger , cause error ) {
199+ pcLatest := pccm .latestPCWithCleanupFinalizer (pc )
200+ if err := finalizer .DeleteProviderConfigNEGCleanupFinalizer (pcLatest , pccm .providerConfigClient , logger ); err != nil {
201+ logger .Error (err , "failed to clean up NEG finalizer after start failure" , "originalError" , cause )
202+ }
203+ }
204+
205+ // latestPCWithCleanupFinalizer returns the latest ProviderConfig from the API server.
206+ // If the Get fails, it returns a local copy of the provided pc with the cleanup
207+ // finalizer appended to ensure a subsequent delete attempt is not a no-op.
208+ func (pccm * ProviderConfigControllersManager ) latestPCWithCleanupFinalizer (pc * providerconfig.ProviderConfig ) * providerconfig.ProviderConfig {
209+ pcLatest , err := pccm .providerConfigClient .CloudV1 ().ProviderConfigs ().Get (context .Background (), pc .Name , metav1.GetOptions {})
210+ if err != nil {
211+ pcCopy := pc .DeepCopy ()
212+ pcCopy .Finalizers = append (pcCopy .Finalizers , finalizer .ProviderConfigNEGCleanupFinalizer )
213+ return pcCopy
214+ }
215+ return pcLatest
216+ }
0 commit comments