From a59f8598dd17be7a2db94752138ad4023d7d112a Mon Sep 17 00:00:00 2001
From: Slavik Panasovets
Date: Fri, 29 Aug 2025 13:54:23 -0700
Subject: [PATCH 1/6] Centralize informer management with InformerSet

Replace factory-based informer creation with centralized InformerSet
that:
- Creates base SharedIndexInformers directly without factories
- Manages lifecycle and synchronization in one place
- Provides filtered views for ProviderConfig-specific controllers
- Reduces boilerplate and improves maintainability

Using informers directly without factories simplifies the logic and
eliminates potential mistakes from unnecessary factory usage, such as
accidentally creating duplicate informers or incorrect factory scoping.
---
 cmd/glbc/main.go                              |  29 +-
 pkg/multiproject/README.md                    | 179 +++++++++++
 pkg/multiproject/informerset/informerset.go   | 281 ++++++++++++++++++
 .../informerset/informerset_test.go           | 260 ++++++++++++++++
 pkg/multiproject/manager/manager.go           |  54 ++--
 pkg/multiproject/neg/neg.go                   | 173 ++---------
 pkg/multiproject/start/start.go               |  72 +++--
 pkg/multiproject/start/start_test.go          |  47 +--
 .../informers/externalversions/factory.go     | 262 ----------------
 .../informers/externalversions/generic.go     |  68 -----
 .../externalversions/network/interface.go     |  46 ---
 .../informers/externalversions/factory.go     | 262 ----------------
 .../informers/externalversions/generic.go     |  62 ----
 .../nodetopology/interface.go                 |  46 ---
 vendor/modules.txt                            |   4 -
 15 files changed, 828 insertions(+), 1017 deletions(-)
 create mode 100644 pkg/multiproject/README.md
 create mode 100644 pkg/multiproject/informerset/informerset.go
 create mode 100644 pkg/multiproject/informerset/informerset_test.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/factory.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/generic.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/interface.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/factory.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/generic.go
 delete mode 100644 vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/interface.go

diff --git a/cmd/glbc/main.go b/cmd/glbc/main.go
index 5891b16b29..1f889a07b9 100644
--- a/cmd/glbc/main.go
+++ b/cmd/glbc/main.go
@@ -27,14 +27,11 @@ import (
 	firewallcrclient "github.com/GoogleCloudPlatform/gke-networking-api/client/gcpfirewall/clientset/versioned"
 	networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned"
-	informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions"
 	nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned"
-	informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions"
 	k8scp "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud"
 	flag "github.com/spf13/pflag"
 	crdclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	informers "k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 	restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/leaderelection" @@ -54,7 +51,6 @@ import ( serviceattachmentclient "k8s.io/ingress-gce/pkg/serviceattachment/client/clientset/versioned" "k8s.io/ingress-gce/pkg/svcneg" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions" "k8s.io/ingress-gce/pkg/systemhealth" "k8s.io/ingress-gce/pkg/utils" "k8s.io/klog/v2" @@ -259,19 +255,6 @@ func main() { if err != nil { klog.Fatalf("Failed to create ProviderConfig client: %v", err) } - informersFactory := informers.NewSharedInformerFactory(kubeClient, flags.F.ResyncPeriod) - var svcNegFactory informersvcneg.SharedInformerFactory - if svcNegClient != nil { - svcNegFactory = informersvcneg.NewSharedInformerFactory(svcNegClient, flags.F.ResyncPeriod) - } - var networkFactory informernetwork.SharedInformerFactory - if networkClient != nil { - networkFactory = informernetwork.NewSharedInformerFactory(networkClient, flags.F.ResyncPeriod) - } - var nodeTopologyFactory informernodetopology.SharedInformerFactory - if nodeTopologyClient != nil { - nodeTopologyFactory = informernodetopology.NewSharedInformerFactory(nodeTopologyClient, flags.F.ResyncPeriod) - } ctx := context.Background() syncerMetrics := syncMetrics.NewNegMetricsCollector(flags.F.NegMetricsExportInterval, rootLogger) go syncerMetrics.Run(stopCh) @@ -284,13 +267,11 @@ func main() { rootLogger, kubeClient, svcNegClient, + networkClient, + nodeTopologyClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, - informersFactory, - svcNegFactory, - networkFactory, - nodeTopologyFactory, gceCreator, namer, stopCh, @@ -305,13 +286,11 @@ func main() { rootLogger, kubeClient, svcNegClient, + networkClient, + nodeTopologyClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, - informersFactory, - svcNegFactory, - networkFactory, - nodeTopologyFactory, gceCreator, namer, stopCh, diff --git a/pkg/multiproject/README.md b/pkg/multiproject/README.md new file mode 100644 index 0000000000..9be6c1da26 --- /dev/null +++ b/pkg/multiproject/README.md @@ -0,0 +1,179 @@ +# Multi-Project Controller Architecture + +## Overview + +The multi-project controller enables Kubernetes ingress-gce to manage Network Endpoint Groups (NEGs) across multiple Google Cloud Platform (GCP) projects. This allows for multi-tenant scenarios where different namespaces or services can be associated with different GCP projects through ProviderConfig resources. 
+
+## Architecture
+
+### Core Components
+
+```
+┌──────────────────────────────────────────────────────────────┐
+│                         Main Process                         │
+│                                                              │
+│  ┌────────────────────────────────────────────────────────┐  │
+│  │ start.Start()                                          │  │
+│  │ - Creates base SharedIndexInformers                    │  │
+│  │ - Starts informers with globalStopCh                   │  │
+│  │ - Creates ProviderConfigController                     │  │
+│  └────────────────────┬───────────────────────────────────┘  │
+│                       │                                      │
+│  ┌────────────────────▼───────────────────────────────────┐  │
+│  │ ProviderConfigController                               │  │
+│  │ - Watches ProviderConfig resources                     │  │
+│  │ - Manages lifecycle of per-PC controllers              │  │
+│  └────────────────────┬───────────────────────────────────┘  │
+│                       │                                      │
+│  ┌────────────────────▼───────────────────────────────────┐  │
+│  │ ProviderConfigControllersManager                       │  │
+│  │ - Starts/stops NEG controllers per ProviderConfig      │  │
+│  │ - Manages controller lifecycle                         │  │
+│  └──────────┬─────────────────────┬───────────────────────┘  │
+│             │                     │                          │
+│    ┌────────▼───────────┐   ┌─────▼──────────────┐           │
+│    │ NEG Controller #1  │   │ NEG Controller #2  │  ...      │
+│    │ (ProviderConfig A) │   │ (ProviderConfig B) │           │
+│    └────────────────────┘   └────────────────────┘           │
+└──────────────────────────────────────────────────────────────┘
+```
+
+### Key Design Principles
+
+1. **Shared Informers**: Base informers are created once and shared across all ProviderConfig controllers
+2. **Filtered Views**: Each NEG controller gets a filtered view of resources based on ProviderConfig
+3. **Lifecycle Management**: Controllers can be started/stopped independently as ProviderConfigs are added/removed
+4. **Channel Management**: Proper channel lifecycle ensures clean shutdown and resource cleanup
+
+## Component Details
+
+### start/start.go
+Main entry point that:
+- Creates base SharedIndexInformers via InformerSet (no factories)
+- Starts all informers with the global stop channel
+- Creates the ProviderConfigController
+- Manages leader election (when enabled)
+
+### controller/controller.go
+ProviderConfigController that:
+- Watches ProviderConfig resources
+- Enqueues changes for processing
+- Delegates to ProviderConfigControllersManager
+
+### manager/manager.go
+ProviderConfigControllersManager that:
+- Maintains a map of active controllers per ProviderConfig
+- Starts NEG controllers when ProviderConfigs are added
+- Stops NEG controllers when ProviderConfigs are deleted
+- Manages finalizers for cleanup
+
+### neg/neg.go
+NEG controller factory that:
+- Wraps base SharedIndexInformers with provider-config filters via ProviderConfigFilteredInformer
+- Sets up the NEG controller with proper GCE client
+- Manages channel lifecycle (globalStopCh vs providerConfigStopCh)
+
+### filteredinformer/
+Filtered informer implementation that:
+- Wraps base SharedIndexInformers
+- Filters resources based on ProviderConfig labels
+- Provides filtered cache/store views
+
+## Channel Lifecycle
+
+The implementation uses three types of channels:
+
+1. **globalStopCh**: Process-wide shutdown signal
+   - Closes on leader election loss or process termination
+   - Used by base informers and shared resources
+
+2. **providerConfigStopCh**: Per-ProviderConfig shutdown signal
+   - Closed when a ProviderConfig is deleted
+   - Used to stop PC-specific controllers
+
+3. 
**joinedStopCh**: Combined shutdown signal + - Closes when either globalStopCh OR providerConfigStopCh closes + - Used by PC-specific resources that should stop in either case + +## Resource Filtering + +Resources are associated with ProviderConfigs through labels: +- Services, Ingresses, etc. have a label indicating their ProviderConfig +- The filtered informer only passes through resources matching the PC name +- This ensures each controller only sees and manages its own resources + +## Informer Lifecycle + +### Creation +1. Base informers are created via `InformerSet` using `NewXInformer()` functions +2. Base informers are started by `InformerSet.Start` with `globalStopCh` +3. Filtered wrappers are created per ProviderConfig using `ProviderConfigFilteredInformer` + +### Synchronization +- `InformerSet.Start` waits for base informer caches to sync +- Filtered informers rely on the synced base caches +- Controllers use `CombinedHasSynced()` from filtered informers before processing + +### Shutdown +- Base informers stop when globalStopCh closes +- Filtered informers are just wrappers (no separate shutdown) +- Controllers stop when their providerConfigStopCh closes + +## Configuration + +Key configuration flags: +- `--provider-config-name-label-key`: Label key for PC association (default: cloud.gke.io/provider-config-name) +- `--multi-project-owner-label-key`: Label key for PC owner +- `--resync-period`: Informer resync period + +## Testing + +### Unit Tests +- Controller logic testing +- Filter functionality testing +- Channel lifecycle testing + +### Integration Tests +- Multi-ProviderConfig scenarios +- Controller start/stop sequencing +- Resource cleanup verification + +### Key Test Scenarios +1. Single ProviderConfig with services +2. Multiple ProviderConfigs +3. ProviderConfig deletion and cleanup +4. Shared informer survival across PC changes + +## Common Operations + +### Adding a ProviderConfig +1. Create ProviderConfig resource +2. Controller detects addition +3. Manager starts NEG controller +4. NEG controller creates filtered informers +5. NEGs are created in target GCP project + +### Removing a ProviderConfig + +The deletion process follows a specific sequence to ensure proper cleanup: + +1. **External automation initiates deletion**: + - Server-side automation triggers the deletion process + - All namespaces belonging to the ProviderConfig are deleted first + +2. **Namespace cleanup**: + - Kubernetes deletes all resources within the namespaces + - Services are deleted, triggering NEG cleanup + - NEG controller removes NEGs from GCP as services are deleted + +3. **Wait for namespace deletion**: + - External automation waits for all namespaces to be fully deleted + - This ensures all NEGs and other resources are cleaned up + +4. **ProviderConfig deletion**: + - Only after namespaces are gone, ProviderConfig is deleted + - Controller stops the NEG controller for this ProviderConfig + - Finalizer is removed from ProviderConfig + - ProviderConfig resource is removed from Kubernetes + +**Important**: NEGs are not automatically deleted when a ProviderConfig is removed. They are cleaned up as part of the namespace/service deletion process that happens before ProviderConfig deletion. 
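+
+## Usage Sketch
+
+The snippet below shows how the pieces above are meant to be wired together.
+It is a minimal sketch, not code from this repository: the helper name
+`startMultiProjectInformers` and the ProviderConfig name `provider-config-a`
+are illustrative, imports are elided, and the clients, logger, and stop
+channel are assumed to be constructed the way `start.Start()` builds them.
+
+```go
+// startMultiProjectInformers is a hypothetical helper showing the wiring.
+func startMultiProjectInformers(
+	kubeClient kubernetes.Interface,
+	svcNegClient svcnegclient.Interface,
+	networkClient networkclient.Interface,
+	nodeTopologyClient nodetopologyclient.Interface,
+	globalStopCh <-chan struct{},
+	logger klog.Logger,
+) (*informerset.InformerSet, error) {
+	// Create the base informers once per process.
+	informers := informerset.NewInformerSet(
+		kubeClient, svcNegClient, networkClient, nodeTopologyClient,
+		metav1.Duration{Duration: flags.F.ResyncPeriod},
+	)
+
+	// Run them on the process-wide stop channel; Start returns only after
+	// the caches have synced (or with an error).
+	if err := informers.Start(globalStopCh, logger); err != nil {
+		return nil, err
+	}
+
+	// Per ProviderConfig, derive a filtered view. The wrappers share the
+	// base caches, so no additional watches are opened on the API server.
+	filtered := informers.FilterByProviderConfig("provider-config-a")
+	_ = filtered.Service             // sees only provider-config-a resources
+	_ = filtered.CombinedHasSynced() // controllers gate processing on this
+
+	return informers, nil
+}
+```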
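+
+The `joinedStopCh` described in the Channel Lifecycle section can be built
+with a helper along these lines. This is a sketch of the standard Go pattern
+for joining two stop channels; the name `joinStopChannels` is illustrative,
+not necessarily the helper used in this repository:
+
+```go
+// joinStopChannels returns a channel that closes as soon as either input
+// channel closes, whichever happens first.
+func joinStopChannels(globalStopCh, providerConfigStopCh <-chan struct{}) <-chan struct{} {
+	joinedStopCh := make(chan struct{})
+	go func() {
+		defer close(joinedStopCh)
+		select {
+		case <-globalStopCh:
+		case <-providerConfigStopCh:
+		}
+	}()
+	return joinedStopCh
+}
+```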
diff --git a/pkg/multiproject/informerset/informerset.go b/pkg/multiproject/informerset/informerset.go new file mode 100644 index 0000000000..8c7eb64986 --- /dev/null +++ b/pkg/multiproject/informerset/informerset.go @@ -0,0 +1,281 @@ +package informerset + +import ( + "fmt" + + networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" + informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/v1" + nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" + informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + informerv1 "k8s.io/client-go/informers/core/v1" + discoveryinformer "k8s.io/client-go/informers/discovery/v1" + informernetworking "k8s.io/client-go/informers/networking/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" + "k8s.io/ingress-gce/pkg/multiproject/filteredinformer" + svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" + informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions/svcneg/v1beta1" + "k8s.io/ingress-gce/pkg/utils" + "k8s.io/ingress-gce/pkg/utils/endpointslices" + "k8s.io/klog/v2" +) + +// InformerSet manages all shared informers used by multiproject controllers. +// It provides centralized initialization and lifecycle management. +type InformerSet struct { + // Core Kubernetes informers (always present) + Ingress cache.SharedIndexInformer + Service cache.SharedIndexInformer + Pod cache.SharedIndexInformer + Node cache.SharedIndexInformer + EndpointSlice cache.SharedIndexInformer + + // Custom resource informers (may be nil) + SvcNeg cache.SharedIndexInformer // ServiceNetworkEndpointGroups CRD + Network cache.SharedIndexInformer // GKE Network CRD + GkeNetworkParams cache.SharedIndexInformer // GKENetworkParamSets CRD + NodeTopology cache.SharedIndexInformer // NodeTopology CRD + + // State tracking + started bool +} + +// NewInformerSet creates and initializes a new InformerSet with all required informers. +// Optional CRD informers are created only when corresponding clients are provided; +// those fields remain nil otherwise. 
+func NewInformerSet(
+	kubeClient kubernetes.Interface,
+	svcNegClient svcnegclient.Interface,
+	networkClient networkclient.Interface,
+	nodeTopologyClient nodetopologyclient.Interface,
+	resyncPeriod metav1.Duration,
+) *InformerSet {
+	informers := &InformerSet{}
+
+	// Create core Kubernetes informers
+	informers.Ingress = informernetworking.NewIngressInformer(
+		kubeClient,
+		metav1.NamespaceAll,
+		resyncPeriod.Duration,
+		utils.NewNamespaceIndexer(),
+	)
+
+	informers.Service = informerv1.NewServiceInformer(
+		kubeClient,
+		metav1.NamespaceAll,
+		resyncPeriod.Duration,
+		utils.NewNamespaceIndexer(),
+	)
+
+	informers.Pod = informerv1.NewPodInformer(
+		kubeClient,
+		metav1.NamespaceAll,
+		resyncPeriod.Duration,
+		utils.NewNamespaceIndexer(),
+	)
+
+	informers.Node = informerv1.NewNodeInformer(
+		kubeClient,
+		resyncPeriod.Duration,
+		utils.NewNamespaceIndexer(),
+	)
+
+	// EndpointSlice informer with custom indexers for NEG controller
+	informers.EndpointSlice = discoveryinformer.NewEndpointSliceInformer(
+		kubeClient,
+		metav1.NamespaceAll,
+		resyncPeriod.Duration,
+		cache.Indexers{
+			cache.NamespaceIndex: cache.MetaNamespaceIndexFunc,
+			endpointslices.EndpointSlicesByServiceIndex: endpointslices.EndpointSlicesByServiceFunc,
+		},
+	)
+
+	// Create CRD informers if clients are available
+	if svcNegClient != nil {
+		informers.SvcNeg = informersvcneg.NewServiceNetworkEndpointGroupInformer(
+			svcNegClient,
+			metav1.NamespaceAll,
+			resyncPeriod.Duration,
+			utils.NewNamespaceIndexer(),
+		)
+	}
+
+	if networkClient != nil {
+		informers.Network = informernetwork.NewNetworkInformer(
+			networkClient,
+			resyncPeriod.Duration,
+			utils.NewNamespaceIndexer(),
+		)
+
+		informers.GkeNetworkParams = informernetwork.NewGKENetworkParamSetInformer(
+			networkClient,
+			resyncPeriod.Duration,
+			utils.NewNamespaceIndexer(),
+		)
+	}
+
+	if nodeTopologyClient != nil {
+		informers.NodeTopology = informernodetopology.NewNodeTopologyInformer(
+			nodeTopologyClient,
+			resyncPeriod.Duration,
+			utils.NewNamespaceIndexer(),
+		)
+	}
+
+	return informers
+}
+
+// Start starts all informers and waits for their caches to sync.
+// It is idempotent: repeated calls return nil once informers have started.
+// If the provided stop channel is already closed, it returns an error
+// without starting any informers or marking the set as started.
+func (i *InformerSet) Start(stopCh <-chan struct{}, logger klog.Logger) error {
+	if i.started {
+		logger.V(3).Info("InformerSet already started; skipping")
+		return nil
+	}
+
+	// If the stop channel is already closed, return an error immediately
+	// instead of launching informers that would stop right away.
+ select { + case <-stopCh: + err := fmt.Errorf("stop channel closed before start") + logger.Error(err, "Cannot start informers") + return err + default: + } + + // Start all core informers + startInformer(i.Ingress, stopCh) + startInformer(i.Service, stopCh) + startInformer(i.Pod, stopCh) + startInformer(i.Node, stopCh) + startInformer(i.EndpointSlice, stopCh) + + // Start optional informers + startInformer(i.SvcNeg, stopCh) + startInformer(i.Network, stopCh) + startInformer(i.GkeNetworkParams, stopCh) + startInformer(i.NodeTopology, stopCh) + + i.started = true + + // Wait for initial sync + logger.V(2).Info("Waiting for informer caches to sync") + if !cache.WaitForCacheSync(stopCh, i.CombinedHasSynced()) { + err := fmt.Errorf("failed to sync informer caches") + logger.Error(err, "Failed to sync informer caches") + return err + } + + logger.V(2).Info("Informer caches synced successfully") + return nil +} + +// FilterByProviderConfig creates a new InformerSet with all informers wrapped in a ProviderConfig filter. +// This is used for provider-config-specific controllers. +func (i *InformerSet) FilterByProviderConfig(providerConfigName string) *InformerSet { + filteredInformers := &InformerSet{ + started: i.started, + } + + // Wrap core informers + if i.Ingress != nil { + filteredInformers.Ingress = newProviderConfigFilteredInformer(i.Ingress, providerConfigName) + } + if i.Service != nil { + filteredInformers.Service = newProviderConfigFilteredInformer(i.Service, providerConfigName) + } + if i.Pod != nil { + filteredInformers.Pod = newProviderConfigFilteredInformer(i.Pod, providerConfigName) + } + if i.Node != nil { + filteredInformers.Node = newProviderConfigFilteredInformer(i.Node, providerConfigName) + } + if i.EndpointSlice != nil { + filteredInformers.EndpointSlice = newProviderConfigFilteredInformer(i.EndpointSlice, providerConfigName) + } + + // Wrap optional informers + if i.SvcNeg != nil { + filteredInformers.SvcNeg = newProviderConfigFilteredInformer(i.SvcNeg, providerConfigName) + } + if i.Network != nil { + filteredInformers.Network = newProviderConfigFilteredInformer(i.Network, providerConfigName) + } + if i.GkeNetworkParams != nil { + filteredInformers.GkeNetworkParams = newProviderConfigFilteredInformer(i.GkeNetworkParams, providerConfigName) + } + if i.NodeTopology != nil { + filteredInformers.NodeTopology = newProviderConfigFilteredInformer(i.NodeTopology, providerConfigName) + } + + return filteredInformers +} + +// newProviderConfigFilteredInformer wraps an informer with a provider config filter. +// The filtered informer shares the same underlying cache and indexers. +func newProviderConfigFilteredInformer(informer cache.SharedIndexInformer, providerConfigName string) cache.SharedIndexInformer { + return filteredinformer.NewProviderConfigFilteredInformer(informer, providerConfigName) +} + +// CombinedHasSynced returns a function that checks if all informers have synced. +func (i *InformerSet) CombinedHasSynced() func() bool { + syncFuncs := i.hasSyncedFuncs() + return func() bool { + for _, hasSynced := range syncFuncs { + if !hasSynced() { + return false + } + } + return true + } +} + +// hasSyncedFuncs returns a list of HasSynced functions for all non-nil informers. 
+func (i *InformerSet) hasSyncedFuncs() []func() bool { + var funcs []func() bool + + // Core informers (always present) + if i.Ingress != nil { + funcs = append(funcs, i.Ingress.HasSynced) + } + if i.Service != nil { + funcs = append(funcs, i.Service.HasSynced) + } + if i.Pod != nil { + funcs = append(funcs, i.Pod.HasSynced) + } + if i.Node != nil { + funcs = append(funcs, i.Node.HasSynced) + } + if i.EndpointSlice != nil { + funcs = append(funcs, i.EndpointSlice.HasSynced) + } + + // Optional informers + if i.SvcNeg != nil { + funcs = append(funcs, i.SvcNeg.HasSynced) + } + if i.Network != nil { + funcs = append(funcs, i.Network.HasSynced) + } + if i.GkeNetworkParams != nil { + funcs = append(funcs, i.GkeNetworkParams.HasSynced) + } + if i.NodeTopology != nil { + funcs = append(funcs, i.NodeTopology.HasSynced) + } + + return funcs +} + +// startInformer starts the informer if it is non-nil. +func startInformer(inf cache.SharedIndexInformer, stopCh <-chan struct{}) { + if inf == nil { + return + } + go inf.Run(stopCh) +} diff --git a/pkg/multiproject/informerset/informerset_test.go b/pkg/multiproject/informerset/informerset_test.go new file mode 100644 index 0000000000..b08f3123a9 --- /dev/null +++ b/pkg/multiproject/informerset/informerset_test.go @@ -0,0 +1,260 @@ +package informerset + +import ( + "testing" + + networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" + networkfake "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned/fake" + nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" + nodetopologyfake "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned/fake" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/tools/cache" + svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" + svcnegfake "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned/fake" + "k8s.io/ingress-gce/pkg/utils/endpointslices" + ktesting "k8s.io/klog/v2/ktesting" +) + +// TestNewInformerSet_OptionalClients verifies that optional informers are created +// only when their corresponding CRD clients are provided. 
+func TestNewInformerSet_OptionalClients(t *testing.T) { + t.Parallel() + + testCases := []struct { + name string + withSvcNeg bool + withNetwork bool + withNodeTopology bool + wantSvcNeg bool + wantNetwork bool + wantGKEParams bool + wantNodeTopology bool + }{ + { + name: "no-optional-clients", + }, + { + name: "all-optional-clients", + withSvcNeg: true, + withNetwork: true, + withNodeTopology: true, + wantSvcNeg: true, + wantNetwork: true, + wantGKEParams: true, + wantNodeTopology: true, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + + var svcNegClient svcnegclient.Interface + if tc.withSvcNeg { + svcNegClient = svcnegfake.NewSimpleClientset() + } + + var netClient networkclient.Interface + if tc.withNetwork { + netClient = networkfake.NewSimpleClientset() + } + + var topoClient nodetopologyclient.Interface + if tc.withNodeTopology { + topoClient = nodetopologyfake.NewSimpleClientset() + } + + inf := NewInformerSet(kubeClient, svcNegClient, netClient, topoClient, metav1.Duration{Duration: 0}) + if inf == nil { + t.Fatalf("NewInformerSet returned nil") + } + + if got := inf.SvcNeg != nil; got != tc.wantSvcNeg { + t.Errorf("SvcNeg: got %t, want %t", got, tc.wantSvcNeg) + } + if got := inf.Network != nil; got != tc.wantNetwork { + t.Errorf("Network: got %t, want %t", got, tc.wantNetwork) + } + if got := inf.GkeNetworkParams != nil; got != tc.wantGKEParams { + t.Errorf("GkeNetworkParams: got %t, want %t", got, tc.wantGKEParams) + } + if got := inf.NodeTopology != nil; got != tc.wantNodeTopology { + t.Errorf("NodeTopology: got %t, want %t", got, tc.wantNodeTopology) + } + }) + } +} + +// TestEndpointSlice_HasRequiredIndexers verifies that the EndpointSlice informer is initialized +// and includes both the namespace indexer and the NEG-specific service indexer. +func TestEndpointSlice_HasRequiredIndexers(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + inf := NewInformerSet(kubeClient, nil, nil, nil, metav1.Duration{Duration: 0}) + if inf == nil { + t.Fatalf("NewInformerSet returned nil") + } + + if inf.EndpointSlice == nil { + t.Fatalf("EndpointSlice informer must be initialized") + } + + indexers := inf.EndpointSlice.GetIndexer().GetIndexers() + if _, ok := indexers[cache.NamespaceIndex]; !ok { + t.Errorf("EndpointSlice missing NamespaceIndex indexer") + } + if _, ok := indexers[endpointslices.EndpointSlicesByServiceIndex]; !ok { + t.Errorf("EndpointSlice missing EndpointSlicesByServiceIndex indexer") + } +} + +// TestStart_Semantics verifies Start() idempotency, behavior with a closed stop channel, +// and that CombinedHasSynced reports true after caches sync. 
+func TestStart_Semantics(t *testing.T) { + t.Parallel() + + logger, _ := ktesting.NewTestContext(t) + + t.Run("idempotent", func(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + inf := NewInformerSet(kubeClient, nil, nil, nil, metav1.Duration{Duration: 0}) + + stop := make(chan struct{}) + defer close(stop) + + err := inf.Start(stop, logger) + if err != nil { + t.Fatalf("Start() returned error: %v", err) + } + err = inf.Start(stop, logger) + if err != nil { + t.Fatalf("Second Start() returned error: %v", err) + } + }) + + t.Run("closed-stop-channel", func(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + inf := NewInformerSet(kubeClient, nil, nil, nil, metav1.Duration{Duration: 0}) + + stop := make(chan struct{}) + close(stop) + + err := inf.Start(stop, logger) + if err == nil { + t.Fatalf("expected error when stop channel is closed, got nil") + } + if inf.started { + t.Fatalf("expected started=false when Start is called with a closed stop channel") + } + }) + + t.Run("combined-has-synced", func(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + inf := NewInformerSet(kubeClient, nil, nil, nil, metav1.Duration{Duration: 0}) + + stop := make(chan struct{}) + defer close(stop) + + err := inf.Start(stop, logger) + if err != nil { + t.Fatalf("Start() returned error: %v", err) + } + if ok := cache.WaitForCacheSync(stop, inf.CombinedHasSynced()); !ok { + t.Fatalf("timed out waiting for CombinedHasSynced to be true") + } + }) +} + +// TestFilterByProviderConfig_WrappingAndState verifies that FilterByProviderConfig wraps all +// non-nil informers, preserves nil optional informers, and propagates the 'started' state +// from the base set into the filtered view. +func TestFilterByProviderConfig_WrappingAndState(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + svcClient := svcnegfake.NewSimpleClientset() + + inf := NewInformerSet(kubeClient, svcClient, nil, nil, metav1.Duration{Duration: 0}) + + // Before starting, filtered should mirror started=false. + filteredBefore := inf.FilterByProviderConfig("pc-1") + if filteredBefore == nil { + t.Fatalf("FilterByProviderConfig returned nil") + } + if filteredBefore.started { + t.Fatalf("expected filtered InformerSet to have started=false before base Start") + } + if filteredBefore.Ingress == nil || filteredBefore.Service == nil || filteredBefore.Pod == nil || filteredBefore.Node == nil || filteredBefore.EndpointSlice == nil { + t.Fatalf("expected core filtered informers to be non-nil before base Start") + } + if filteredBefore.SvcNeg == nil { + t.Fatalf("expected SvcNeg filtered informer to be non-nil when present in base") + } + if filteredBefore.Network != nil || filteredBefore.GkeNetworkParams != nil || filteredBefore.NodeTopology != nil { + t.Fatalf("expected absent optional informers to remain nil in filtered view") + } + + // After starting, filtered should mirror started=true. 
+ logger, _ := ktesting.NewTestContext(t) + stop := make(chan struct{}) + defer close(stop) + + err := inf.Start(stop, logger) + if err != nil { + t.Fatalf("Start() returned error: %v", err) + } + + filteredAfter := inf.FilterByProviderConfig("pc-1") + if filteredAfter == nil { + t.Fatalf("FilterByProviderConfig returned nil (after start)") + } + if !filteredAfter.started { + t.Fatalf("expected filtered InformerSet to have started=true after base Start") + } +} + +// TestCombinedHasSynced_AllNil verifies that CombinedHasSynced returns true when +// all informers in the set are nil (no caches to sync). +func TestCombinedHasSynced_AllNil(t *testing.T) { + t.Parallel() + + inf := &InformerSet{} + syncFunc := inf.CombinedHasSynced() + if !syncFunc() { + t.Errorf("CombinedHasSynced should return true when all informers are nil") + } +} + +// TestFilterByProviderConfig_PreservesIndexers verifies that filtering preserves the +// indexers from the original EndpointSlice informer (critical for NEG controller behavior). +func TestFilterByProviderConfig_PreservesIndexers(t *testing.T) { + t.Parallel() + + kubeClient := k8sfake.NewSimpleClientset() + inf := NewInformerSet(kubeClient, nil, nil, nil, metav1.Duration{Duration: 0}) + + original := inf.EndpointSlice.GetIndexer().GetIndexers() + filtered := inf.FilterByProviderConfig("pc-1").EndpointSlice.GetIndexer().GetIndexers() + + if _, ok := filtered[cache.NamespaceIndex]; !ok { + t.Errorf("filtered EndpointSlice missing NamespaceIndex") + } + if _, ok := filtered[endpointslices.EndpointSlicesByServiceIndex]; !ok { + t.Errorf("filtered EndpointSlice missing EndpointSlicesByServiceIndex") + } + + if len(filtered) != len(original) { + t.Errorf("filtered indexers count mismatch: got=%d want=%d", len(filtered), len(original)) + } +} diff --git a/pkg/multiproject/manager/manager.go b/pkg/multiproject/manager/manager.go index 1b44fe8b6d..e0f78ad7dc 100644 --- a/pkg/multiproject/manager/manager.go +++ b/pkg/multiproject/manager/manager.go @@ -5,21 +5,18 @@ import ( "sync" networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" - informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions" nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" - informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions" "k8s.io/apimachinery/pkg/types" - informers "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" "k8s.io/ingress-gce/pkg/multiproject/finalizer" "k8s.io/ingress-gce/pkg/multiproject/gce" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" "k8s.io/ingress-gce/pkg/multiproject/neg" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" "k8s.io/ingress-gce/pkg/neg/syncers/labels" providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions" "k8s.io/ingress-gce/pkg/utils/namer" "k8s.io/klog/v2" ) @@ -30,22 +27,20 @@ type ProviderConfigControllersManager struct { logger klog.Logger providerConfigClient providerconfigclient.Interface - informersFactory informers.SharedInformerFactory - svcNegFactory informersvcneg.SharedInformerFactory - networkFactory 
informernetwork.SharedInformerFactory - nodeTopologyFactory informernodetopology.SharedInformerFactory - kubeClient kubernetes.Interface - svcNegClient svcnegclient.Interface - eventRecorderClient kubernetes.Interface - networkClient networkclient.Interface - nodetopologyClient nodetopologyclient.Interface - kubeSystemUID types.UID - clusterNamer *namer.Namer - l4Namer *namer.L4Namer - lpConfig labels.PodLabelPropagationConfig - gceCreator gce.GCECreator - globalStopCh <-chan struct{} - syncerMetrics *syncMetrics.SyncerMetrics + // Base informers shared across all ProviderConfigs + informers *multiprojectinformers.InformerSet + kubeClient kubernetes.Interface + svcNegClient svcnegclient.Interface + eventRecorderClient kubernetes.Interface + networkClient networkclient.Interface + nodetopologyClient nodetopologyclient.Interface + kubeSystemUID types.UID + clusterNamer *namer.Namer + l4Namer *namer.L4Namer + lpConfig labels.PodLabelPropagationConfig + gceCreator gce.GCECreator + globalStopCh <-chan struct{} + syncerMetrics *syncMetrics.SyncerMetrics } type ControllerSet struct { @@ -54,12 +49,11 @@ type ControllerSet struct { func NewProviderConfigControllerManager( kubeClient kubernetes.Interface, - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, + informers *multiprojectinformers.InformerSet, providerConfigClient providerconfigclient.Interface, svcNegClient svcnegclient.Interface, + networkClient networkclient.Interface, + nodetopologyClient nodetopologyclient.Interface, eventRecorderClient kubernetes.Interface, kubeSystemUID types.UID, clusterNamer *namer.Namer, @@ -74,13 +68,12 @@ func NewProviderConfigControllerManager( controllers: make(map[string]*ControllerSet), logger: logger, providerConfigClient: providerConfigClient, - informersFactory: informersFactory, - svcNegFactory: svcNegFactory, - networkFactory: networkFactory, - nodeTopologyFactory: nodeTopologyFactory, + informers: informers, kubeClient: kubeClient, svcNegClient: svcNegClient, eventRecorderClient: eventRecorderClient, + networkClient: networkClient, + nodetopologyClient: nodetopologyClient, kubeSystemUID: kubeSystemUID, clusterNamer: clusterNamer, l4Namer: l4Namer, @@ -120,10 +113,7 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig( } negControllerStopCh, err := neg.StartNEGController( - pccm.informersFactory, - pccm.svcNegFactory, - pccm.networkFactory, - pccm.nodeTopologyFactory, + pccm.informers, pccm.kubeClient, pccm.eventRecorderClient, pccm.svcNegClient, diff --git a/pkg/multiproject/neg/neg.go b/pkg/multiproject/neg/neg.go index 8423745374..cc064001a0 100644 --- a/pkg/multiproject/neg/neg.go +++ b/pkg/multiproject/neg/neg.go @@ -4,17 +4,14 @@ import ( "fmt" networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" - informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions" nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" - informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" "k8s.io/cloud-provider-gcp/providers/gce" providerconfig 
"k8s.io/ingress-gce/pkg/apis/providerconfig/v1" "k8s.io/ingress-gce/pkg/flags" - "k8s.io/ingress-gce/pkg/multiproject/filteredinformer" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" "k8s.io/ingress-gce/pkg/neg" "k8s.io/ingress-gce/pkg/neg/metrics" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" @@ -22,9 +19,7 @@ import ( negtypes "k8s.io/ingress-gce/pkg/neg/types" "k8s.io/ingress-gce/pkg/network" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions" "k8s.io/ingress-gce/pkg/utils" - "k8s.io/ingress-gce/pkg/utils/endpointslices" "k8s.io/ingress-gce/pkg/utils/namer" "k8s.io/ingress-gce/pkg/utils/zonegetter" "k8s.io/klog/v2" @@ -42,13 +37,10 @@ import ( // - joinedStopCh: Internal channel that closes when EITHER globalStopCh OR providerConfigStopCh // closes. Used for PC-specific resources that should stop in either case. // -// IMPORTANT: Base informers from factories use globalStopCh to remain alive across PC changes. -// Only ProviderConfig-specific controllers and resources should use joinedStopCh. +// IMPORTANT: Base informers are already running with globalStopCh. We wrap them with +// ProviderConfig filters that use globalStopCh to remain alive across PC changes. func StartNEGController( - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, + informers *multiprojectinformers.InformerSet, kubeClient kubernetes.Interface, eventRecorderClient kubernetes.Interface, svcNegClient svcnegclient.Interface, @@ -85,12 +77,11 @@ func StartNEGController( } }() - informers, hasSynced, err := initializeInformers(informersFactory, svcNegFactory, networkFactory, nodeTopologyFactory, providerConfigName, logger, globalStopCh) - if err != nil { - return nil, err - } + // Wrap informers with provider config filter + filteredInformers := informers.FilterByProviderConfig(providerConfigName) + hasSynced := filteredInformers.CombinedHasSynced() - zoneGetter, err := zonegetter.NewZoneGetter(informers.nodeInformer, informers.providerConfigFilteredNodeTopologyInformer, cloud.SubnetworkURL()) + zoneGetter, err := zonegetter.NewZoneGetter(filteredInformers.Node, filteredInformers.NodeTopology, cloud.SubnetworkURL()) if err != nil { logger.Error(err, "failed to initialize zone getter") return nil, fmt.Errorf("failed to initialize zonegetter: %v", err) @@ -101,14 +92,15 @@ func StartNEGController( svcNegClient, eventRecorderClient, kubeSystemUID, - informers.ingressInformer, - informers.serviceInformer, - informers.podInformer, - informers.nodeInformer, - informers.endpointSliceInformer, - informers.providerConfigFilteredSvcNegInformer, - informers.providerConfigFilteredNetworkInformer, - informers.providerConfigFilteredGkeNetworkParamsInformer, + filteredInformers.Ingress, + filteredInformers.Service, + filteredInformers.Pod, + filteredInformers.Node, + filteredInformers.EndpointSlice, + filteredInformers.SvcNeg, + filteredInformers.Network, + filteredInformers.GkeNetworkParams, + filteredInformers.NodeTopology, hasSynced, cloud, zoneGetter, @@ -129,132 +121,6 @@ func StartNEGController( return providerConfigStopCh, nil } -type negInformers struct { - ingressInformer cache.SharedIndexInformer - serviceInformer cache.SharedIndexInformer - podInformer cache.SharedIndexInformer - nodeInformer 
cache.SharedIndexInformer - endpointSliceInformer cache.SharedIndexInformer - providerConfigFilteredSvcNegInformer cache.SharedIndexInformer - providerConfigFilteredNetworkInformer cache.SharedIndexInformer - providerConfigFilteredGkeNetworkParamsInformer cache.SharedIndexInformer - providerConfigFilteredNodeTopologyInformer cache.SharedIndexInformer -} - -// initializeInformers wraps the base SharedIndexInformers in a providerConfig filter -// and runs them. -func initializeInformers( - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, - providerConfigName string, - logger klog.Logger, - globalStopCh <-chan struct{}, -) (*negInformers, func() bool, error) { - ingressInformer := filteredinformer.NewProviderConfigFilteredInformer(informersFactory.Networking().V1().Ingresses().Informer(), providerConfigName) - serviceInformer := filteredinformer.NewProviderConfigFilteredInformer(informersFactory.Core().V1().Services().Informer(), providerConfigName) - podInformer := filteredinformer.NewProviderConfigFilteredInformer(informersFactory.Core().V1().Pods().Informer(), providerConfigName) - nodeInformer := filteredinformer.NewProviderConfigFilteredInformer(informersFactory.Core().V1().Nodes().Informer(), providerConfigName) - - endpointSliceInformer := filteredinformer.NewProviderConfigFilteredInformer( - informersFactory.Discovery().V1().EndpointSlices().Informer(), - providerConfigName, - ) - // Even though we created separate "provider-config-filtered" informer, informers from the same - // factory will share indexers. That's why we need to add the indexer only if it's not present. - // This basically means we will only add indexer to the first provider config's informer. 
- err := addIndexerIfNotPresent(endpointSliceInformer.GetIndexer(), endpointslices.EndpointSlicesByServiceIndex, endpointslices.EndpointSlicesByServiceFunc) - if err != nil { - return nil, nil, fmt.Errorf("failed to add indexers to endpointSliceInformer: %v", err) - } - - var providerConfigFilteredSvcNegInformer cache.SharedIndexInformer - if svcNegFactory != nil { - svcNegInformer := svcNegFactory.Networking().V1beta1().ServiceNetworkEndpointGroups().Informer() - providerConfigFilteredSvcNegInformer = filteredinformer.NewProviderConfigFilteredInformer(svcNegInformer, providerConfigName) - } - - var providerConfigFilteredNetworkInformer cache.SharedIndexInformer - var providerConfigFilteredGkeNetworkParamsInformer cache.SharedIndexInformer - if networkFactory != nil { - networkInformer := networkFactory.Networking().V1().Networks().Informer() - providerConfigFilteredNetworkInformer = filteredinformer.NewProviderConfigFilteredInformer(networkInformer, providerConfigName) - - gkeNetworkParamsInformer := networkFactory.Networking().V1().GKENetworkParamSets().Informer() - providerConfigFilteredGkeNetworkParamsInformer = filteredinformer.NewProviderConfigFilteredInformer(gkeNetworkParamsInformer, providerConfigName) - } - - var providerConfigFilteredNodeTopologyInformer cache.SharedIndexInformer - if nodeTopologyFactory != nil { - nodeTopologyInformer := nodeTopologyFactory.Networking().V1().NodeTopologies().Informer() - providerConfigFilteredNodeTopologyInformer = filteredinformer.NewProviderConfigFilteredInformer(nodeTopologyInformer, providerConfigName) - } - - // Start them with the joinedStopCh so they properly stop - hasSyncedList := []func() bool{ - ingressInformer.HasSynced, - serviceInformer.HasSynced, - podInformer.HasSynced, - nodeInformer.HasSynced, - endpointSliceInformer.HasSynced, - } - go ingressInformer.Run(globalStopCh) - go serviceInformer.Run(globalStopCh) - go podInformer.Run(globalStopCh) - go nodeInformer.Run(globalStopCh) - go endpointSliceInformer.Run(globalStopCh) - if providerConfigFilteredSvcNegInformer != nil { - go providerConfigFilteredSvcNegInformer.Run(globalStopCh) - hasSyncedList = append(hasSyncedList, providerConfigFilteredSvcNegInformer.HasSynced) - } - if providerConfigFilteredNetworkInformer != nil { - go providerConfigFilteredNetworkInformer.Run(globalStopCh) - hasSyncedList = append(hasSyncedList, providerConfigFilteredNetworkInformer.HasSynced) - } - if providerConfigFilteredGkeNetworkParamsInformer != nil { - go providerConfigFilteredGkeNetworkParamsInformer.Run(globalStopCh) - hasSyncedList = append(hasSyncedList, providerConfigFilteredGkeNetworkParamsInformer.HasSynced) - } - if providerConfigFilteredNodeTopologyInformer != nil { - go providerConfigFilteredNodeTopologyInformer.Run(globalStopCh) - hasSyncedList = append(hasSyncedList, providerConfigFilteredNodeTopologyInformer.HasSynced) - } - - logger.V(2).Info("NEG informers initialized", "providerConfigName", providerConfigName) - informers := &negInformers{ - ingressInformer: ingressInformer, - serviceInformer: serviceInformer, - podInformer: podInformer, - nodeInformer: nodeInformer, - endpointSliceInformer: endpointSliceInformer, - providerConfigFilteredSvcNegInformer: providerConfigFilteredSvcNegInformer, - providerConfigFilteredNetworkInformer: providerConfigFilteredNetworkInformer, - providerConfigFilteredGkeNetworkParamsInformer: providerConfigFilteredGkeNetworkParamsInformer, - providerConfigFilteredNodeTopologyInformer: providerConfigFilteredNodeTopologyInformer, - } - hasSynced := func() 
bool { - for _, hasSynced := range hasSyncedList { - if !hasSynced() { - return false - } - } - return true - } - - return informers, hasSynced, nil -} - -// addIndexerIfNotPresent adds an indexer to the indexer if it's not present. -// This is needed because informers from the same factory will share indexers. -func addIndexerIfNotPresent(indexer cache.Indexer, indexName string, indexFunc cache.IndexFunc) error { - indexers := indexer.GetIndexers() - if _, ok := indexers[indexName]; ok { - return nil - } - return indexer.AddIndexers(cache.Indexers{indexName: indexFunc}) -} - func createNEGController( kubeClient kubernetes.Interface, svcNegClient svcnegclient.Interface, @@ -268,6 +134,7 @@ func createNEGController( svcNegInformer cache.SharedIndexInformer, networkInformer cache.SharedIndexInformer, gkeNetworkParamsInformer cache.SharedIndexInformer, + nodeTopologyInformer cache.SharedIndexInformer, hasSynced func() bool, cloud *gce.Cloud, zoneGetter *zonegetter.ZoneGetter, @@ -287,8 +154,8 @@ func createNEGController( } noDefaultBackendServicePort := utils.ServicePort{} - var noNodeTopologyInformer cache.SharedIndexInformer negMetrics := metrics.NewNegMetrics() + negController, err := neg.NewController( kubeClient, svcNegClient, @@ -302,7 +169,7 @@ func createNEGController( svcNegInformer, networkInformer, gkeNetworkParamsInformer, - noNodeTopologyInformer, + nodeTopologyInformer, hasSynced, l4Namer, noDefaultBackendServicePort, diff --git a/pkg/multiproject/start/start.go b/pkg/multiproject/start/start.go index 8c237beff2..1d911bea77 100644 --- a/pkg/multiproject/start/start.go +++ b/pkg/multiproject/start/start.go @@ -7,17 +7,19 @@ import ( "math/rand" "os" - informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions" - informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions" + networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" + nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/ingress-gce/pkg/flags" _ "k8s.io/ingress-gce/pkg/klog" pccontroller "k8s.io/ingress-gce/pkg/multiproject/controller" "k8s.io/ingress-gce/pkg/multiproject/gce" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" "k8s.io/ingress-gce/pkg/multiproject/manager" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" "k8s.io/ingress-gce/pkg/neg/syncers/labels" @@ -26,7 +28,6 @@ import ( "k8s.io/ingress-gce/pkg/recorders" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions" "k8s.io/ingress-gce/pkg/utils/namer" "k8s.io/klog/v2" ) @@ -41,13 +42,11 @@ func StartWithLeaderElection( logger klog.Logger, kubeClient kubernetes.Interface, svcNegClient svcnegclient.Interface, + networkClient networkclient.Interface, + nodeTopologyClient nodetopologyclient.Interface, kubeSystemUID types.UID, eventRecorderKubeClient kubernetes.Interface, providerConfigClient providerconfigclient.Interface, - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory 
informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, gceCreator gce.GCECreator, rootNamer *namer.Namer, stopCh <-chan struct{}, @@ -57,7 +56,7 @@ func StartWithLeaderElection( recordersManager := recorders.NewManager(eventRecorderKubeClient, logger) - leConfig, err := makeLeaderElectionConfig(leaderElectKubeClient, hostname, recordersManager, logger, kubeClient, svcNegClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, informersFactory, svcNegFactory, networkFactory, nodeTopologyFactory, gceCreator, rootNamer, syncerMetrics) + leConfig, err := makeLeaderElectionConfig(leaderElectKubeClient, hostname, recordersManager, logger, kubeClient, svcNegClient, networkClient, nodeTopologyClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, gceCreator, rootNamer, syncerMetrics) if err != nil { return err } @@ -82,13 +81,11 @@ func makeLeaderElectionConfig( logger klog.Logger, kubeClient kubernetes.Interface, svcNegClient svcnegclient.Interface, + networkClient networkclient.Interface, + nodeTopologyClient nodetopologyclient.Interface, kubeSystemUID types.UID, eventRecorderKubeClient kubernetes.Interface, providerConfigClient providerconfigclient.Interface, - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, gceCreator gce.GCECreator, rootNamer *namer.Namer, syncerMetrics *syncMetrics.SyncerMetrics, @@ -120,7 +117,7 @@ func makeLeaderElectionConfig( Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(ctx context.Context) { logger.Info("Became leader, starting multi-project controller") - Start(logger, kubeClient, svcNegClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, informersFactory, svcNegFactory, networkFactory, nodeTopologyFactory, gceCreator, rootNamer, ctx.Done(), syncerMetrics) + Start(logger, kubeClient, svcNegClient, networkClient, nodeTopologyClient, kubeSystemUID, eventRecorderKubeClient, providerConfigClient, gceCreator, rootNamer, ctx.Done(), syncerMetrics) }, OnStoppedLeading: func() { logger.Info("Stop running multi-project leader election") @@ -132,18 +129,16 @@ func makeLeaderElectionConfig( } // Start starts the ProviderConfig controller. -// It builds required context and starts the controller. +// It creates SharedIndexInformers directly and starts the controller. 
func Start( logger klog.Logger, kubeClient kubernetes.Interface, svcNegClient svcnegclient.Interface, + networkClient networkclient.Interface, + nodeTopologyClient nodetopologyclient.Interface, kubeSystemUID types.UID, eventRecorderKubeClient kubernetes.Interface, providerConfigClient providerconfigclient.Interface, - informersFactory informers.SharedInformerFactory, - svcNegFactory informersvcneg.SharedInformerFactory, - networkFactory informernetwork.SharedInformerFactory, - nodeTopologyFactory informernodetopology.SharedInformerFactory, gceCreator gce.GCECreator, rootNamer *namer.Namer, stopCh <-chan struct{}, @@ -158,18 +153,29 @@ func Start( } } - providerConfigInformer := providerconfiginformers.NewSharedInformerFactory(providerConfigClient, flags.F.ResyncPeriod).Cloud().V1().ProviderConfigs().Informer() - logger.V(2).Info("Starting ProviderConfig informer") - go providerConfigInformer.Run(stopCh) + // Create and start all informers + informers := multiprojectinformers.NewInformerSet( + kubeClient, + svcNegClient, + networkClient, + nodeTopologyClient, + metav1.Duration{Duration: flags.F.ResyncPeriod}, + ) + + // Start all informers + err := informers.Start(stopCh, logger) + if err != nil { + logger.Error(err, "Failed to start informers") + return + } manager := manager.NewProviderConfigControllerManager( kubeClient, - informersFactory, - svcNegFactory, - networkFactory, - nodeTopologyFactory, + informers, providerConfigClient, svcNegClient, + networkClient, + nodeTopologyClient, eventRecorderKubeClient, kubeSystemUID, rootNamer, @@ -182,6 +188,20 @@ func Start( ) logger.V(1).Info("Initialized ProviderConfig controller manager") + // Create ProviderConfig informer + providerConfigInformer := providerconfiginformers.NewSharedInformerFactory(providerConfigClient, flags.F.ResyncPeriod).Cloud().V1().ProviderConfigs().Informer() + logger.V(2).Info("Starting ProviderConfig informer") + go providerConfigInformer.Run(stopCh) + + // Wait for provider config informer to sync + logger.Info("Waiting for provider config informer to sync") + if !cache.WaitForCacheSync(stopCh, providerConfigInformer.HasSynced) { + err := fmt.Errorf("failed to sync provider config informer") + logger.Error(err, "Failed to sync provider config informer") + return + } + logger.Info("Provider config informer synced successfully") + pcController := pccontroller.NewProviderConfigController(manager, providerConfigInformer, stopCh, logger) logger.V(1).Info("Running ProviderConfig controller") pcController.Run() diff --git a/pkg/multiproject/start/start_test.go b/pkg/multiproject/start/start_test.go index 0a92ac0325..c59dc44702 100644 --- a/pkg/multiproject/start/start_test.go +++ b/pkg/multiproject/start/start_test.go @@ -9,9 +9,7 @@ import ( "time" networkfake "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned/fake" - informernetwork "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions" nodetopologyfake "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned/fake" - informernodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions" corev1 "k8s.io/api/core/v1" discovery "k8s.io/api/discovery/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -35,7 +33,6 @@ import ( pcclientfake "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake" svcnegfake "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned/fake" - informersvcneg 
"k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions" "k8s.io/ingress-gce/pkg/utils/namer" klog "k8s.io/klog/v2" ) @@ -214,9 +211,6 @@ func TestStartProviderConfigIntegration(t *testing.T) { logger := klog.TODO() gceCreator := multiprojectgce.NewGCEFake() informersFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, flags.F.ResyncPeriod) - svcNegFactory := informersvcneg.NewSharedInformerFactoryWithOptions(svcNegClient, flags.F.ResyncPeriod) - networkFactory := informernetwork.NewSharedInformerFactoryWithOptions(networkClient, flags.F.ResyncPeriod) - nodeTopologyFactory := informernodetopology.NewSharedInformerFactoryWithOptions(nodeTopologyClient, flags.F.ResyncPeriod) rootNamer := namer.NewNamer("test-clusteruid", "", logger) kubeSystemUID := types.UID("test-kube-system-uid") @@ -234,13 +228,11 @@ func TestStartProviderConfigIntegration(t *testing.T) { logger, kubeClient, svcNegClient, + networkClient, + nodeTopologyClient, kubeSystemUID, kubeClient, // eventRecorderKubeClient can be the same as main client pcClient, - informersFactory, - svcNegFactory, - networkFactory, - nodeTopologyFactory, gceCreator, rootNamer, stopCh, @@ -287,14 +279,14 @@ func TestStartProviderConfigIntegration(t *testing.T) { } t.Logf("Created Service %s/%s", createdSvc.Namespace, createdSvc.Name) - // Populate endpoint slices in the fake informer. + // Populate endpoint slices in the fake client so InformerSet picks them up. addressPrefix := "10.100" if svc.Labels[flags.F.ProviderConfigNameLabelKey] == providerConfigName2 { addressPrefix = "20.100" } populateFakeEndpointSlices( t, - informersFactory.Discovery().V1().EndpointSlices().Informer(), + kubeClient, svc.Name, svc.Labels[flags.F.ProviderConfigNameLabelKey], addressPrefix, @@ -332,9 +324,6 @@ func TestSharedInformers_PC1Stops_PC2AndPC3KeepWorking(t *testing.T) { testutil.EmulateProviderConfigLabelingWebhook(&svcNegClient.Fake, "servicenetworkendpointgroups") informersFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, flags.F.ResyncPeriod) - svcNegFactory := informersvcneg.NewSharedInformerFactoryWithOptions(svcNegClient, flags.F.ResyncPeriod) - networkFactory := informernetwork.NewSharedInformerFactoryWithOptions(networkClient, flags.F.ResyncPeriod) - nodeTopoFactory := informernodetopology.NewSharedInformerFactoryWithOptions(nodeTopoClient, flags.F.ResyncPeriod) logger := klog.TODO() gceCreator := multiprojectgce.NewGCEFake() @@ -348,8 +337,8 @@ func TestSharedInformers_PC1Stops_PC2AndPC3KeepWorking(t *testing.T) { // Start multiproject manager (this starts shared factories once with globalStop). 
go Start( - logger, kubeClient, svcNegClient, kubeSystemUID, kubeClient, - pcClient, informersFactory, svcNegFactory, networkFactory, nodeTopoFactory, + logger, kubeClient, svcNegClient, networkClient, nodeTopoClient, + kubeSystemUID, kubeClient, pcClient, gceCreator, rootNamer, globalStop, syncMetrics.FakeSyncerMetrics(), ) @@ -369,7 +358,7 @@ func TestSharedInformers_PC1Stops_PC2AndPC3KeepWorking(t *testing.T) { markPCDeletingAndWait(ctx, t, pcClient, pc1) // --- pc-2 after pc-1 stops: create a NEW service; it must still work --- - seedEPS(t, informersFactory, "pc-2", "svc2-b", "20.100") + seedEPS(t, kubeClient, informersFactory, "pc-2", "svc2-b", "20.100") svc2b := createNEGService(ctx, t, kubeClient, "pc-2", "svc2-b", "demo") validateService(ctx, t, kubeClient, svcNegClient, gceCreator, svc2b, pc2) @@ -451,16 +440,18 @@ func seedAll( ) { t.Helper() populateFakeNodeInformer(t, kubeClient, informersFactory.Core().V1().Nodes().Informer(), ns, cidrPrefix) - populateFakeEndpointSlices(t, informersFactory.Discovery().V1().EndpointSlices().Informer(), svcName, ns, cidrPrefix) + populateFakeEndpointSlices(t, kubeClient, svcName, ns, cidrPrefix) } func seedEPS( t *testing.T, + kubeClient *fake.Clientset, informersFactory informers.SharedInformerFactory, ns, svcName, cidrPrefix string, ) { t.Helper() - populateFakeEndpointSlices(t, informersFactory.Discovery().V1().EndpointSlices().Informer(), svcName, ns, cidrPrefix) + // Create EndpointSlices in the fake client so InformerSet sees them. + populateFakeEndpointSlices(t, kubeClient, svcName, ns, cidrPrefix) } func createNEGService( @@ -717,14 +708,14 @@ func populateFakeNodeInformer( // populateFakeEndpointSlices indexes a set of fake EndpointSlices to simulate the real endpoints in the cluster. func populateFakeEndpointSlices( t *testing.T, - endpointSliceInformer cache.SharedIndexInformer, + client *fake.Clientset, serviceName, providerConfigName, addressPrefix string, ) { t.Helper() endpointSlices := getTestEndpointSlices(serviceName, providerConfigName, addressPrefix) for _, es := range endpointSlices { - if err := endpointSliceInformer.GetIndexer().Add(es); err != nil { - t.Fatalf("Failed to add endpoint slice %q: %v", es.Name, err) + if _, err := client.DiscoveryV1().EndpointSlices(providerConfigName).Create(context.Background(), es, metav1.CreateOptions{}); err != nil { + t.Fatalf("Failed to create endpoint slice %q: %v", es.Name, err) } } } @@ -943,10 +934,6 @@ func TestProviderConfigErrorCases(t *testing.T) { logger := klog.TODO() gceCreator := multiprojectgce.NewGCEFake() - informersFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, flags.F.ResyncPeriod) - svcNegFactory := informersvcneg.NewSharedInformerFactoryWithOptions(svcNegClient, flags.F.ResyncPeriod) - networkFactory := informernetwork.NewSharedInformerFactoryWithOptions(networkClient, flags.F.ResyncPeriod) - nodeTopologyFactory := informernodetopology.NewSharedInformerFactoryWithOptions(nodeTopologyClient, flags.F.ResyncPeriod) rootNamer := namer.NewNamer("test-clusteruid", "", logger) kubeSystemUID := types.UID("test-kube-system-uid") @@ -964,13 +951,11 @@ func TestProviderConfigErrorCases(t *testing.T) { logger, kubeClient, svcNegClient, + networkClient, + nodeTopologyClient, kubeSystemUID, kubeClient, pcClient, - informersFactory, - svcNegFactory, - networkFactory, - nodeTopologyFactory, gceCreator, rootNamer, stopCh, diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/factory.go 
b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/factory.go deleted file mode 100644 index a6f207d5c0..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/factory.go +++ /dev/null @@ -1,262 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package externalversions - -import ( - reflect "reflect" - sync "sync" - time "time" - - versioned "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" - internalinterfaces "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/internalinterfaces" - network "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - schema "k8s.io/apimachinery/pkg/runtime/schema" - cache "k8s.io/client-go/tools/cache" -) - -// SharedInformerOption defines the functional option type for SharedInformerFactory. -type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory - -type sharedInformerFactory struct { - client versioned.Interface - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc - lock sync.Mutex - defaultResync time.Duration - customResync map[reflect.Type]time.Duration - transform cache.TransformFunc - - informers map[reflect.Type]cache.SharedIndexInformer - // startedInformers is used for tracking which informers have been started. - // This allows Start() to be called multiple times safely. - startedInformers map[reflect.Type]bool - // wg tracks how many goroutines were started. - wg sync.WaitGroup - // shuttingDown is true when Shutdown has been called. It may still be running - // because it needs to wait for goroutines. - shuttingDown bool -} - -// WithCustomResyncConfig sets a custom resync period for the specified informer types. -func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - for k, v := range resyncConfig { - factory.customResync[reflect.TypeOf(k)] = v - } - return factory - } -} - -// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. -func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.tweakListOptions = tweakListOptions - return factory - } -} - -// WithNamespace limits the SharedInformerFactory to the specified namespace. -func WithNamespace(namespace string) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.namespace = namespace - return factory - } -} - -// WithTransform sets a transform on all informers. 
-func WithTransform(transform cache.TransformFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.transform = transform - return factory - } -} - -// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. -func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync) -} - -// NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. -// Listers obtained via this SharedInformerFactory will be subject to the same filters -// as specified here. -// Deprecated: Please use NewSharedInformerFactoryWithOptions instead -func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) -} - -// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. -func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { - factory := &sharedInformerFactory{ - client: client, - namespace: v1.NamespaceAll, - defaultResync: defaultResync, - informers: make(map[reflect.Type]cache.SharedIndexInformer), - startedInformers: make(map[reflect.Type]bool), - customResync: make(map[reflect.Type]time.Duration), - } - - // Apply all options - for _, opt := range options { - factory = opt(factory) - } - - return factory -} - -func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { - f.lock.Lock() - defer f.lock.Unlock() - - if f.shuttingDown { - return - } - - for informerType, informer := range f.informers { - if !f.startedInformers[informerType] { - f.wg.Add(1) - // We need a new variable in each loop iteration, - // otherwise the goroutine would use the loop variable - // and that keeps changing. - informer := informer - go func() { - defer f.wg.Done() - informer.Run(stopCh) - }() - f.startedInformers[informerType] = true - } - } -} - -func (f *sharedInformerFactory) Shutdown() { - f.lock.Lock() - f.shuttingDown = true - f.lock.Unlock() - - // Will return immediately if there is nothing to wait for. - f.wg.Wait() -} - -func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { - informers := func() map[reflect.Type]cache.SharedIndexInformer { - f.lock.Lock() - defer f.lock.Unlock() - - informers := map[reflect.Type]cache.SharedIndexInformer{} - for informerType, informer := range f.informers { - if f.startedInformers[informerType] { - informers[informerType] = informer - } - } - return informers - }() - - res := map[reflect.Type]bool{} - for informType, informer := range informers { - res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) - } - return res -} - -// InformerFor returns the SharedIndexInformer for obj using an internal -// client. 
-func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { - f.lock.Lock() - defer f.lock.Unlock() - - informerType := reflect.TypeOf(obj) - informer, exists := f.informers[informerType] - if exists { - return informer - } - - resyncPeriod, exists := f.customResync[informerType] - if !exists { - resyncPeriod = f.defaultResync - } - - informer = newFunc(f.client, resyncPeriod) - informer.SetTransform(f.transform) - f.informers[informerType] = informer - - return informer -} - -// SharedInformerFactory provides shared informers for resources in all known -// API group versions. -// -// It is typically used like this: -// -// ctx, cancel := context.Background() -// defer cancel() -// factory := NewSharedInformerFactory(client, resyncPeriod) -// defer factory.WaitForStop() // Returns immediately if nothing was started. -// genericInformer := factory.ForResource(resource) -// typedInformer := factory.SomeAPIGroup().V1().SomeType() -// factory.Start(ctx.Done()) // Start processing these informers. -// synced := factory.WaitForCacheSync(ctx.Done()) -// for v, ok := range synced { -// if !ok { -// fmt.Fprintf(os.Stderr, "caches failed to sync: %v", v) -// return -// } -// } -// -// // Creating informers can also be created after Start, but then -// // Start must be called again: -// anotherGenericInformer := factory.ForResource(resource) -// factory.Start(ctx.Done()) -type SharedInformerFactory interface { - internalinterfaces.SharedInformerFactory - - // Start initializes all requested informers. They are handled in goroutines - // which run until the stop channel gets closed. - // Warning: Start does not block. When run in a go-routine, it will race with a later WaitForCacheSync. - Start(stopCh <-chan struct{}) - - // Shutdown marks a factory as shutting down. At that point no new - // informers can be started anymore and Start will return without - // doing anything. - // - // In addition, Shutdown blocks until all goroutines have terminated. For that - // to happen, the close channel(s) that they were started with must be closed, - // either before Shutdown gets called or while it is waiting. - // - // Shutdown may be called multiple times, even concurrently. All such calls will - // block until all goroutines have terminated. - Shutdown() - - // WaitForCacheSync blocks until all started informers' caches were synced - // or the stop channel gets closed. - WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool - - // ForResource gives generic access to a shared informer of the matching type. - ForResource(resource schema.GroupVersionResource) (GenericInformer, error) - - // InformerFor returns the SharedIndexInformer for obj using an internal - // client. - InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer - - Networking() network.Interface -} - -func (f *sharedInformerFactory) Networking() network.Interface { - return network.New(f, f.namespace, f.tweakListOptions) -} diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/generic.go b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/generic.go deleted file mode 100644 index 7d9caf1bfa..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/generic.go +++ /dev/null @@ -1,68 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package externalversions - -import ( - "fmt" - - v1 "github.com/GoogleCloudPlatform/gke-networking-api/apis/network/v1" - schema "k8s.io/apimachinery/pkg/runtime/schema" - cache "k8s.io/client-go/tools/cache" -) - -// GenericInformer is type of SharedIndexInformer which will locate and delegate to other -// sharedInformers based on type -type GenericInformer interface { - Informer() cache.SharedIndexInformer - Lister() cache.GenericLister -} - -type genericInformer struct { - informer cache.SharedIndexInformer - resource schema.GroupResource -} - -// Informer returns the SharedIndexInformer. -func (f *genericInformer) Informer() cache.SharedIndexInformer { - return f.informer -} - -// Lister returns the GenericLister. -func (f *genericInformer) Lister() cache.GenericLister { - return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) -} - -// ForResource gives generic access to a shared informer of the matching type -// TODO extend this to unknown resources with a client pool -func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { - switch resource { - // Group=networking.gke.io, Version=v1 - case v1.SchemeGroupVersion.WithResource("gkenetworkparamsets"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Networking().V1().GKENetworkParamSets().Informer()}, nil - case v1.SchemeGroupVersion.WithResource("networks"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Networking().V1().Networks().Informer()}, nil - case v1.SchemeGroupVersion.WithResource("networkinterfaces"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Networking().V1().NetworkInterfaces().Informer()}, nil - case v1.SchemeGroupVersion.WithResource("subnetworks"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Networking().V1().Subnetworks().Informer()}, nil - - } - - return nil, fmt.Errorf("no informer found for %v", resource) -} diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/interface.go b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/interface.go deleted file mode 100644 index ab755e66f0..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/interface.go +++ /dev/null @@ -1,46 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package network - -import ( - internalinterfaces "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/internalinterfaces" - v1 "github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/v1" -) - -// Interface provides access to each of this group's versions. -type Interface interface { - // V1 provides access to shared informers for resources in V1. - V1() v1.Interface -} - -type group struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// V1 returns a new v1.Interface. -func (g *group) V1() v1.Interface { - return v1.New(g.factory, g.namespace, g.tweakListOptions) -} diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/factory.go b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/factory.go deleted file mode 100644 index d387b69978..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/factory.go +++ /dev/null @@ -1,262 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package externalversions - -import ( - reflect "reflect" - sync "sync" - time "time" - - versioned "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" - internalinterfaces "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/internalinterfaces" - nodetopology "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - schema "k8s.io/apimachinery/pkg/runtime/schema" - cache "k8s.io/client-go/tools/cache" -) - -// SharedInformerOption defines the functional option type for SharedInformerFactory. -type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory - -type sharedInformerFactory struct { - client versioned.Interface - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc - lock sync.Mutex - defaultResync time.Duration - customResync map[reflect.Type]time.Duration - transform cache.TransformFunc - - informers map[reflect.Type]cache.SharedIndexInformer - // startedInformers is used for tracking which informers have been started. - // This allows Start() to be called multiple times safely. 
- startedInformers map[reflect.Type]bool - // wg tracks how many goroutines were started. - wg sync.WaitGroup - // shuttingDown is true when Shutdown has been called. It may still be running - // because it needs to wait for goroutines. - shuttingDown bool -} - -// WithCustomResyncConfig sets a custom resync period for the specified informer types. -func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - for k, v := range resyncConfig { - factory.customResync[reflect.TypeOf(k)] = v - } - return factory - } -} - -// WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. -func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.tweakListOptions = tweakListOptions - return factory - } -} - -// WithNamespace limits the SharedInformerFactory to the specified namespace. -func WithNamespace(namespace string) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.namespace = namespace - return factory - } -} - -// WithTransform sets a transform on all informers. -func WithTransform(transform cache.TransformFunc) SharedInformerOption { - return func(factory *sharedInformerFactory) *sharedInformerFactory { - factory.transform = transform - return factory - } -} - -// NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. -func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync) -} - -// NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. -// Listers obtained via this SharedInformerFactory will be subject to the same filters -// as specified here. -// Deprecated: Please use NewSharedInformerFactoryWithOptions instead -func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { - return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) -} - -// NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. -func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { - factory := &sharedInformerFactory{ - client: client, - namespace: v1.NamespaceAll, - defaultResync: defaultResync, - informers: make(map[reflect.Type]cache.SharedIndexInformer), - startedInformers: make(map[reflect.Type]bool), - customResync: make(map[reflect.Type]time.Duration), - } - - // Apply all options - for _, opt := range options { - factory = opt(factory) - } - - return factory -} - -func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { - f.lock.Lock() - defer f.lock.Unlock() - - if f.shuttingDown { - return - } - - for informerType, informer := range f.informers { - if !f.startedInformers[informerType] { - f.wg.Add(1) - // We need a new variable in each loop iteration, - // otherwise the goroutine would use the loop variable - // and that keeps changing. 
- informer := informer - go func() { - defer f.wg.Done() - informer.Run(stopCh) - }() - f.startedInformers[informerType] = true - } - } -} - -func (f *sharedInformerFactory) Shutdown() { - f.lock.Lock() - f.shuttingDown = true - f.lock.Unlock() - - // Will return immediately if there is nothing to wait for. - f.wg.Wait() -} - -func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { - informers := func() map[reflect.Type]cache.SharedIndexInformer { - f.lock.Lock() - defer f.lock.Unlock() - - informers := map[reflect.Type]cache.SharedIndexInformer{} - for informerType, informer := range f.informers { - if f.startedInformers[informerType] { - informers[informerType] = informer - } - } - return informers - }() - - res := map[reflect.Type]bool{} - for informType, informer := range informers { - res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) - } - return res -} - -// InformerFor returns the SharedIndexInformer for obj using an internal -// client. -func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { - f.lock.Lock() - defer f.lock.Unlock() - - informerType := reflect.TypeOf(obj) - informer, exists := f.informers[informerType] - if exists { - return informer - } - - resyncPeriod, exists := f.customResync[informerType] - if !exists { - resyncPeriod = f.defaultResync - } - - informer = newFunc(f.client, resyncPeriod) - informer.SetTransform(f.transform) - f.informers[informerType] = informer - - return informer -} - -// SharedInformerFactory provides shared informers for resources in all known -// API group versions. -// -// It is typically used like this: -// -// ctx, cancel := context.Background() -// defer cancel() -// factory := NewSharedInformerFactory(client, resyncPeriod) -// defer factory.WaitForStop() // Returns immediately if nothing was started. -// genericInformer := factory.ForResource(resource) -// typedInformer := factory.SomeAPIGroup().V1().SomeType() -// factory.Start(ctx.Done()) // Start processing these informers. -// synced := factory.WaitForCacheSync(ctx.Done()) -// for v, ok := range synced { -// if !ok { -// fmt.Fprintf(os.Stderr, "caches failed to sync: %v", v) -// return -// } -// } -// -// // Creating informers can also be created after Start, but then -// // Start must be called again: -// anotherGenericInformer := factory.ForResource(resource) -// factory.Start(ctx.Done()) -type SharedInformerFactory interface { - internalinterfaces.SharedInformerFactory - - // Start initializes all requested informers. They are handled in goroutines - // which run until the stop channel gets closed. - // Warning: Start does not block. When run in a go-routine, it will race with a later WaitForCacheSync. - Start(stopCh <-chan struct{}) - - // Shutdown marks a factory as shutting down. At that point no new - // informers can be started anymore and Start will return without - // doing anything. - // - // In addition, Shutdown blocks until all goroutines have terminated. For that - // to happen, the close channel(s) that they were started with must be closed, - // either before Shutdown gets called or while it is waiting. - // - // Shutdown may be called multiple times, even concurrently. All such calls will - // block until all goroutines have terminated. - Shutdown() - - // WaitForCacheSync blocks until all started informers' caches were synced - // or the stop channel gets closed. 
- WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool - - // ForResource gives generic access to a shared informer of the matching type. - ForResource(resource schema.GroupVersionResource) (GenericInformer, error) - - // InformerFor returns the SharedIndexInformer for obj using an internal - // client. - InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer - - Networking() nodetopology.Interface -} - -func (f *sharedInformerFactory) Networking() nodetopology.Interface { - return nodetopology.New(f, f.namespace, f.tweakListOptions) -} diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/generic.go b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/generic.go deleted file mode 100644 index 6b54cb9076..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/generic.go +++ /dev/null @@ -1,62 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package externalversions - -import ( - "fmt" - - v1 "github.com/GoogleCloudPlatform/gke-networking-api/apis/nodetopology/v1" - schema "k8s.io/apimachinery/pkg/runtime/schema" - cache "k8s.io/client-go/tools/cache" -) - -// GenericInformer is type of SharedIndexInformer which will locate and delegate to other -// sharedInformers based on type -type GenericInformer interface { - Informer() cache.SharedIndexInformer - Lister() cache.GenericLister -} - -type genericInformer struct { - informer cache.SharedIndexInformer - resource schema.GroupResource -} - -// Informer returns the SharedIndexInformer. -func (f *genericInformer) Informer() cache.SharedIndexInformer { - return f.informer -} - -// Lister returns the GenericLister. 
-func (f *genericInformer) Lister() cache.GenericLister { - return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) -} - -// ForResource gives generic access to a shared informer of the matching type -// TODO extend this to unknown resources with a client pool -func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { - switch resource { - // Group=networking.gke.io, Version=v1 - case v1.SchemeGroupVersion.WithResource("nodetopologies"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Networking().V1().NodeTopologies().Informer()}, nil - - } - - return nil, fmt.Errorf("no informer found for %v", resource) -} diff --git a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/interface.go b/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/interface.go deleted file mode 100644 index 0f37c9a121..0000000000 --- a/vendor/github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/interface.go +++ /dev/null @@ -1,46 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by informer-gen. DO NOT EDIT. - -package nodetopology - -import ( - internalinterfaces "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/internalinterfaces" - v1 "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/v1" -) - -// Interface provides access to each of this group's versions. -type Interface interface { - // V1 provides access to shared informers for resources in V1. - V1() v1.Interface -} - -type group struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// V1 returns a new v1.Interface. 
-func (g *group) V1() v1.Interface {
-	return v1.New(g.factory, g.namespace, g.tweakListOptions)
-}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 023c176727..c2270559c4 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -81,9 +81,7 @@ github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versi
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned/scheme
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned/typed/network/v1
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned/typed/network/v1/fake
-github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/internalinterfaces
-github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/informers/externalversions/network/v1
 github.com/GoogleCloudPlatform/gke-networking-api/client/network/listers/network/v1
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned
@@ -91,9 +89,7 @@ github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned/scheme
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned/typed/nodetopology/v1
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned/typed/nodetopology/v1/fake
-github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/internalinterfaces
-github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/informers/externalversions/nodetopology/v1
 github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/listers/nodetopology/v1
 # github.com/GoogleCloudPlatform/k8s-cloud-provider v1.32.0

From 19268b8dd7a3e139e52796d223483add0ae9ebbf Mon Sep 17 00:00:00 2001
From: Slavik Panasovets
Date: Fri, 29 Aug 2025 13:56:12 -0700
Subject: [PATCH 2/6] Add README for the multiproject folder to briefly explain
 its logic

---
 pkg/multiproject/README.md | 219 +++++++++++--------------------
 1 file changed, 61 insertions(+), 158 deletions(-)

diff --git a/pkg/multiproject/README.md b/pkg/multiproject/README.md
index 9be6c1da26..d37a7f4b62 100644
--- a/pkg/multiproject/README.md
+++ b/pkg/multiproject/README.md
@@ -1,179 +1,82 @@
-# Multi-Project Controller Architecture
+# Multi-Project Controller
 
-## Overview
-
-The multi-project controller enables Kubernetes ingress-gce to manage Network Endpoint Groups (NEGs) across multiple Google Cloud Platform (GCP) projects. This allows for multi-tenant scenarios where different namespaces or services can be associated with different GCP projects through ProviderConfig resources.
+Enables ingress-gce to manage GCP resources across multiple projects through ProviderConfig CRs.
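+
+A minimal sketch of how the shared informers are wired up, adapted from the
+call sites in `pkg/multiproject/start/start.go` (runs inside `start.Start()`;
+surrounding setup and parameters are omitted here):
+
+```go
+// Create the base SharedIndexInformers once, without factories.
+informers := multiprojectinformers.NewInformerSet(
+	kubeClient,
+	svcNegClient,
+	networkClient,
+	nodeTopologyClient,
+	metav1.Duration{Duration: flags.F.ResyncPeriod},
+)
+
+// Start them with the process-wide stop channel; Start also waits for the
+// base informer caches to sync before returning.
+if err := informers.Start(stopCh, logger); err != nil {
+	logger.Error(err, "Failed to start informers")
+	return
+}
+```
+
+Per-ProviderConfig controllers then receive filtered views of these base
+informers rather than creating their own.
+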
## Architecture -### Core Components - ``` ┌─────────────────────────────────────────────────────────────┐ -│ Main Process │ +│ Main Controller │ │ │ │ ┌────────────────────────────────────────────────────────┐ │ -│ │ start.Start() │ │ -│ │ - Creates base SharedIndexInformers │ │ -│ │ - Starts informers with globalStopCh │ │ -│ │ - Creates ProviderConfigController │ │ -│ └────────────────────┬───────────────────────────────────┘ │ -│ │ │ -│ ┌────────────────────▼───────────────────────────────────┐ │ -│ │ ProviderConfigController │ │ -│ │ - Watches ProviderConfig resources │ │ -│ │ - Manages lifecycle of per-PC controllers │ │ -│ └────────────────────┬───────────────────────────────────┘ │ -│ │ │ -│ ┌────────────────────▼───────────────────────────────────┐ │ -│ │ ProviderConfigControllersManager │ │ -│ │ - Starts/stops NEG controllers per ProviderConfig │ │ -│ │ - Manages controller lifecycle │ │ -│ └──────────┬─────────────────────┬───────────────────────┘ │ -│ │ │ │ -│ ┌────────▼──────────┐ ┌───────▼──────────┐ │ -│ │ NEG Controller #1 │ │ NEG Controller #2 │ ... │ -│ │ (ProviderConfig A) │ │ (ProviderConfig B) │ │ -│ └────────────────────┘ └──────────────────┘ │ +│ │ Shared Kubernetes Informers │ │ +│ │ (Services, Ingresses, EndpointSlices) │ │ +│ └─────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────────▼──────────────────────────────────┐ │ +│ │ ProviderConfig Controller │ │ +│ │ Watches ProviderConfig resources │ │ +│ │ Manages per-project controllers │ │ +│ └─────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌────────────────┼────────────────┐ │ +│ │ │ │ │ +│ ┌────▼─────┐ ┌─────▼─────┐ ┌─────▼─────┐ │ +│ │Project A │ │ Project B │ │ Project C │ ... │ +│ │Controller│ │Controller │ │Controller │ │ +│ └──────────┘ └───────────┘ └───────────┘ │ └─────────────────────────────────────────────────────────────┘ ``` -### Key Design Principles - -1. **Shared Informers**: Base informers are created once and shared across all ProviderConfig controllers -2. **Filtered Views**: Each NEG controller gets a filtered view of resources based on ProviderConfig -3. **Lifecycle Management**: Controllers can be started/stopped independently as ProviderConfigs are added/removed -4. 
**Channel Management**: Proper channel lifecycle ensures clean shutdown and resource cleanup - -## Component Details - -### start/start.go -Main entry point that: -- Creates base SharedIndexInformers via InformerSet (no factories) -- Starts all informers with the global stop channel -- Creates the ProviderConfigController -- Manages leader election (when enabled) - -### controller/controller.go -ProviderConfigController that: -- Watches ProviderConfig resources -- Enqueues changes for processing -- Delegates to ProviderConfigControllersManager - -### manager/manager.go -ProviderConfigControllersManager that: -- Maintains a map of active controllers per ProviderConfig -- Starts NEG controllers when ProviderConfigs are added -- Stops NEG controllers when ProviderConfigs are deleted -- Manages finalizers for cleanup - -### neg/neg.go -NEG controller factory that: -- Wraps base SharedIndexInformers with provider-config filters via ProviderConfigFilteredInformer -- Sets up the NEG controller with proper GCE client -- Manages channel lifecycle (globalStopCh vs providerConfigStopCh) - -### filteredinformer/ -Filtered informer implementation that: -- Wraps base SharedIndexInformers -- Filters resources based on ProviderConfig labels -- Provides filtered cache/store views - -## Channel Lifecycle - -The implementation uses three types of channels: - -1. **globalStopCh**: Process-wide shutdown signal - - Closes on leader election loss or process termination - - Used by base informers and shared resources - -2. **providerConfigStopCh**: Per-ProviderConfig shutdown signal - - Closed when a ProviderConfig is deleted - - Used to stop PC-specific controllers - -3. **joinedStopCh**: Combined shutdown signal - - Closes when either globalStopCh OR providerConfigStopCh closes - - Used by PC-specific resources that should stop in either case +## Key Concepts -## Resource Filtering +- **ProviderConfig**: CR defining a GCP project configuration +- **Resource Filtering**: Resources are associated via labels; each controller sees only its labeled resources +- **Shared Informers**: Base informers are created once and shared; controllers get filtered views +- **Dynamic Lifecycle**: Controllers start/stop with ProviderConfig create/delete -Resources are associated with ProviderConfigs through labels: -- Services, Ingresses, etc. have a label indicating their ProviderConfig -- The filtered informer only passes through resources matching the PC name -- This ensures each controller only sees and manages its own resources +## Usage -## Informer Lifecycle +### Create ProviderConfig -### Creation -1. Base informers are created via `InformerSet` using `NewXInformer()` functions -2. Base informers are started by `InformerSet.Start` with `globalStopCh` -3. 
Filtered wrappers are created per ProviderConfig using `ProviderConfigFilteredInformer` - -### Synchronization -- `InformerSet.Start` waits for base informer caches to sync -- Filtered informers rely on the synced base caches -- Controllers use `CombinedHasSynced()` from filtered informers before processing - -### Shutdown -- Base informers stop when globalStopCh closes -- Filtered informers are just wrappers (no separate shutdown) -- Controllers stop when their providerConfigStopCh closes - -## Configuration - -Key configuration flags: -- `--provider-config-name-label-key`: Label key for PC association (default: cloud.gke.io/provider-config-name) -- `--multi-project-owner-label-key`: Label key for PC owner -- `--resync-period`: Informer resync period - -## Testing - -### Unit Tests -- Controller logic testing -- Filter functionality testing -- Channel lifecycle testing - -### Integration Tests -- Multi-ProviderConfig scenarios -- Controller start/stop sequencing -- Resource cleanup verification - -### Key Test Scenarios -1. Single ProviderConfig with services -2. Multiple ProviderConfigs -3. ProviderConfig deletion and cleanup -4. Shared informer survival across PC changes - -## Common Operations - -### Adding a ProviderConfig -1. Create ProviderConfig resource -2. Controller detects addition -3. Manager starts NEG controller -4. NEG controller creates filtered informers -5. NEGs are created in target GCP project - -### Removing a ProviderConfig +```yaml +apiVersion: networking.gke.io/v1 +kind: ProviderConfig +metadata: + name: team-a-project +spec: + projectID: team-a-gcp-project + network: team-a-network +``` -The deletion process follows a specific sequence to ensure proper cleanup: +### Associate Resources + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: my-service + labels: + ${PROVIDER_CONFIG_LABEL_KEY}: provider-config-a +spec: + # Service spec... +``` -1. **External automation initiates deletion**: - - Server-side automation triggers the deletion process - - All namespaces belonging to the ProviderConfig are deleted first +## Operations -2. **Namespace cleanup**: - - Kubernetes deletes all resources within the namespaces - - Services are deleted, triggering NEG cleanup - - NEG controller removes NEGs from GCP as services are deleted +### Adding a Project +1. Create ProviderConfig +2. Label services/ingresses with PC name +3. NEGs created in target project -3. **Wait for namespace deletion**: - - External automation waits for all namespaces to be fully deleted - - This ensures all NEGs and other resources are cleaned up +### Removing a Project +1. Remove/relabel services using the PC +2. Wait for NEG cleanup +3. Delete ProviderConfig -4. **ProviderConfig deletion**: - - Only after namespaces are gone, ProviderConfig is deleted - - Controller stops the NEG controller for this ProviderConfig - - Finalizer is removed from ProviderConfig - - ProviderConfig resource is removed from Kubernetes +## Guarantees -**Important**: NEGs are not automatically deleted when a ProviderConfig is removed. They are cleaned up as part of the namespace/service deletion process that happens before ProviderConfig deletion. 
+- Controllers only manage explicitly labeled resources +- One controller per ProviderConfig +- Base infrastructure survives individual controller failures +- PC deletion doesn't affect other projects \ No newline at end of file From bf55f3ea78aa576486ada641303ee6d6f165493d Mon Sep 17 00:00:00 2001 From: Slavik Panasovets Date: Wed, 3 Sep 2025 05:52:17 -0700 Subject: [PATCH 3/6] multiproject/manager: add tests; harden startup and finalizer handling - Add manager_test.go covering: - Start adds NEG cleanup finalizer and is idempotent - Start failure and GCE client creation failure roll back finalizer - Stop closes controller channel and removes finalizer - Concurrent starts for same ProviderConfig are single-shot - Improve manager.go: - Introduce test seam via package var startNEGController - Ensure finalizer is added before start; roll back on any startup error - Delete finalizer using latest ProviderConfig from API to avoid stale updates - Wrap errors with %w and add GoDoc comments - Rename exported ControllerSet to unexported controllerSet - No expected behavior change in the happy path; robustness and testability improved. --- pkg/multiproject/manager/manager.go | 73 ++++-- pkg/multiproject/manager/manager_test.go | 280 +++++++++++++++++++++++ 2 files changed, 340 insertions(+), 13 deletions(-) create mode 100644 pkg/multiproject/manager/manager_test.go diff --git a/pkg/multiproject/manager/manager.go b/pkg/multiproject/manager/manager.go index e0f78ad7dc..ab7caf487d 100644 --- a/pkg/multiproject/manager/manager.go +++ b/pkg/multiproject/manager/manager.go @@ -1,11 +1,13 @@ package manager import ( + "context" "fmt" "sync" networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" @@ -21,9 +23,17 @@ import ( "k8s.io/klog/v2" ) +// startNEGController is a package-level variable to allow tests to stub the +// actual NEG controller startup routine. +var startNEGController = neg.StartNEGController + +// ProviderConfigControllersManager coordinates lifecycle of controllers scoped to +// a single ProviderConfig. It ensures per-ProviderConfig controller startup is +// idempotent, adds/removes the NEG cleanup finalizer, and wires a stop channel +// for clean shutdown. type ProviderConfigControllersManager struct { mu sync.Mutex - controllers map[string]*ControllerSet + controllers map[string]*controllerSet logger klog.Logger providerConfigClient providerconfigclient.Interface @@ -43,10 +53,14 @@ type ProviderConfigControllersManager struct { syncerMetrics *syncMetrics.SyncerMetrics } -type ControllerSet struct { +// controllerSet holds controller-specific resources for a ProviderConfig. +// Unexported because it is an internal implementation detail. +type controllerSet struct { stopCh chan<- struct{} } +// NewProviderConfigControllerManager constructs a new ProviderConfigControllersManager. +// It does not start any controllers until StartControllersForProviderConfig is invoked. 
func NewProviderConfigControllerManager( kubeClient kubernetes.Interface, informers *multiprojectinformers.InformerSet, @@ -65,7 +79,7 @@ func NewProviderConfigControllerManager( syncerMetrics *syncMetrics.SyncerMetrics, ) *ProviderConfigControllersManager { return &ProviderConfigControllersManager{ - controllers: make(map[string]*ControllerSet), + controllers: make(map[string]*controllerSet), logger: logger, providerConfigClient: providerConfigClient, informers: informers, @@ -88,6 +102,10 @@ func providerConfigKey(pc *providerconfig.ProviderConfig) string { return pc.Name } +// StartControllersForProviderConfig ensures finalizers are present and starts +// the controllers associated with the given ProviderConfig. The call is +// idempotent per ProviderConfig: concurrent or repeated calls for the same +// ProviderConfig will only start controllers once. func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error { pccm.mu.Lock() defer pccm.mu.Unlock() @@ -102,17 +120,23 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig( return nil } + // Add the cleanup finalizer up front to avoid a window where controllers + // may create external resources without a finalizer present. If deletion + // happens in that window, cleanup could be skipped. We roll this back on + // any subsequent startup error. err := finalizer.EnsureProviderConfigNEGCleanupFinalizer(pc, pccm.providerConfigClient, logger) if err != nil { - return fmt.Errorf("failed to ensure NEG cleanup finalizer for project %s: %v", pcKey, err) + return fmt.Errorf("failed to ensure NEG cleanup finalizer for project %s: %w", pcKey, err) } cloud, err := pccm.gceCreator.GCEForProviderConfig(pc, logger) if err != nil { - return fmt.Errorf("failed to create GCE client for provider config %+v: %v", pc, err) + // If GCE client creation fails after finalizer was added, roll it back. + pccm.rollbackFinalizerOnStartFailure(pc, logger, err) + return fmt.Errorf("failed to create GCE client for provider config %+v: %w", pc, err) } - negControllerStopCh, err := neg.StartNEGController( + negControllerStopCh, err := startNEGController( pccm.informers, pccm.kubeClient, pccm.eventRecorderClient, @@ -130,14 +154,12 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig( pccm.syncerMetrics, ) if err != nil { - cleanupErr := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pc, pccm.providerConfigClient, logger) - if cleanupErr != nil { - logger.Error(cleanupErr, "failed to clean up NEG finalizer after controller start failure", "originalError", err) - } - return fmt.Errorf("failed to start NEG controller for project %s: %v", pcKey, err) + // If startup fails, make a best-effort to roll back the finalizer. + pccm.rollbackFinalizerOnStartFailure(pc, logger, err) + return fmt.Errorf("failed to start NEG controller for project %s: %w", pcKey, err) } - pccm.controllers[pcKey] = &ControllerSet{ + pccm.controllers[pcKey] = &controllerSet{ stopCh: negControllerStopCh, } @@ -161,9 +183,34 @@ func (pccm *ProviderConfigControllersManager) StopControllersForProviderConfig(p close(pccm.controllers[csKey].stopCh) delete(pccm.controllers, csKey) - err := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pc, pccm.providerConfigClient, logger) + + // Ensure we remove the finalizer from the latest object state. 
+	pcLatest := pccm.latestPCWithCleanupFinalizer(pc)
+	err := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pcLatest, pccm.providerConfigClient, logger)
 	if err != nil {
 		logger.Error(err, "failed to delete NEG cleanup finalizer for project")
 	}
 	logger.Info("Stopped controllers for provider config")
 }
+
+// rollbackFinalizerOnStartFailure removes the NEG cleanup finalizer after a
+// start failure so that ProviderConfig deletion is not blocked.
+func (pccm *ProviderConfigControllersManager) rollbackFinalizerOnStartFailure(pc *providerconfig.ProviderConfig, logger klog.Logger, cause error) {
+	pcLatest := pccm.latestPCWithCleanupFinalizer(pc)
+	if err := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pcLatest, pccm.providerConfigClient, logger); err != nil {
+		logger.Error(err, "failed to clean up NEG finalizer after start failure", "originalError", cause)
+	}
+}
+
+// latestPCWithCleanupFinalizer returns the latest ProviderConfig from the API server.
+// If the Get fails, it returns a local copy of the provided pc with the cleanup
+// finalizer appended to ensure a subsequent delete attempt is not a no-op.
+func (pccm *ProviderConfigControllersManager) latestPCWithCleanupFinalizer(pc *providerconfig.ProviderConfig) *providerconfig.ProviderConfig {
+	pcLatest, err := pccm.providerConfigClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{})
+	if err != nil {
+		pcCopy := pc.DeepCopy()
+		pcCopy.Finalizers = append(pcCopy.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer)
+		return pcCopy
+	}
+	return pcLatest
+}
diff --git a/pkg/multiproject/manager/manager_test.go b/pkg/multiproject/manager/manager_test.go
new file mode 100644
index 0000000000..d343f37322
--- /dev/null
+++ b/pkg/multiproject/manager/manager_test.go
@@ -0,0 +1,280 @@
+package manager
+
+import (
+	"context"
+	"errors"
+	"slices"
+	"sync"
+	"testing"
+
+	networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned"
+	nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/kubernetes/fake"
+	cloudgce "k8s.io/cloud-provider-gcp/providers/gce"
+	providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1"
+	"k8s.io/ingress-gce/pkg/multiproject/finalizer"
+	multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce"
+	multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset"
+	syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector"
+	"k8s.io/ingress-gce/pkg/neg/syncers/labels"
+	providerconfigclientfake "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake"
+	svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned"
+	svcnegfake "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned/fake"
+	"k8s.io/ingress-gce/pkg/utils/namer"
+	klog "k8s.io/klog/v2"
+	ktesting "k8s.io/klog/v2/ktesting"
+)
+
+// startCall records a single invocation of the stubbed start function.
+type startCall struct {
+	pcName string
+}
+
+// failingGCECreator is a test double that fails GCE client creation.
+type failingGCECreator struct{} + +func (f failingGCECreator) GCEForProviderConfig(_ *providerconfig.ProviderConfig, _ klog.Logger) (*cloudgce.Cloud, error) { + return nil, errors.New("boom") +} + +func makePC(name string) *providerconfig.ProviderConfig { + return &providerconfig.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: providerconfig.ProviderConfigSpec{ + ProjectID: "test-project", + ProjectNumber: 123, + NetworkConfig: providerconfig.ProviderNetworkConfig{ + Network: "net-1", + SubnetInfo: providerconfig.ProviderConfigSubnetInfo{Subnetwork: "sub-1"}, + }, + }, + } +} + +func newManagerForTest(t *testing.T, svcNeg svcnegclient.Interface) (*ProviderConfigControllersManager, *providerconfigclientfake.Clientset) { + t.Helper() + kubeClient := fake.NewSimpleClientset() + pcClient := providerconfigclientfake.NewSimpleClientset() + informers := multiprojectinformers.NewInformerSet(kubeClient, svcNeg, networkclient.Interface(nil), nodetopologyclient.Interface(nil), metav1.Duration{}) + logger, _ := ktesting.NewTestContext(t) + + rootNamer := namer.NewNamer("clusteruid", "", logger) + kubeSystemUID := types.UID("uid") + gceCreator := multiprojectgce.NewGCEFake() + lpCfg := labels.PodLabelPropagationConfig{} + globalStop := make(chan struct{}) + t.Cleanup(func() { close(globalStop) }) + + mgr := NewProviderConfigControllerManager( + kubeClient, + informers, + pcClient, + svcNeg, + networkclient.Interface(nil), + nodetopologyclient.Interface(nil), + kubeClient, // event recorder + kubeSystemUID, + rootNamer, + namer.NewL4Namer(string(kubeSystemUID), rootNamer), + lpCfg, + gceCreator, + globalStop, + logger, + syncMetrics.FakeSyncerMetrics(), + ) + return mgr, pcClient +} + +func createProviderConfig(t *testing.T, pcClient *providerconfigclientfake.Clientset, name string) *providerconfig.ProviderConfig { + t.Helper() + pc := makePC(name) + _, err := pcClient.CloudV1().ProviderConfigs().Create(context.Background(), pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("create pc: %v", err) + } + return pc +} + +// TestStart_AddsFinalizer_AndIsIdempotent verifies that starting controllers +// for a ProviderConfig adds the NEG cleanup finalizer and that repeated starts +// are idempotent (the underlying controller is launched only once). 
+func TestStart_AddsFinalizer_AndIsIdempotent(t *testing.T) { + var calls []startCall + orig := startNEGController + startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, + _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, + _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, pc *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { + calls = append(calls, startCall{pcName: pc.Name}) + return make(chan struct{}), nil + } + t.Cleanup(func() { startNEGController = orig }) + + mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) + pc := createProviderConfig(t, pcClient, "pc-1") + + err := mgr.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("StartControllersForProviderConfig error: %v", err) + } + + got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get pc: %v", err) + } + if !slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { + t.Fatalf("expected finalizer to be added, got %v", got.Finalizers) + } + + err = mgr.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("second start returned error: %v", err) + } + + if len(calls) != 1 || calls[0].pcName != pc.Name { + t.Fatalf("unexpected start calls: %#v", calls) + } + +} + +// TestStart_FailureCleansFinalizer verifies that when starting controllers +// fails, the added finalizer is rolled back and removed from the object. +func TestStart_FailureCleansFinalizer(t *testing.T) { + orig := startNEGController + startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, + _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, + _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, _ *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { + return nil, errors.New("boom") + } + t.Cleanup(func() { startNEGController = orig }) + + mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) + pc := createProviderConfig(t, pcClient, "pc-err") + + err := mgr.StartControllersForProviderConfig(pc) + if err == nil { + t.Fatalf("expected error from StartControllersForProviderConfig") + } + + got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get pc: %v", err) + } + if slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { + t.Fatalf("expected finalizer removed on failure, got %v", got.Finalizers) + } + +} + +// TestStart_GCEClientError_CleansFinalizer verifies that if the GCE client +// creation fails after adding the finalizer, the finalizer is rolled back so +// deletion is not blocked unnecessarily. +func TestStart_GCEClientError_CleansFinalizer(t *testing.T) { + mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) + pc := createProviderConfig(t, pcClient, "pc-gce-err") + + // Inject failing GCE creator. 
+ mgr.gceCreator = failingGCECreator{} + + err := mgr.StartControllersForProviderConfig(pc) + if err == nil { + t.Fatalf("expected error from StartControllersForProviderConfig when GCEForProviderConfig fails") + } + + got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get pc: %v", err) + } + if slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { + t.Fatalf("expected finalizer removed on GCE client failure, got %v", got.Finalizers) + } +} + +// TestStop_ClosesChannel_AndRemovesFinalizer verifies that stopping a +// ProviderConfig's controllers closes the stop channel and removes the +// cleanup finalizer from the ProviderConfig resource. +func TestStop_ClosesChannel_AndRemovesFinalizer(t *testing.T) { + var ch chan struct{} + orig := startNEGController + startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, + _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, + _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, _ *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { + ch = make(chan struct{}) + return ch, nil + } + t.Cleanup(func() { startNEGController = orig }) + + mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) + pc := createProviderConfig(t, pcClient, "pc-stop") + + err := mgr.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("start: %v", err) + } + gotBefore, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get pc: %v", err) + } + if !slices.Contains(gotBefore.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { + t.Fatalf("expected finalizer before stop") + } + + mgr.StopControllersForProviderConfig(pc) + + // Channel should be closed (non-blocking read succeeds) + select { + case <-ch: + // ok + default: + t.Fatalf("expected stop channel to be closed") + } + + gotAfter, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("get pc: %v", err) + } + if slices.Contains(gotAfter.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { + t.Fatalf("expected finalizer removed after stop, got %v", gotAfter.Finalizers) + } + + // Second stop should not panic + mgr.StopControllersForProviderConfig(pc) + +} + +// TestConcurrentStart_IsSingleShot verifies that concurrent calls to start +// controllers for the same ProviderConfig result in a single start. 
+func TestConcurrentStart_IsSingleShot(t *testing.T) { + var mu sync.Mutex + var calls []startCall + orig := startNEGController + startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, + _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, + _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, pc *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { + mu.Lock() + defer mu.Unlock() + calls = append(calls, startCall{pcName: pc.Name}) + return make(chan struct{}), nil + } + t.Cleanup(func() { startNEGController = orig }) + + mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) + pc := createProviderConfig(t, pcClient, "pc-concurrent") + + var wg sync.WaitGroup + const n = 10 + wg.Add(n) + for range n { + go func() { + defer wg.Done() + _ = mgr.StartControllersForProviderConfig(pc) + }() + } + wg.Wait() + + if len(calls) != 1 { + t.Fatalf("expected single start call, got %d", len(calls)) + } +} From 291e886bee7b72c53dadc5a5fc78c3db58e1c54d Mon Sep 17 00:00:00 2001 From: Slavik Panasovets Date: Thu, 4 Sep 2025 14:41:56 -0700 Subject: [PATCH 4/6] Add MT NEG Controller unit tests --- pkg/multiproject/controller/controller.go | 9 +- pkg/multiproject/neg/neg.go | 6 +- pkg/multiproject/neg/neg_test.go | 180 ++++++++++++++++++++++ 3 files changed, 186 insertions(+), 9 deletions(-) create mode 100644 pkg/multiproject/neg/neg_test.go diff --git a/pkg/multiproject/controller/controller.go b/pkg/multiproject/controller/controller.go index 300537bfb1..60786a4ac4 100644 --- a/pkg/multiproject/controller/controller.go +++ b/pkg/multiproject/controller/controller.go @@ -2,7 +2,6 @@ package controller import ( - "context" "fmt" "math/rand" "runtime/debug" @@ -72,15 +71,9 @@ func (pcc *ProviderConfigController) Run() { defer pcc.shutdown() pcc.logger.Info("Starting ProviderConfig controller") - ctx, cancel := context.WithCancel(context.Background()) - go func() { - <-pcc.stopCh - pcc.logger.Info("Stop channel closed, cancelling context") - cancel() - }() pcc.logger.Info("Waiting for initial cache sync before starting ProviderConfig Controller") - ok := cache.WaitForCacheSync(ctx.Done(), pcc.hasSynced) + ok := cache.WaitForCacheSync(pcc.stopCh, pcc.hasSynced) if !ok { pcc.logger.Error(nil, "Failed to wait for initial cache sync before starting ProviderConfig Controller") } diff --git a/pkg/multiproject/neg/neg.go b/pkg/multiproject/neg/neg.go index cc064001a0..1507d4d8fc 100644 --- a/pkg/multiproject/neg/neg.go +++ b/pkg/multiproject/neg/neg.go @@ -25,6 +25,10 @@ import ( "k8s.io/klog/v2" ) +// newNEGController is a package-level indirection to allow tests to stub the +// actual NEG controller constructor. In production it points to neg.NewController. +var newNEGController = neg.NewController + // StartNEGController creates and runs a NEG controller for the specified ProviderConfig. // The returned channel is closed by StopControllersForProviderConfig to signal a shutdown // specific to this ProviderConfig's controller. 
@@ -156,7 +160,7 @@ func createNEGController(
 	noDefaultBackendServicePort := utils.ServicePort{}
 	negMetrics := metrics.NewNegMetrics()
 
-	negController, err := neg.NewController(
+	negController, err := newNEGController(
 		kubeClient,
 		svcNegClient,
 		eventRecorderClient,
diff --git a/pkg/multiproject/neg/neg_test.go b/pkg/multiproject/neg/neg_test.go
new file mode 100644
index 0000000000..deb0356e32
--- /dev/null
+++ b/pkg/multiproject/neg/neg_test.go
@@ -0,0 +1,180 @@
+package neg
+
+import (
+	"testing"
+	"time"
+
+	networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned"
+	nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes"
+	k8sfake "k8s.io/client-go/kubernetes/fake"
+	"k8s.io/client-go/tools/cache"
+	providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1"
+	multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce"
+	multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset"
+	"k8s.io/ingress-gce/pkg/neg"
+	"k8s.io/ingress-gce/pkg/neg/metrics"
+	syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector"
+	"k8s.io/ingress-gce/pkg/neg/syncers/labels"
+	negtypes "k8s.io/ingress-gce/pkg/neg/types"
+	svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned"
+	svcnegfake "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned/fake"
+	"k8s.io/ingress-gce/pkg/utils"
+	"k8s.io/ingress-gce/pkg/utils/namer"
+	"k8s.io/ingress-gce/pkg/utils/zonegetter"
+	klog "k8s.io/klog/v2"
+	ktesting "k8s.io/klog/v2/ktesting"
+)
+
+// TestStartNEGController_StopJoin verifies that the stop channel passed to the controller
+// closes when either the global stop channel or the per-ProviderConfig stop channel closes.
+func TestStartNEGController_StopJoin(t *testing.T) {
+	logger, _ := ktesting.NewTestContext(t)
+	kubeClient := k8sfake.NewSimpleClientset()
+	informers := multiprojectinformers.NewInformerSet(kubeClient, svcnegfake.NewSimpleClientset(), networkclient.Interface(nil), nodetopologyclient.Interface(nil), metav1.Duration{})
+
+	// Start base informers; they are not strictly required by our stubbed controller,
+	// but starting them mirrors the real startup flow and ensures CombinedHasSynced would be true if used.
+	globalStop := make(chan struct{})
+	t.Cleanup(func() { close(globalStop) })
+	if err := informers.Start(globalStop, logger); err != nil {
+		t.Fatalf("start informers: %v", err)
+	}
+
+	// Provide required inputs for StartNEGController
+	pc := &providerconfig.ProviderConfig{ObjectMeta: metav1.ObjectMeta{Name: "pc-1"}}
+	kubeSystemUID := types.UID("uid")
+	rootNamer := namer.NewNamer("clusteruid", "", logger)
+	l4Namer := namer.NewL4Namer(string(kubeSystemUID), rootNamer)
+	lpCfg := labels.PodLabelPropagationConfig{}
+	// Create a fake cloud with a valid SubnetworkURL via multiproject helper.
+ gceCreator := multiprojectgce.NewGCEFake() + // Minimal provider config for the GCE fake + pcForCloud := &providerconfig.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{Name: "pc-1"}, + Spec: providerconfig.ProviderConfigSpec{ + ProjectID: "test-project", + ProjectNumber: 123, + NetworkConfig: providerconfig.ProviderNetworkConfig{ + Network: "net-1", + SubnetInfo: providerconfig.ProviderConfigSubnetInfo{Subnetwork: "sub-1"}, + }, + }, + } + cloud, err := gceCreator.GCEForProviderConfig(pcForCloud, logger) + if err != nil { + t.Fatalf("create fake cloud: %v", err) + } + + // Stub newNEGController to capture the stopCh passed in and to construct a minimal controller + // that can run without panics. + var capturedStopCh <-chan struct{} + orig := newNEGController + newNEGController = func(kc kubernetes.Interface, sc svcnegclient.Interface, ec kubernetes.Interface, uid types.UID, + ing cache.SharedIndexInformer, svc cache.SharedIndexInformer, pod cache.SharedIndexInformer, node cache.SharedIndexInformer, + es cache.SharedIndexInformer, sn cache.SharedIndexInformer, netInf cache.SharedIndexInformer, gke cache.SharedIndexInformer, nt cache.SharedIndexInformer, + synced func() bool, l4 namer.L4ResourcesNamer, defSP utils.ServicePort, cloud negtypes.NetworkEndpointGroupCloud, zg *zonegetter.ZoneGetter, nm negtypes.NetworkEndpointGroupNamer, + resync time.Duration, gc time.Duration, workers int, enableRR bool, runL4 bool, nonGCP bool, dualStack bool, lp labels.PodLabelPropagationConfig, + multiNetworking bool, ingressRegional bool, runNetLB bool, readOnly bool, enableNEGsForIngress bool, + stopCh <-chan struct{}, l klog.Logger, negMetrics *metrics.NegMetrics, syncerMetrics *syncMetrics.SyncerMetrics) (*neg.Controller, error) { + capturedStopCh = stopCh + return neg.NewController(kc, sc, ec, uid, ing, svc, pod, node, es, sn, netInf, gke, nt, synced, l4, defSP, cloud, zg, nm, + resync, gc, workers, enableRR, runL4, nonGCP, dualStack, lp, multiNetworking, ingressRegional, runNetLB, readOnly, enableNEGsForIngress, stopCh, l, negMetrics, syncerMetrics) + } + t.Cleanup(func() { newNEGController = orig }) + + testCases := []struct { + name string + closeProvider bool + }{ + {name: "provider-stop-closes-joined", closeProvider: true}, + {name: "global-stop-closes-joined", closeProvider: false}, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + + var joinStop <-chan struct{} + var providerStop chan<- struct{} + + if tc.closeProvider { + // Wire the join to the real globalStop for this subcase. + joinStop = globalStop + var err error + providerStop, err = StartNEGController(informers, kubeClient, kubeClient, svcnegfake.NewSimpleClientset(), networkclient.Interface(nil), nodetopologyclient.Interface(nil), kubeSystemUID, rootNamer, l4Namer, lpCfg, cloud, joinStop, logger, pc, syncMetrics.FakeSyncerMetrics()) + if err != nil { + t.Fatalf("StartNEGController: %v", err) + } + close(providerStop) + } else { + // Use a dedicated join channel so informers keep running. 
+				js := make(chan struct{})
+				joinStop = js
+				var err error
+				providerStop, err = StartNEGController(informers, kubeClient, kubeClient, svcnegfake.NewSimpleClientset(), networkclient.Interface(nil), nodetopologyclient.Interface(nil), kubeSystemUID, rootNamer, l4Namer, lpCfg, cloud, joinStop, logger, &providerconfig.ProviderConfig{ObjectMeta: metav1.ObjectMeta{Name: "pc-2"}}, syncMetrics.FakeSyncerMetrics())
+				if err != nil {
+					t.Fatalf("StartNEGController (2): %v", err)
+				}
+				close(js)
+				defer close(providerStop) // cleanup on subtest exit; nothing else closes this channel in this branch
+			}
+
+			if capturedStopCh == nil {
+				t.Fatalf("capturedStopCh is nil; stub did not run")
+			}
+			select {
+			case <-capturedStopCh:
+				// ok
+			case <-time.After(2 * time.Second):
+				t.Fatalf("joined stopCh did not close for case %q", tc.name)
+			}
+		})
+	}
+}
+
+// TestStartNEGController_NilSvcNegClientErrors verifies StartNEGController returns an error
+// when the svcneg client is nil (which makes controller construction fail).
+func TestStartNEGController_NilSvcNegClientErrors(t *testing.T) {
+	t.Parallel()
+
+	logger, _ := ktesting.NewTestContext(t)
+	kubeClient := k8sfake.NewSimpleClientset()
+	informers := multiprojectinformers.NewInformerSet(kubeClient, nil, networkclient.Interface(nil), nodetopologyclient.Interface(nil), metav1.Duration{})
+	globalStop := make(chan struct{})
+	t.Cleanup(func() { close(globalStop) })
+	if err := informers.Start(globalStop, logger); err != nil {
+		t.Fatalf("start informers: %v", err)
+	}
+
+	pc := &providerconfig.ProviderConfig{ObjectMeta: metav1.ObjectMeta{Name: "pc-err"}}
+	kubeSystemUID := types.UID("uid")
+	rootNamer := namer.NewNamer("clusteruid", "", logger)
+	l4Namer := namer.NewL4Namer(string(kubeSystemUID), rootNamer)
+	lpCfg := labels.PodLabelPropagationConfig{}
+	gceCreator := multiprojectgce.NewGCEFake()
+	pcForCloud := &providerconfig.ProviderConfig{
+		ObjectMeta: metav1.ObjectMeta{Name: "pc-err"},
+		Spec: providerconfig.ProviderConfigSpec{
+			ProjectID:     "test-project",
+			ProjectNumber: 123,
+			NetworkConfig: providerconfig.ProviderNetworkConfig{
+				Network:    "net-1",
+				SubnetInfo: providerconfig.ProviderConfigSubnetInfo{Subnetwork: "sub-1"},
+			},
+		},
+	}
+	cloud, err := gceCreator.GCEForProviderConfig(pcForCloud, logger)
+	if err != nil {
+		t.Fatalf("create fake cloud: %v", err)
+	}
+
+	// newNEGController remains default (neg.NewController), which errors when svcNegClient is nil
+	ch, err := StartNEGController(informers, kubeClient, kubeClient, nil /* svcneg */, networkclient.Interface(nil), nodetopologyclient.Interface(nil), kubeSystemUID, rootNamer, l4Namer, lpCfg, cloud, globalStop, logger, pc, syncMetrics.FakeSyncerMetrics())
+	if err == nil {
+		t.Fatalf("expected error from StartNEGController when svcNegClient is nil, got nil and channel=%v", ch)
+	}
+}

From 73f22ebd8add700710244c2cb25b90b58f57d294 Mon Sep 17 00:00:00 2001
From: Slavik Panasovets
Date: Thu, 9 Oct 2025 18:24:33 -0700
Subject: [PATCH 5/6] Fix mutex contention blocking concurrent ProviderConfig processing

The ProviderConfigControllersManager was using a single mutex that
blocked all 5 workers when one ProviderConfig's initialization took a
long time (e.g., GCE client creation with infinite retry loops). This
prevented concurrent processing of different ProviderConfigs.
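In sketch form, the fix moves from one lock held across the whole start
path to a lock that only guards the map. Simplified pseudocode with
hypothetical names (Manager, Start, startControllers), not the exact
code in the diff below:

    // Before: a single manager-wide mutex serializes all workers.
    func (m *Manager) Start(key string) error {
    	m.mu.Lock()
    	defer m.mu.Unlock() // still held during finalizer patching and GCE client creation
    	if _, ok := m.controllers[key]; ok {
    		return nil
    	}
    	stopCh, err := m.startControllers(key) // may block for a very long time
    	if err != nil {
    		return err
    	}
    	m.controllers[key] = stopCh
    	return nil
    }

    // After: only the map accesses lock (inside Get/Set); slow work runs unlocked.
    func (m *Manager) Start(key string) error {
    	if _, ok := m.controllerMap.Get(key); ok {
    		return nil
    	}
    	stopCh, err := m.startControllers(key) // no lock held here
    	if err != nil {
    		return err
    	}
    	m.controllerMap.Set(key, stopCh)
    	return nil
    }

The window between Get and Set is acceptable only because the workqueue
never hands the same key to two workers at once (see Impact below).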
Changes:
- Extract thread-safe ControllerMap with minimal lock scope
- Only hold lock during map operations (Get/Set/Delete)
- Expensive operations (finalizer, GCE client, NEG controller startup)
  now execute without holding the lock
- Add comprehensive unit tests for concurrent access patterns

Impact:
- Multiple ProviderConfigs can now initialize concurrently
- One stuck ProviderConfig no longer blocks others
- Workqueue guarantees prevent same-key concurrent processing
- All 5 workers can now operate in parallel on different configs
---
 pkg/multiproject/manager/controller_map.go      |  45 ++++++
 .../manager/controller_map_test.go              | 153 ++++++++++++++++++
 pkg/multiproject/manager/manager.go             |  34 ++--
 3 files changed, 211 insertions(+), 21 deletions(-)
 create mode 100644 pkg/multiproject/manager/controller_map.go
 create mode 100644 pkg/multiproject/manager/controller_map_test.go

diff --git a/pkg/multiproject/manager/controller_map.go b/pkg/multiproject/manager/controller_map.go
new file mode 100644
index 0000000000..b75cae2367
--- /dev/null
+++ b/pkg/multiproject/manager/controller_map.go
@@ -0,0 +1,45 @@
+package manager
+
+import "sync"
+
+// ControllerMap is a thread-safe map for storing controllerSet instances.
+// It holds its mutex only for the duration of each map operation, keeping lock scope minimal.
+type ControllerMap struct {
+	mu   sync.Mutex
+	data map[string]*controllerSet
+}
+
+// NewControllerMap creates a new thread-safe ControllerMap.
+func NewControllerMap() *ControllerMap {
+	return &ControllerMap{
+		data: make(map[string]*controllerSet),
+	}
+}
+
+// Get retrieves a controllerSet for the given key.
+// Returns the controllerSet and a boolean indicating whether it exists.
+func (cm *ControllerMap) Get(key string) (*controllerSet, bool) {
+	cm.mu.Lock()
+	defer cm.mu.Unlock()
+	cs, exists := cm.data[key]
+	return cs, exists
+}
+
+// Set stores a controllerSet for the given key.
+func (cm *ControllerMap) Set(key string, cs *controllerSet) {
+	cm.mu.Lock()
+	defer cm.mu.Unlock()
+	cm.data[key] = cs
+}
+
+// Delete removes and returns a controllerSet for the given key.
+// Returns the controllerSet and a boolean indicating whether it existed.
+func (cm *ControllerMap) Delete(key string) (*controllerSet, bool) {
+	cm.mu.Lock()
+	defer cm.mu.Unlock()
+	cs, exists := cm.data[key]
+	if exists {
+		delete(cm.data, key)
+	}
+	return cs, exists
+}
diff --git a/pkg/multiproject/manager/controller_map_test.go b/pkg/multiproject/manager/controller_map_test.go
new file mode 100644
index 0000000000..375f0c49d1
--- /dev/null
+++ b/pkg/multiproject/manager/controller_map_test.go
@@ -0,0 +1,153 @@
+package manager
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+)
+
+// TestControllerMapBasicOperations verifies basic Get, Set, and Delete operations.
+func TestControllerMapBasicOperations(t *testing.T) { + cm := NewControllerMap() + + // Test Get on non-existent key + _, exists := cm.Get("test-key") + if exists { + t.Error("Expected controller to not exist") + } + + // Test Set and Get + testCS := &controllerSet{stopCh: make(chan struct{})} + cm.Set("test-key", testCS) + + cs, exists := cm.Get("test-key") + if !exists { + t.Error("Expected controller to exist after Set") + } + if cs != testCS { + t.Error("Retrieved controller does not match set controller") + } + + // Test Delete + deletedCS, exists := cm.Delete("test-key") + if !exists { + t.Error("Expected controller to exist for Delete") + } + if deletedCS != testCS { + t.Error("Deleted controller does not match original") + } + + // Test Get after Delete + _, exists = cm.Get("test-key") + if exists { + t.Error("Expected controller to not exist after Delete") + } + + // Test Delete on non-existent key + _, exists = cm.Delete("non-existent") + if exists { + t.Error("Expected Delete to return false for non-existent key") + } +} + +// TestControllerMapConcurrentAccess verifies that concurrent operations on different keys +// can proceed without blocking each other. +func TestControllerMapConcurrentAccess(t *testing.T) { + cm := NewControllerMap() + + // Track concurrent operations + var concurrentOps atomic.Int32 + var maxConcurrentOps atomic.Int32 + + const numGoroutines = 10 + const operationDelay = 50 * time.Millisecond + + var wg sync.WaitGroup + wg.Add(numGoroutines) + + for i := 0; i < numGoroutines; i++ { + go func(idx int) { + defer wg.Done() + + key := string(rune('a' + idx)) // Different keys: "a", "b", "c", etc. + + // Track concurrent execution + current := concurrentOps.Add(1) + defer concurrentOps.Add(-1) + + // Update max concurrent ops + for { + max := maxConcurrentOps.Load() + if current <= max || maxConcurrentOps.CompareAndSwap(max, current) { + break + } + } + + // Simulate some work while "holding" access to this key + time.Sleep(operationDelay) + + // Perform operations + cs := &controllerSet{stopCh: make(chan struct{})} + cm.Set(key, cs) + + retrievedCS, exists := cm.Get(key) + if !exists || retrievedCS != cs { + t.Errorf("Failed to retrieve controller for key %s", key) + } + + cm.Delete(key) + }(i) + } + + wg.Wait() + + // Verify that multiple operations ran concurrently + // With 10 goroutines and 50ms delay, we should see significant concurrency + actualMax := maxConcurrentOps.Load() + if actualMax < 2 { + t.Errorf("Expected at least 2 concurrent operations, got %d. This suggests blocking that shouldn't occur.", actualMax) + } + + t.Logf("Max concurrent operations: %d (out of %d goroutines)", actualMax, numGoroutines) +} + +// TestControllerMapConcurrentSameKey verifies that concurrent operations on the same key +// are properly serialized and don't cause race conditions. 
+func TestControllerMapConcurrentSameKey(t *testing.T) { + cm := NewControllerMap() + + const numGoroutines = 100 + const key = "shared-key" + + var wg sync.WaitGroup + wg.Add(numGoroutines) + + // All goroutines try to set and get the same key + for i := 0; i < numGoroutines; i++ { + go func(idx int) { + defer wg.Done() + + cs := &controllerSet{stopCh: make(chan struct{})} + cm.Set(key, cs) + + retrievedCS, exists := cm.Get(key) + if !exists { + t.Errorf("Expected controller to exist for key %s", key) + } + // The retrieved controller might not be the one we just set + // (due to concurrent operations), but it should be valid + if retrievedCS == nil { + t.Error("Retrieved nil controller") + } + }(i) + } + + wg.Wait() + + // Cleanup - should still have one controller + _, exists := cm.Get(key) + if !exists { + t.Error("Expected at least one controller to remain") + } +} diff --git a/pkg/multiproject/manager/manager.go b/pkg/multiproject/manager/manager.go index ab7caf487d..2f96cf5dea 100644 --- a/pkg/multiproject/manager/manager.go +++ b/pkg/multiproject/manager/manager.go @@ -3,7 +3,6 @@ package manager import ( "context" "fmt" - "sync" networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" @@ -32,8 +31,7 @@ var startNEGController = neg.StartNEGController // idempotent, adds/removes the NEG cleanup finalizer, and wires a stop channel // for clean shutdown. type ProviderConfigControllersManager struct { - mu sync.Mutex - controllers map[string]*controllerSet + controllers *ControllerMap logger klog.Logger providerConfigClient providerconfigclient.Interface @@ -54,7 +52,6 @@ type ProviderConfigControllersManager struct { } // controllerSet holds controller-specific resources for a ProviderConfig. -// Unexported because it is an internal implementation detail. type controllerSet struct { stopCh chan<- struct{} } @@ -79,7 +76,7 @@ func NewProviderConfigControllerManager( syncerMetrics *syncMetrics.SyncerMetrics, ) *ProviderConfigControllersManager { return &ProviderConfigControllersManager{ - controllers: make(map[string]*controllerSet), + controllers: NewControllerMap(), logger: logger, providerConfigClient: providerConfigClient, informers: informers, @@ -107,23 +104,18 @@ func providerConfigKey(pc *providerconfig.ProviderConfig) string { // idempotent per ProviderConfig: concurrent or repeated calls for the same // ProviderConfig will only start controllers once. func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error { - pccm.mu.Lock() - defer pccm.mu.Unlock() - logger := pccm.logger.WithValues("providerConfigId", pc.Name) pcKey := providerConfigKey(pc) logger.Info("Starting controllers for provider config") - if _, exists := pccm.controllers[pcKey]; exists { + + // Check if controller already exists + if _, exists := pccm.controllers.Get(pcKey); exists { logger.Info("Controllers for provider config already exist, skipping start") return nil } - // Add the cleanup finalizer up front to avoid a window where controllers - // may create external resources without a finalizer present. If deletion - // happens in that window, cleanup could be skipped. We roll this back on - // any subsequent startup error. 
 	err := finalizer.EnsureProviderConfigNEGCleanupFinalizer(pc, pccm.providerConfigClient, logger)
 	if err != nil {
 		return fmt.Errorf("failed to ensure NEG cleanup finalizer for project %s: %w", pcKey, err)
 	}
@@ -159,30 +151,29 @@ func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(
 		return fmt.Errorf("failed to start NEG controller for project %s: %w", pcKey, err)
 	}
 
-	pccm.controllers[pcKey] = &controllerSet{
+	pccm.controllers.Set(pcKey, &controllerSet{
 		stopCh: negControllerStopCh,
-	}
+	})
 
 	logger.Info("Started controllers for provider config")
 	return nil
 }
 
 func (pccm *ProviderConfigControllersManager) StopControllersForProviderConfig(pc *providerconfig.ProviderConfig) {
-	pccm.mu.Lock()
-	defer pccm.mu.Unlock()
-
 	logger := pccm.logger.WithValues("providerConfigId", pc.Name)
 
 	csKey := providerConfigKey(pc)
-	_, exists := pccm.controllers[csKey]
+
+	// Remove the controller from the map; Delete takes the lock only briefly.
+	cs, exists := pccm.controllers.Delete(csKey)
 	if !exists {
 		logger.Info("Controllers for provider config do not exist")
 		return
 	}
 
-	close(pccm.controllers[csKey].stopCh)
+	// Perform cleanup operations without holding the lock.
+	// Delete above already removed the map entry, so no second map call is needed.
+	close(cs.stopCh)
 
-	delete(pccm.controllers, csKey)
-
 	// Ensure we remove the finalizer from the latest object state.
 	pcLatest := pccm.latestPCWithCleanupFinalizer(pc)

From 4619ba954121e79f2b02b6571a55863c428a1303 Mon Sep 17 00:00:00 2001
From: Slavik Panasovets
Date: Mon, 20 Oct 2025 15:45:37 +0400
Subject: [PATCH 6/6] Refactor multiproject into reusable framework

Extract common utilities into pkg/multiproject/common and create
generic controller framework in pkg/multiproject/framework. Reorganize
NEG implementation to use the new structure.

This refactoring enables reuse of controller patterns across different
multiproject implementations while maintaining clean separation of
concerns.
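As an illustration, a controller implementation plugs into the new
framework roughly as follows. This is a minimal sketch against the
ControllerStarter and framework.New APIs introduced in this patch; the
negStarter type, the run helper, and the finalizer name are hypothetical:

    package example

    import (
    	"k8s.io/client-go/tools/cache"
    	providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1"
    	"k8s.io/ingress-gce/pkg/multiproject/framework"
    	providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned"
    	"k8s.io/klog/v2"
    )

    // negStarter implements framework.ControllerStarter.
    type negStarter struct{}

    // StartController launches ProviderConfig-scoped workers and returns the
    // channel that the framework closes to stop them.
    func (s *negStarter) StartController(pc *providerconfig.ProviderConfig) (chan<- struct{}, error) {
    	stopCh := make(chan struct{})
    	go func() {
    		<-stopCh // run until the framework signals shutdown
    		// tear down per-ProviderConfig resources here
    	}()
    	return stopCh, nil
    }

    func run(pcClient providerconfigclient.Interface, pcInformer cache.SharedIndexInformer, stop <-chan struct{}, logger klog.Logger) {
    	fw := framework.New(pcClient, pcInformer, "example.com/cleanup-finalizer", &negStarter{}, stop, logger)
    	fw.Run() // blocks until stop closes
    }

The framework owns finalizer handling and idempotent start/stop, so an
implementation only has to supply the StartController body.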
--- cmd/glbc/main.go | 2 +- .../filteredinformer/filteredcache.go | 0 .../filteredinformer/filteredcache_test.go | 0 .../filteredinformer/filteredinformer.go | 0 .../filteredinformer/filteredinformer_test.go | 0 .../{ => common}/filteredinformer/helpers.go | 0 .../filteredinformer/helpers_test.go | 0 .../{ => common}/finalizer/finalizer.go | 10 +- .../{ => common}/finalizer/finalizer_test.go | 0 pkg/multiproject/{ => common}/gce/fake.go | 0 .../{ => common}/gce/fake_test.go | 0 pkg/multiproject/{ => common}/gce/gce.go | 0 pkg/multiproject/{ => common}/gce/gce_test.go | 0 .../testutil/providerconfigwebhook.go | 0 .../{controller => framework}/controller.go | 31 +- pkg/multiproject/framework/controller_map.go | 79 ++ .../controller_map_test.go | 25 +- .../controller_test.go | 173 ++++- pkg/multiproject/framework/framework.go | 76 ++ pkg/multiproject/framework/manager.go | 175 +++++ pkg/multiproject/framework/manager_test.go | 703 ++++++++++++++++++ pkg/multiproject/manager/controller_map.go | 45 -- pkg/multiproject/manager/manager.go | 208 ------ pkg/multiproject/manager/manager_test.go | 280 ------- .../{ => neg}/informerset/informerset.go | 2 +- .../{ => neg}/informerset/informerset_test.go | 0 pkg/multiproject/neg/neg.go | 2 +- pkg/multiproject/neg/neg_test.go | 4 +- pkg/multiproject/neg/starter.go | 106 +++ pkg/multiproject/start/start.go | 27 +- pkg/multiproject/start/start_test.go | 6 +- 31 files changed, 1350 insertions(+), 604 deletions(-) rename pkg/multiproject/{ => common}/filteredinformer/filteredcache.go (100%) rename pkg/multiproject/{ => common}/filteredinformer/filteredcache_test.go (100%) rename pkg/multiproject/{ => common}/filteredinformer/filteredinformer.go (100%) rename pkg/multiproject/{ => common}/filteredinformer/filteredinformer_test.go (100%) rename pkg/multiproject/{ => common}/filteredinformer/helpers.go (100%) rename pkg/multiproject/{ => common}/filteredinformer/helpers_test.go (100%) rename pkg/multiproject/{ => common}/finalizer/finalizer.go (81%) rename pkg/multiproject/{ => common}/finalizer/finalizer_test.go (100%) rename pkg/multiproject/{ => common}/gce/fake.go (100%) rename pkg/multiproject/{ => common}/gce/fake_test.go (100%) rename pkg/multiproject/{ => common}/gce/gce.go (100%) rename pkg/multiproject/{ => common}/gce/gce_test.go (100%) rename pkg/multiproject/{ => common}/testutil/providerconfigwebhook.go (100%) rename pkg/multiproject/{controller => framework}/controller.go (81%) create mode 100644 pkg/multiproject/framework/controller_map.go rename pkg/multiproject/{manager => framework}/controller_map_test.go (86%) rename pkg/multiproject/{controller => framework}/controller_test.go (54%) create mode 100644 pkg/multiproject/framework/framework.go create mode 100644 pkg/multiproject/framework/manager.go create mode 100644 pkg/multiproject/framework/manager_test.go delete mode 100644 pkg/multiproject/manager/controller_map.go delete mode 100644 pkg/multiproject/manager/manager.go delete mode 100644 pkg/multiproject/manager/manager_test.go rename pkg/multiproject/{ => neg}/informerset/informerset.go (99%) rename pkg/multiproject/{ => neg}/informerset/informerset_test.go (100%) create mode 100644 pkg/multiproject/neg/starter.go diff --git a/cmd/glbc/main.go b/cmd/glbc/main.go index 1f889a07b9..2534bf8ef6 100644 --- a/cmd/glbc/main.go +++ b/cmd/glbc/main.go @@ -42,7 +42,7 @@ import ( frontendconfigclient "k8s.io/ingress-gce/pkg/frontendconfig/client/clientset/versioned" "k8s.io/ingress-gce/pkg/instancegroups" "k8s.io/ingress-gce/pkg/l4lb" - 
multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce" + multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/common/gce" multiprojectstart "k8s.io/ingress-gce/pkg/multiproject/start" "k8s.io/ingress-gce/pkg/network" providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" diff --git a/pkg/multiproject/filteredinformer/filteredcache.go b/pkg/multiproject/common/filteredinformer/filteredcache.go similarity index 100% rename from pkg/multiproject/filteredinformer/filteredcache.go rename to pkg/multiproject/common/filteredinformer/filteredcache.go diff --git a/pkg/multiproject/filteredinformer/filteredcache_test.go b/pkg/multiproject/common/filteredinformer/filteredcache_test.go similarity index 100% rename from pkg/multiproject/filteredinformer/filteredcache_test.go rename to pkg/multiproject/common/filteredinformer/filteredcache_test.go diff --git a/pkg/multiproject/filteredinformer/filteredinformer.go b/pkg/multiproject/common/filteredinformer/filteredinformer.go similarity index 100% rename from pkg/multiproject/filteredinformer/filteredinformer.go rename to pkg/multiproject/common/filteredinformer/filteredinformer.go diff --git a/pkg/multiproject/filteredinformer/filteredinformer_test.go b/pkg/multiproject/common/filteredinformer/filteredinformer_test.go similarity index 100% rename from pkg/multiproject/filteredinformer/filteredinformer_test.go rename to pkg/multiproject/common/filteredinformer/filteredinformer_test.go diff --git a/pkg/multiproject/filteredinformer/helpers.go b/pkg/multiproject/common/filteredinformer/helpers.go similarity index 100% rename from pkg/multiproject/filteredinformer/helpers.go rename to pkg/multiproject/common/filteredinformer/helpers.go diff --git a/pkg/multiproject/filteredinformer/helpers_test.go b/pkg/multiproject/common/filteredinformer/helpers_test.go similarity index 100% rename from pkg/multiproject/filteredinformer/helpers_test.go rename to pkg/multiproject/common/filteredinformer/helpers_test.go diff --git a/pkg/multiproject/finalizer/finalizer.go b/pkg/multiproject/common/finalizer/finalizer.go similarity index 81% rename from pkg/multiproject/finalizer/finalizer.go rename to pkg/multiproject/common/finalizer/finalizer.go index 1310585559..6169b78eb4 100644 --- a/pkg/multiproject/finalizer/finalizer.go +++ b/pkg/multiproject/common/finalizer/finalizer.go @@ -16,14 +16,15 @@ const ( ) func EnsureProviderConfigNEGCleanupFinalizer(cs *providerconfig.ProviderConfig, csClient providerconfigclient.Interface, logger klog.Logger) error { - return ensureProviderConfigFinalizer(cs, ProviderConfigNEGCleanupFinalizer, csClient, logger) + return EnsureProviderConfigFinalizer(cs, ProviderConfigNEGCleanupFinalizer, csClient, logger) } func DeleteProviderConfigNEGCleanupFinalizer(cs *providerconfig.ProviderConfig, csClient providerconfigclient.Interface, logger klog.Logger) error { - return deleteProviderConfigFinalizer(cs, ProviderConfigNEGCleanupFinalizer, csClient, logger) + return DeleteProviderConfigFinalizer(cs, ProviderConfigNEGCleanupFinalizer, csClient, logger) } -func ensureProviderConfigFinalizer(pc *providerconfig.ProviderConfig, key string, csClient providerconfigclient.Interface, logger klog.Logger) error { +// EnsureProviderConfigFinalizer ensures a finalizer with the given name is present on the ProviderConfig. 
+func EnsureProviderConfigFinalizer(pc *providerconfig.ProviderConfig, key string, csClient providerconfigclient.Interface, logger klog.Logger) error { if HasGivenFinalizer(pc.ObjectMeta, key) { return nil } @@ -36,7 +37,8 @@ func ensureProviderConfigFinalizer(pc *providerconfig.ProviderConfig, key string return patch.PatchProviderConfigObjectMetadata(csClient, pc, *updatedObjectMeta) } -func deleteProviderConfigFinalizer(pc *providerconfig.ProviderConfig, key string, csClient providerconfigclient.Interface, logger klog.Logger) error { +// DeleteProviderConfigFinalizer removes a finalizer with the given name from the ProviderConfig. +func DeleteProviderConfigFinalizer(pc *providerconfig.ProviderConfig, key string, csClient providerconfigclient.Interface, logger klog.Logger) error { if !HasGivenFinalizer(pc.ObjectMeta, key) { return nil } diff --git a/pkg/multiproject/finalizer/finalizer_test.go b/pkg/multiproject/common/finalizer/finalizer_test.go similarity index 100% rename from pkg/multiproject/finalizer/finalizer_test.go rename to pkg/multiproject/common/finalizer/finalizer_test.go diff --git a/pkg/multiproject/gce/fake.go b/pkg/multiproject/common/gce/fake.go similarity index 100% rename from pkg/multiproject/gce/fake.go rename to pkg/multiproject/common/gce/fake.go diff --git a/pkg/multiproject/gce/fake_test.go b/pkg/multiproject/common/gce/fake_test.go similarity index 100% rename from pkg/multiproject/gce/fake_test.go rename to pkg/multiproject/common/gce/fake_test.go diff --git a/pkg/multiproject/gce/gce.go b/pkg/multiproject/common/gce/gce.go similarity index 100% rename from pkg/multiproject/gce/gce.go rename to pkg/multiproject/common/gce/gce.go diff --git a/pkg/multiproject/gce/gce_test.go b/pkg/multiproject/common/gce/gce_test.go similarity index 100% rename from pkg/multiproject/gce/gce_test.go rename to pkg/multiproject/common/gce/gce_test.go diff --git a/pkg/multiproject/testutil/providerconfigwebhook.go b/pkg/multiproject/common/testutil/providerconfigwebhook.go similarity index 100% rename from pkg/multiproject/testutil/providerconfigwebhook.go rename to pkg/multiproject/common/testutil/providerconfigwebhook.go diff --git a/pkg/multiproject/controller/controller.go b/pkg/multiproject/framework/controller.go similarity index 81% rename from pkg/multiproject/controller/controller.go rename to pkg/multiproject/framework/controller.go index 60786a4ac4..3561fb2a43 100644 --- a/pkg/multiproject/controller/controller.go +++ b/pkg/multiproject/framework/controller.go @@ -1,5 +1,5 @@ -// Package controller implements the ProviderConfig controller that starts and stops controllers for each ProviderConfig. -package controller +// Package framework implements the ProviderConfig controller that starts and stops controllers for each ProviderConfig. +package framework import ( "fmt" @@ -17,17 +17,17 @@ const ( workersNum = 5 ) -// ProviderConfigControllerManager implements the logic for starting and stopping controllers for each ProviderConfig. -type ProviderConfigControllerManager interface { +// providerConfigControllerManager implements the logic for starting and stopping controllers for each ProviderConfig. +type providerConfigControllerManager interface { StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error StopControllersForProviderConfig(pc *providerconfig.ProviderConfig) } -// ProviderConfigController is a controller that manages the ProviderConfig resource. +// providerConfigController is a controller that manages the ProviderConfig resource. 
// It is responsible for starting and stopping controllers for each ProviderConfig. -// Currently, it only manages the NEG controller using the ProviderConfigControllerManager. -type ProviderConfigController struct { - manager ProviderConfigControllerManager +// Currently, it only manages the NEG controller using the providerConfigControllerManager. +type providerConfigController struct { + manager providerConfigControllerManager providerConfigLister cache.Indexer providerConfigQueue utils.TaskQueue @@ -37,10 +37,10 @@ type ProviderConfigController struct { hasSynced func() bool } -// NewProviderConfigController creates a new instance of the ProviderConfig controller. -func NewProviderConfigController(manager ProviderConfigControllerManager, providerConfigInformer cache.SharedIndexInformer, stopCh <-chan struct{}, logger klog.Logger) *ProviderConfigController { +// newProviderConfigController creates a new instance of the ProviderConfig controller. +func newProviderConfigController(manager providerConfigControllerManager, providerConfigInformer cache.SharedIndexInformer, stopCh <-chan struct{}, logger klog.Logger) *providerConfigController { logger = logger.WithName(providerConfigControllerName) - pcc := &ProviderConfigController{ + pcc := &providerConfigController{ providerConfigLister: providerConfigInformer.GetIndexer(), stopCh: stopCh, numWorkers: workersNum, @@ -67,7 +67,7 @@ func NewProviderConfigController(manager ProviderConfigControllerManager, provid return pcc } -func (pcc *ProviderConfigController) Run() { +func (pcc *providerConfigController) Run() { defer pcc.shutdown() pcc.logger.Info("Starting ProviderConfig controller") @@ -76,6 +76,7 @@ func (pcc *ProviderConfigController) Run() { ok := cache.WaitForCacheSync(pcc.stopCh, pcc.hasSynced) if !ok { pcc.logger.Error(nil, "Failed to wait for initial cache sync before starting ProviderConfig Controller") + return } pcc.logger.Info("Started ProviderConfig Controller", "numWorkers", pcc.numWorkers) @@ -84,12 +85,12 @@ func (pcc *ProviderConfigController) Run() { <-pcc.stopCh } -func (pcc *ProviderConfigController) shutdown() { +func (pcc *providerConfigController) shutdown() { pcc.logger.Info("Shutting down ProviderConfig Controller") pcc.providerConfigQueue.Shutdown() } -func (pcc *ProviderConfigController) syncWrapper(key string) error { +func (pcc *providerConfigController) syncWrapper(key string) error { syncID := rand.Int31() svcLogger := pcc.logger.WithValues("providerConfigKey", key, "syncId", syncID) @@ -106,7 +107,7 @@ func (pcc *ProviderConfigController) syncWrapper(key string) error { return err } -func (pcc *ProviderConfigController) sync(key string, logger klog.Logger) error { +func (pcc *providerConfigController) sync(key string, logger klog.Logger) error { logger = logger.WithName("providerConfig.sync") providerConfig, exists, err := pcc.providerConfigLister.GetByKey(key) diff --git a/pkg/multiproject/framework/controller_map.go b/pkg/multiproject/framework/controller_map.go new file mode 100644 index 0000000000..418c77b03b --- /dev/null +++ b/pkg/multiproject/framework/controller_map.go @@ -0,0 +1,79 @@ +package framework + +import "sync" + +// controllerSet holds controller-specific resources for a ProviderConfig. +// It contains the stop channel used to signal controller shutdown and a mutex +// that serializes lifecycle transitions for the underlying controllers. +type controllerSet struct { + mu sync.Mutex + stopCh chan<- struct{} +} + +// ControllerMap is a thread-safe map for storing controllerSet instances. 
+// It uses read-write locking to allow concurrent read operations. +type ControllerMap struct { + mu sync.RWMutex + data map[string]*controllerSet +} + +// NewControllerMap creates a new thread-safe ControllerMap. +func NewControllerMap() *ControllerMap { + return &ControllerMap{ + data: make(map[string]*controllerSet), + } +} + +// Get retrieves a controllerSet for the given key. +// Returns the controllerSet and a boolean indicating whether it exists. +func (cm *ControllerMap) Get(key string) (*controllerSet, bool) { + cm.mu.RLock() + defer cm.mu.RUnlock() + cs, exists := cm.data[key] + return cs, exists +} + +// GetOrCreate retrieves the controllerSet for the given key, creating a new entry when absent. +// The second return value indicates whether the controllerSet already existed. +func (cm *ControllerMap) GetOrCreate(key string) (*controllerSet, bool) { + cm.mu.Lock() + defer cm.mu.Unlock() + if cs, exists := cm.data[key]; exists { + return cs, true + } + cs := &controllerSet{} + cm.data[key] = cs + return cs, false +} + +// Set stores a controllerSet for the given key. +func (cm *ControllerMap) Set(key string, cs *controllerSet) { + cm.mu.Lock() + defer cm.mu.Unlock() + cm.data[key] = cs +} + +// Delete removes and returns a controllerSet for the given key. +// Returns the controllerSet and a boolean indicating whether it existed. +func (cm *ControllerMap) Delete(key string) (*controllerSet, bool) { + cm.mu.Lock() + defer cm.mu.Unlock() + cs, exists := cm.data[key] + if exists { + delete(cm.data, key) + } + return cs, exists +} + +// DeleteIfSame removes the controllerSet for key only if it currently equals expected. +// Returns true if a deletion happened. +func (cm *ControllerMap) DeleteIfSame(key string, expected *controllerSet) bool { + cm.mu.Lock() + defer cm.mu.Unlock() + cur, ok := cm.data[key] + if !ok || cur != expected { + return false + } + delete(cm.data, key) + return true +} diff --git a/pkg/multiproject/manager/controller_map_test.go b/pkg/multiproject/framework/controller_map_test.go similarity index 86% rename from pkg/multiproject/manager/controller_map_test.go rename to pkg/multiproject/framework/controller_map_test.go index 375f0c49d1..87d03f0172 100644 --- a/pkg/multiproject/manager/controller_map_test.go +++ b/pkg/multiproject/framework/controller_map_test.go @@ -1,6 +1,7 @@ -package manager +package framework import ( + "fmt" "sync" "sync/atomic" "testing" @@ -51,6 +52,26 @@ func TestControllerMapBasicOperations(t *testing.T) { } } +func TestControllerMapGetOrCreate(t *testing.T) { + cm := NewControllerMap() + + first, existed := cm.GetOrCreate("alpha") + if existed { + t.Fatal("expected first GetOrCreate call to report non-existence") + } + if first == nil { + t.Fatal("expected controllerSet instance on first GetOrCreate call") + } + + second, existed := cm.GetOrCreate("alpha") + if !existed { + t.Fatal("expected second GetOrCreate call to report existence") + } + if first != second { + t.Fatal("expected GetOrCreate to return the same controllerSet instance") + } +} + // TestControllerMapConcurrentAccess verifies that concurrent operations on different keys // can proceed without blocking each other. func TestControllerMapConcurrentAccess(t *testing.T) { @@ -70,7 +91,7 @@ func TestControllerMapConcurrentAccess(t *testing.T) { go func(idx int) { defer wg.Done() - key := string(rune('a' + idx)) // Different keys: "a", "b", "c", etc. 
+ key := fmt.Sprintf("key-%d", idx) // Track concurrent execution current := concurrentOps.Add(1) diff --git a/pkg/multiproject/controller/controller_test.go b/pkg/multiproject/framework/controller_test.go similarity index 54% rename from pkg/multiproject/controller/controller_test.go rename to pkg/multiproject/framework/controller_test.go index 2132f43c1b..f3895752ed 100644 --- a/pkg/multiproject/controller/controller_test.go +++ b/pkg/multiproject/framework/controller_test.go @@ -1,4 +1,4 @@ -package controller +package framework import ( "context" @@ -9,9 +9,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" - providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" providerconfigv1 "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" - "k8s.io/ingress-gce/pkg/multiproject/manager" fakeproviderconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake" providerconfiginformers "k8s.io/ingress-gce/pkg/providerconfig/client/informers/externalversions" "k8s.io/klog/v2" @@ -22,13 +20,25 @@ func init() { providerconfigv1.AddToScheme(scheme.Scheme) } -// fakeProviderConfigControllersManager implements manager.ProviderConfigControllersManager +// pollForCondition polls for a condition to be true, with a timeout. +// Returns true if the condition is met within the timeout, false otherwise. +func pollForCondition(condition func() bool, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if condition() { + return true + } + time.Sleep(10 * time.Millisecond) + } + return false +} + +// fakeProviderConfigControllersManager implements ProviderConfigControllerManager // and lets us track calls to StartControllersForProviderConfig/StopControllersForProviderConfig. 
type fakeProviderConfigControllersManager struct { - manager.ProviderConfigControllersManager mu sync.Mutex - startedConfigs map[string]*providerconfig.ProviderConfig - stoppedConfigs map[string]*providerconfig.ProviderConfig + startedConfigs map[string]*providerconfigv1.ProviderConfig + stoppedConfigs map[string]*providerconfigv1.ProviderConfig startErr error // optional injected error stopErr error // optional injected error @@ -36,13 +46,12 @@ type fakeProviderConfigControllersManager struct { func newFakeProviderConfigControllersManager() *fakeProviderConfigControllersManager { return &fakeProviderConfigControllersManager{ - ProviderConfigControllersManager: manager.ProviderConfigControllersManager{}, - startedConfigs: make(map[string]*providerconfig.ProviderConfig), - stoppedConfigs: make(map[string]*providerconfig.ProviderConfig), + startedConfigs: make(map[string]*providerconfigv1.ProviderConfig), + stoppedConfigs: make(map[string]*providerconfigv1.ProviderConfig), } } -func (f *fakeProviderConfigControllersManager) StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error { +func (f *fakeProviderConfigControllersManager) StartControllersForProviderConfig(pc *providerconfigv1.ProviderConfig) error { f.mu.Lock() defer f.mu.Unlock() if f.startErr != nil { @@ -52,7 +61,7 @@ func (f *fakeProviderConfigControllersManager) StartControllersForProviderConfig return nil } -func (f *fakeProviderConfigControllersManager) StopControllersForProviderConfig(pc *providerconfig.ProviderConfig) { +func (f *fakeProviderConfigControllersManager) StopControllersForProviderConfig(pc *providerconfigv1.ProviderConfig) { f.mu.Lock() defer f.mu.Unlock() if f.stopErr != nil { @@ -81,7 +90,7 @@ type testProviderConfigController struct { stopCh chan struct{} manager *fakeProviderConfigControllersManager - pcController *ProviderConfigController + pcController *providerConfigController pcClient *fakeproviderconfigclient.Clientset pcInformer cache.SharedIndexInformer } @@ -96,7 +105,7 @@ func newTestProviderConfigController(t *testing.T) *testProviderConfigController stopCh := make(chan struct{}) logger := klog.TODO() - ctrl := NewProviderConfigController( + ctrl := newProviderConfigController( fakeManager, providerConfigInformer, stopCh, @@ -119,7 +128,7 @@ func newTestProviderConfigController(t *testing.T) *testProviderConfigController } } -func addProviderConfig(t *testing.T, tc *testProviderConfigController, pc *providerconfig.ProviderConfig) { +func addProviderConfig(t *testing.T, tc *testProviderConfigController, pc *providerconfigv1.ProviderConfig) { t.Helper() _, err := tc.pcClient.CloudV1().ProviderConfigs().Create(context.TODO(), pc, metav1.CreateOptions{}) if err != nil { @@ -131,7 +140,7 @@ func addProviderConfig(t *testing.T, tc *testProviderConfigController, pc *provi } } -func updateProviderConfig(t *testing.T, tc *testProviderConfigController, pc *providerconfig.ProviderConfig) { +func updateProviderConfig(t *testing.T, tc *testProviderConfigController, pc *providerconfigv1.ProviderConfig) { t.Helper() _, err := tc.pcClient.CloudV1().ProviderConfigs().Update(context.TODO(), pc, metav1.UpdateOptions{}) if err != nil { @@ -144,16 +153,23 @@ func TestStartAndStop(t *testing.T) { tc := newTestProviderConfigController(t) // Start the controller in a separate goroutine - go tc.pcController.Run() + controllerDone := make(chan struct{}) + go func() { + tc.pcController.Run() + close(controllerDone) + }() // Let it run briefly, then stop - time.Sleep(200 * time.Millisecond) + time.Sleep(50 * 
time.Millisecond) close(tc.stopCh) // triggers stop - // Wait some time for graceful shutdown - time.Sleep(200 * time.Millisecond) - - // If no panic or deadlock => success + // Wait for graceful shutdown with timeout + select { + case <-controllerDone: + // Success - controller shut down gracefully + case <-time.After(1 * time.Second): + t.Fatal("Controller did not shut down within timeout") + } } func TestCreateDeleteProviderConfig(t *testing.T) { @@ -161,17 +177,18 @@ func TestCreateDeleteProviderConfig(t *testing.T) { go tc.pcController.Run() defer close(tc.stopCh) - pc := &providerconfig.ProviderConfig{ + pc := &providerconfigv1.ProviderConfig{ ObjectMeta: metav1.ObjectMeta{ Name: "pc-delete", }, } addProviderConfig(t, tc, pc) - time.Sleep(100 * time.Millisecond) - // Manager should have started it - if !tc.manager.HasStarted("pc-delete") { - t.Errorf("expected manager to have started 'pc-delete'") + // Poll for manager to start the controller + if !pollForCondition(func() bool { + return tc.manager.HasStarted("pc-delete") + }, 1*time.Second) { + t.Errorf("expected manager to have started 'pc-delete' within timeout") } if tc.manager.HasStopped("pc-delete") { t.Errorf("did not expect manager to have stopped 'pc-delete'") @@ -181,10 +198,12 @@ func TestCreateDeleteProviderConfig(t *testing.T) { pc2 := pc.DeepCopy() pc2.DeletionTimestamp = &metav1.Time{Time: time.Now()} updateProviderConfig(t, tc, pc2) - time.Sleep(100 * time.Millisecond) - if !tc.manager.HasStopped("pc-delete") { - t.Errorf("expected manager to stop 'pc-delete', but it didn't") + // Poll for manager to stop the controller + if !pollForCondition(func() bool { + return tc.manager.HasStopped("pc-delete") + }, 1*time.Second) { + t.Errorf("expected manager to stop 'pc-delete' within timeout, but it didn't") } } @@ -197,7 +216,9 @@ func TestSyncNonExistent(t *testing.T) { key := "some-ns/some-nonexistent" tc.pcController.providerConfigQueue.Enqueue(key) - time.Sleep(200 * time.Millisecond) + // Poll to ensure the queue has been processed + // We check that after a reasonable time, no starts or stops happened + time.Sleep(100 * time.Millisecond) // No starts or stops should have happened if len(tc.manager.startedConfigs) != 0 { @@ -217,7 +238,8 @@ func TestSyncBadObjectType(t *testing.T) { // Insert something that is not *ProviderConfig tc.pcInformer.GetIndexer().Add(&struct{ Name string }{Name: "not-a-pc"}) - time.Sleep(200 * time.Millisecond) + // Give time for queue to process the invalid object + time.Sleep(100 * time.Millisecond) if len(tc.manager.startedConfigs) != 0 { t.Errorf("did not expect manager starts with a non-ProviderConfig object") @@ -226,3 +248,90 @@ func TestSyncBadObjectType(t *testing.T) { t.Errorf("did not expect manager stops with a non-ProviderConfig object") } } + +// fakePanickingManager implements providerConfigControllerManager and panics on Start. +type fakePanickingManager struct { + panicOccurred bool + mu sync.Mutex +} + +func (f *fakePanickingManager) StartControllersForProviderConfig(pc *providerconfigv1.ProviderConfig) error { + f.mu.Lock() + f.panicOccurred = true + f.mu.Unlock() + panic("intentional panic for testing") +} + +func (f *fakePanickingManager) StopControllersForProviderConfig(pc *providerconfigv1.ProviderConfig) { + // no-op +} + +func (f *fakePanickingManager) didPanic() bool { + f.mu.Lock() + defer f.mu.Unlock() + return f.panicOccurred +} + +// TestPanicRecovery verifies that panics in sync are caught and don't crash the worker. 
+func TestPanicRecovery(t *testing.T) { + pcClient := fakeproviderconfigclient.NewSimpleClientset() + panicManager := &fakePanickingManager{} + providerConfigInformer := providerconfiginformers.NewSharedInformerFactory(pcClient, 0).Cloud().V1().ProviderConfigs().Informer() + stopCh := make(chan struct{}) + defer close(stopCh) + + logger := klog.TODO() + ctrl := newProviderConfigController( + panicManager, + providerConfigInformer, + stopCh, + logger, + ) + + go providerConfigInformer.Run(stopCh) + if !cache.WaitForCacheSync(stopCh, providerConfigInformer.HasSynced) { + t.Fatalf("Failed to sync caches") + } + + // Start controller in background + go ctrl.Run() + + // Create a ProviderConfig that will trigger the panic + pc := &providerconfigv1.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "panic-test", + }, + } + _, err := pcClient.CloudV1().ProviderConfigs().Create(context.TODO(), pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("failed to create ProviderConfig: %v", err) + } + providerConfigInformer.GetIndexer().Add(pc) + + // Poll to verify the panic occurred but didn't crash the controller + if !pollForCondition(func() bool { + return panicManager.didPanic() + }, 1*time.Second) { + t.Errorf("expected panic to occur within timeout") + } + + // Verify the controller is still running by adding another ProviderConfig + // If the worker crashed, this won't be processed + pc2 := &providerconfigv1.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "after-panic", + }, + } + _, err = pcClient.CloudV1().ProviderConfigs().Create(context.TODO(), pc2, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("failed to create second ProviderConfig: %v", err) + } + providerConfigInformer.GetIndexer().Add(pc2) + + // Give some time for the second item to be processed + // Since there are multiple workers, at least one should still be alive + time.Sleep(100 * time.Millisecond) + + // If we reach here without the test crashing, panic recovery worked + t.Log("Controller survived panic and continued processing") +} diff --git a/pkg/multiproject/framework/framework.go b/pkg/multiproject/framework/framework.go new file mode 100644 index 0000000000..91ab7cf361 --- /dev/null +++ b/pkg/multiproject/framework/framework.go @@ -0,0 +1,76 @@ +package framework + +import ( + "k8s.io/client-go/tools/cache" + providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" + providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" + "k8s.io/klog/v2" +) + +// ControllerStarter defines the interface for starting a controller for a ProviderConfig. +// Implementations encapsulate all controller-specific startup logic and dependencies. +type ControllerStarter interface { + // StartController starts controller(s) for the given ProviderConfig. + // Returns: + // - A channel that should be closed to stop the controller + // - An error if startup fails + // + // The returned stop channel will be closed by the framework when the + // ProviderConfig is deleted or the controller needs to shut down. + StartController(pc *providerconfig.ProviderConfig) (chan<- struct{}, error) +} + +// Framework provides a single entry point for managing ProviderConfig-scoped controllers. +// It orchestrates the lifecycle of controllers, ensuring proper startup, shutdown, and +// finalizer management for each ProviderConfig resource. +type Framework struct { + controller *providerConfigController +} + +// New creates a new Framework instance that manages controllers for ProviderConfig resources. 
+// It requires: +// - providerConfigClient: client for managing ProviderConfig resources +// - providerConfigInformer: informer for watching ProviderConfig changes +// - finalizerName: name of the finalizer to add/remove on ProviderConfigs +// - controllerStarter: implementation that knows how to start controller for a ProviderConfig +// - stopCh: channel to signal shutdown +// - logger: logger for framework operations +// +// The framework will automatically: +// - Add/remove finalizers on ProviderConfig resources +// - Start controllers idempotently (only once per ProviderConfig) +// - Stop controllers when ProviderConfig is deleted +// - Handle errors and rollback finalizers if startup fails +func New( + providerConfigClient providerconfigclient.Interface, + providerConfigInformer cache.SharedIndexInformer, + finalizerName string, + controllerStarter ControllerStarter, + stopCh <-chan struct{}, + logger klog.Logger, +) *Framework { + mgr := newManager( + providerConfigClient, + finalizerName, + controllerStarter, + logger, + ) + + ctrl := newProviderConfigController(mgr, providerConfigInformer, stopCh, logger) + + return &Framework{ + controller: ctrl, + } +} + +// Run starts the framework and blocks until the stop channel is closed. +// It will: +// - Wait for the ProviderConfig informer to sync +// - Start processing ProviderConfig resources +// - Handle Add/Update/Delete events for ProviderConfigs +// - Block until stopCh is closed +// +// The stop channel (provided to New) is used for graceful shutdown coordination. +func (f *Framework) Run() { + f.controller.Run() +} diff --git a/pkg/multiproject/framework/manager.go b/pkg/multiproject/framework/manager.go new file mode 100644 index 0000000000..0f4ee2b12d --- /dev/null +++ b/pkg/multiproject/framework/manager.go @@ -0,0 +1,175 @@ +package framework + +import ( + "context" + "fmt" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" + "k8s.io/ingress-gce/pkg/multiproject/common/finalizer" + providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" + "k8s.io/klog/v2" +) + +// manager coordinates lifecycle of controllers scoped to individual ProviderConfigs. +// It ensures per-ProviderConfig controller startup is idempotent, adds/removes +// finalizers, and wires stop channels for clean shutdown. +type manager struct { + controllers *ControllerMap + + logger klog.Logger + providerConfigClient providerconfigclient.Interface + finalizerName string + controllerStarter ControllerStarter +} + +// newManager constructs a new generic ProviderConfig controller manager. +// It does not start any controllers until StartControllersForProviderConfig is invoked. +func newManager( + providerConfigClient providerconfigclient.Interface, + finalizerName string, + controllerStarter ControllerStarter, + logger klog.Logger, +) *manager { + return &manager{ + controllers: NewControllerMap(), + logger: logger, + providerConfigClient: providerConfigClient, + finalizerName: finalizerName, + controllerStarter: controllerStarter, + } +} + +// providerConfigKey returns the key for a ProviderConfig in the controller map. +func providerConfigKey(pc *providerconfig.ProviderConfig) string { + return pc.Name +} + +// latestPCWithCleanupFinalizer returns the latest ProviderConfig from the API server. +// If the Get fails, it returns a local copy of the provided pc with the cleanup +// finalizer appended to ensure a subsequent delete attempt is not a no-op. 
+func (m *manager) latestPCWithCleanupFinalizer(pc *providerconfig.ProviderConfig) *providerconfig.ProviderConfig { + pcLatest, err := m.providerConfigClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) + if err != nil { + pcCopy := pc.DeepCopy() + if !finalizer.HasGivenFinalizer(pcCopy.ObjectMeta, m.finalizerName) { + pcCopy.Finalizers = append(pcCopy.Finalizers, m.finalizerName) + } + return pcCopy + } + return pcLatest +} + +// rollbackFinalizerOnStartFailure removes the finalizer after a start failure +// so that ProviderConfig deletion is not blocked. +func (m *manager) rollbackFinalizerOnStartFailure(pc *providerconfig.ProviderConfig, logger klog.Logger, cause error) { + pcLatest := m.latestPCWithCleanupFinalizer(pc) + if err := finalizer.DeleteProviderConfigFinalizer(pcLatest, m.finalizerName, m.providerConfigClient, logger); err != nil { + logger.Error(err, "failed to clean up finalizer after start failure", "finalizer", m.finalizerName, "originalError", cause) + } +} + +// StartControllersForProviderConfig ensures finalizers are present and starts +// the controllers associated with the given ProviderConfig. The call is +// idempotent per ProviderConfig: concurrent or repeated calls for the same +// ProviderConfig will only start controllers once. +func (m *manager) StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error { + logger := m.logger.WithValues("providerConfigName", pc.Name) + + // Don't start controllers for ProviderConfigs that are being deleted. + if pc.DeletionTimestamp != nil { + logger.V(3).Info("ProviderConfig is terminating; skipping start") + return nil + } + + pcKey := providerConfigKey(pc) + + logger.Info("Starting controllers for provider config") + + cs, existed := m.controllers.GetOrCreate(pcKey) + + cs.mu.Lock() + defer cs.mu.Unlock() + + // Re-validate: ensure the map still points to this cs before we proceed. + if cur, ok := m.controllers.Get(pcKey); !ok || cur != cs { + logger.V(3).Info("ControllerSet is no longer current; aborting start") + return nil + } + + // Idempotent start: already running. + if cs.stopCh != nil { + logger.Info("Controllers for provider config already exist, skipping start") + return nil + } + + // Ensure finalizer before starting. + err := finalizer.EnsureProviderConfigFinalizer(pc, m.finalizerName, m.providerConfigClient, logger) + if err != nil { + // If we created this cs entry, remove it while still holding cs.mu, + // but only if the map still points to this cs. + if !existed { + _ = m.controllers.DeleteIfSame(pcKey, cs) + } + // No more access to cs after this point from other goroutines due to revalidation. + return fmt.Errorf("failed to ensure finalizer %s for provider config %s: %w", m.finalizerName, pcKey, err) + } + + controllerStopCh, err := m.controllerStarter.StartController(pc) + if err != nil { + if !existed { + _ = m.controllers.DeleteIfSame(pcKey, cs) + } + // If startup fails, make a best-effort to roll back the finalizer. + m.rollbackFinalizerOnStartFailure(pc, logger, err) + return fmt.Errorf("failed to start controller for provider config %s: %w", pcKey, err) + } + + cs.stopCh = controllerStopCh + + logger.Info("Started controllers for provider config") + return nil +} + +// StopControllersForProviderConfig stops the controllers for the given ProviderConfig +// and removes the associated finalizer. 
Finalizer removal is attempted even if no +// controller mapping exists, ensuring deletion can proceed after process restarts +// or when controllers were previously stopped. +func (m *manager) StopControllersForProviderConfig(pc *providerconfig.ProviderConfig) { + logger := m.logger.WithValues("providerConfigName", pc.Name) + + csKey := providerConfigKey(pc) + + // Try to stop any running controller if we still track one. + if cs, exists := m.controllers.Get(csKey); exists { + cs.mu.Lock() + stopCh := cs.stopCh + + // Already stopped; clean up the mapping if it still points to this cs. + if stopCh == nil { + _ = m.controllers.DeleteIfSame(csKey, cs) + cs.mu.Unlock() + logger.Info("Controllers for provider config already stopped") + } else { + // Transition to 'stopped' state under lock and remove mapping. + cs.stopCh = nil + _ = m.controllers.DeleteIfSame(csKey, cs) + cs.mu.Unlock() + // Now it is safe to close the stop channel. + close(stopCh) + logger.Info("Signaled controller stop") + } + } else { + logger.Info("Controllers for provider config do not exist") + } + + // Ensure we remove the finalizer from the latest object state. + // This happens regardless of whether a controller was running, ensuring + // deletion can proceed even if the controller mapping is gone. + pcLatest := m.latestPCWithCleanupFinalizer(pc) + err := finalizer.DeleteProviderConfigFinalizer(pcLatest, m.finalizerName, m.providerConfigClient, logger) + if err != nil { + logger.Error(err, "failed to delete finalizer for provider config", "finalizer", m.finalizerName) + } + logger.Info("Stopped controllers for provider config") +} diff --git a/pkg/multiproject/framework/manager_test.go b/pkg/multiproject/framework/manager_test.go new file mode 100644 index 0000000000..9050281ec4 --- /dev/null +++ b/pkg/multiproject/framework/manager_test.go @@ -0,0 +1,703 @@ +package framework + +import ( + "context" + "fmt" + "slices" + "sync" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" + "k8s.io/ingress-gce/pkg/multiproject/common/finalizer" + providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake" + "k8s.io/klog/v2/ktesting" +) + +// mockControllerStarter is a mock implementation of ControllerStarter for testing. 
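Both the start and stop paths rely on a ControllerMap offering GetOrCreate and DeleteIfSame; the revised implementation is not part of this hunk. A sketch consistent with how manager.go calls it above (field and method shapes are inferred, not copied from the patch) might look like:

```go
package framework

import "sync"

// controllerSet tracks one ProviderConfig's controllers. stopCh is non-nil
// only while the controllers are running; mu guards state transitions.
type controllerSet struct {
	mu     sync.Mutex
	stopCh chan<- struct{}
}

// ControllerMap is a thread-safe map of ProviderConfig key -> controllerSet.
type ControllerMap struct {
	mu   sync.Mutex
	data map[string]*controllerSet
}

// NewControllerMap creates an empty, ready-to-use ControllerMap.
func NewControllerMap() *ControllerMap {
	return &ControllerMap{data: make(map[string]*controllerSet)}
}

// GetOrCreate returns the entry for key, inserting a fresh one if absent.
// The second result reports whether the entry already existed.
func (cm *ControllerMap) GetOrCreate(key string) (*controllerSet, bool) {
	cm.mu.Lock()
	defer cm.mu.Unlock()
	if cs, ok := cm.data[key]; ok {
		return cs, true
	}
	cs := &controllerSet{}
	cm.data[key] = cs
	return cs, false
}

// Get returns the entry for key, if any.
func (cm *ControllerMap) Get(key string) (*controllerSet, bool) {
	cm.mu.Lock()
	defer cm.mu.Unlock()
	cs, ok := cm.data[key]
	return cs, ok
}

// DeleteIfSame removes key only if it still maps to cs, avoiding races where
// a concurrent caller has already replaced the entry with a new one.
func (cm *ControllerMap) DeleteIfSame(key string, cs *controllerSet) bool {
	cm.mu.Lock()
	defer cm.mu.Unlock()
	if cur, ok := cm.data[key]; ok && cur == cs {
		delete(cm.data, key)
		return true
	}
	return false
}
```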
+type mockControllerStarter struct { + mu sync.Mutex + startCalls int + shouldFailStart bool + startedControllers map[string]chan<- struct{} + startCounts map[string]int +} + +func newMockControllerStarter() *mockControllerStarter { + return &mockControllerStarter{ + startedControllers: make(map[string]chan<- struct{}), + startCounts: make(map[string]int), + } +} + +func (m *mockControllerStarter) StartController(pc *providerconfig.ProviderConfig) (chan<- struct{}, error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.startCalls++ + m.startCounts[pc.Name]++ + + if m.shouldFailStart { + return nil, fmt.Errorf("mock start failure") + } + + stopCh := make(chan struct{}) + m.startedControllers[pc.Name] = stopCh + return stopCh, nil +} + +func (m *mockControllerStarter) getStartCallCount() int { + m.mu.Lock() + defer m.mu.Unlock() + return m.startCalls +} + +func (m *mockControllerStarter) getStartCountForKey(key string) int { + m.mu.Lock() + defer m.mu.Unlock() + return m.startCounts[key] +} + +func createTestProviderConfig(name string) *providerconfig.ProviderConfig { + return &providerconfig.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: providerconfig.ProviderConfigSpec{ + ProjectID: "test-project", + }, + } +} + +// TestManagerStartIdempotent verifies that starting the same controller multiple times is idempotent. +func TestManagerStartIdempotent(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + pc := createTestProviderConfig("test-pc") + + // Create the ProviderConfig in the fake client + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + // First start should succeed + err = manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("First start failed: %v", err) + } + + // Second start should be idempotent (no error, no additional controller started) + err = manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("Second start failed: %v", err) + } + + // Should only have started once + if mockStarter.getStartCallCount() != 1 { + t.Errorf("Expected 1 start call, got %d", mockStarter.getStartCallCount()) + } +} + +// TestManagerStartAddsFinalizerBeforeControllerStarts verifies that the finalizer +// is added before the controller starts. 
+func TestManagerStartAddsFinalizerBeforeControllerStarts(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + finalizerName := "test-finalizer" + manager := newManager( + client, + finalizerName, + mockStarter, + logger, + ) + + pc := createTestProviderConfig("test-pc") + + // Create the ProviderConfig in the fake client + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + err = manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Verify finalizer was added + updatedPC, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated ProviderConfig: %v", err) + } + + if !finalizer.HasGivenFinalizer(updatedPC.ObjectMeta, finalizerName) { + t.Errorf("Finalizer %s was not added to ProviderConfig", finalizerName) + } +} + +// TestManagerStartFailureRollsBackFinalizer verifies that if controller startup fails, +// the finalizer is rolled back. +func TestManagerStartFailureRollsBackFinalizer(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + mockStarter.shouldFailStart = true + + finalizerName := "test-finalizer" + manager := newManager( + client, + finalizerName, + mockStarter, + logger, + ) + + pc := createTestProviderConfig("test-pc") + + // Create the ProviderConfig in the fake client + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + // Start should fail + err = manager.StartControllersForProviderConfig(pc) + if err == nil { + t.Fatal("Expected start to fail, but it succeeded") + } + + // Verify finalizer was rolled back + updatedPC, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated ProviderConfig: %v", err) + } + + if finalizer.HasGivenFinalizer(updatedPC.ObjectMeta, finalizerName) { + t.Errorf("Finalizer %s was not rolled back after start failure", finalizerName) + } +} + +// TestManagerStopRemovesFinalizer verifies that stopping a controller removes the finalizer. 
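The finalizer helpers these tests exercise live in pkg/multiproject/common/finalizer and are not shown in this diff. A plausible sketch of the check/ensure/delete trio, assuming straightforward Update-based writes (the real helpers may patch instead of update), is:

```go
package finalizer

import (
	"context"
	"slices"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1"
	providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned"
	"k8s.io/klog/v2"
)

// HasGivenFinalizer reports whether the object carries the named finalizer.
func HasGivenFinalizer(meta metav1.ObjectMeta, name string) bool {
	return slices.Contains(meta.Finalizers, name)
}

// EnsureProviderConfigFinalizer adds the finalizer if it is missing and
// writes the object back. Sketch only; signature matches the calls in manager.go.
func EnsureProviderConfigFinalizer(pc *providerconfig.ProviderConfig, name string, client providerconfigclient.Interface, logger klog.Logger) error {
	if HasGivenFinalizer(pc.ObjectMeta, name) {
		return nil
	}
	updated := pc.DeepCopy()
	updated.Finalizers = append(updated.Finalizers, name)
	_, err := client.CloudV1().ProviderConfigs().Update(context.Background(), updated, metav1.UpdateOptions{})
	return err
}

// DeleteProviderConfigFinalizer removes the finalizer if present and writes
// the object back. Sketch only, same caveat as above.
func DeleteProviderConfigFinalizer(pc *providerconfig.ProviderConfig, name string, client providerconfigclient.Interface, logger klog.Logger) error {
	if !HasGivenFinalizer(pc.ObjectMeta, name) {
		return nil
	}
	updated := pc.DeepCopy()
	updated.Finalizers = slices.DeleteFunc(updated.Finalizers, func(f string) bool { return f == name })
	_, err := client.CloudV1().ProviderConfigs().Update(context.Background(), updated, metav1.UpdateOptions{})
	return err
}
```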
+func TestManagerStopRemovesFinalizer(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + finalizerName := "test-finalizer" + manager := newManager( + client, + finalizerName, + mockStarter, + logger, + ) + + pc := createTestProviderConfig("test-pc") + + // Create the ProviderConfig in the fake client + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + // Start the controller + err = manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Verify finalizer exists + updatedPC, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get updated ProviderConfig: %v", err) + } + if !finalizer.HasGivenFinalizer(updatedPC.ObjectMeta, finalizerName) { + t.Fatal("Finalizer was not added") + } + + // Stop the controller + manager.StopControllersForProviderConfig(updatedPC) + + // Verify finalizer was removed + finalPC, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get final ProviderConfig: %v", err) + } + + if finalizer.HasGivenFinalizer(finalPC.ObjectMeta, finalizerName) { + t.Errorf("Finalizer %s was not removed after stop", finalizerName) + } +} + +// TestManagerStopIdempotent verifies that stopping a non-existent controller is safe. +func TestManagerStopIdempotent(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + pc := createTestProviderConfig("test-pc") + + // Create the ProviderConfig in the fake client + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + // Stop without start should not panic + manager.StopControllersForProviderConfig(pc) + + // Double stop should also be safe + manager.StopControllersForProviderConfig(pc) +} + +// TestManagerStopRemovesFinalizerWhenNoControllerExists verifies the critical bug fix: +// StopControllersForProviderConfig must remove the finalizer even when no controller +// mapping exists (e.g., after process restart or if controller was never started). +// This ensures ProviderConfig deletion can proceed instead of stalling indefinitely. +func TestManagerStopRemovesFinalizerWhenNoControllerExists(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + finalizerName := "test-finalizer" + manager := newManager( + client, + finalizerName, + mockStarter, + logger, + ) + + // Create a ProviderConfig with the finalizer already present. + // This simulates a scenario where: + // 1. The controller previously started and added the finalizer + // 2. The process restarted, losing the in-memory controller mapping + // 3. 
The ProviderConfig is now being deleted + pc := createTestProviderConfig("test-pc") + pc.Finalizers = []string{finalizerName} + + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create test ProviderConfig: %v", err) + } + + // Verify finalizer exists + pcBefore, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get ProviderConfig: %v", err) + } + if !finalizer.HasGivenFinalizer(pcBefore.ObjectMeta, finalizerName) { + t.Fatal("Finalizer was not present on initial ProviderConfig") + } + + // Critical test: Call Stop WITHOUT ever calling Start. + // This means no controller mapping exists in the manager. + // Before the fix, this would return early and never remove the finalizer. + // After the fix, it should remove the finalizer regardless. + manager.StopControllersForProviderConfig(pcBefore) + + // Verify finalizer was removed even though no controller existed + pcAfter, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get ProviderConfig after stop: %v", err) + } + + if finalizer.HasGivenFinalizer(pcAfter.ObjectMeta, finalizerName) { + t.Errorf("Finalizer %s was NOT removed when no controller existed - THIS IS THE BUG", finalizerName) + } +} + +// TestManagerMultipleProviderConfigs verifies that multiple ProviderConfigs can be managed independently. +func TestManagerMultipleProviderConfigs(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + pc1 := createTestProviderConfig("pc-1") + pc2 := createTestProviderConfig("pc-2") + pc3 := createTestProviderConfig("pc-3") + + // Create all ProviderConfigs + for _, pc := range []*providerconfig.ProviderConfig{pc1, pc2, pc3} { + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create ProviderConfig %s: %v", pc.Name, err) + } + } + + // Start all controllers + testCases := []struct { + pc *providerconfig.ProviderConfig + }{ + {pc: pc1}, + {pc: pc2}, + {pc: pc3}, + } + + for _, tc := range testCases { + err := manager.StartControllersForProviderConfig(tc.pc) + if err != nil { + t.Fatalf("Failed to start controller for %s: %v", tc.pc.Name, err) + } + } + + // Verify all controllers started + if mockStarter.getStartCallCount() != 3 { + t.Errorf("Expected 3 start calls, got %d", mockStarter.getStartCallCount()) + } + + // Stop one controller + manager.StopControllersForProviderConfig(pc2) + + // Start pc2 again + err := manager.StartControllersForProviderConfig(pc2) + if err != nil { + t.Fatalf("Failed to restart controller for pc-2: %v", err) + } + + // Should have 4 total starts now + if mockStarter.getStartCallCount() != 4 { + t.Errorf("Expected 4 start calls after restart, got %d", mockStarter.getStartCallCount()) + } +} + +// TestManagerConcurrentStartStop verifies concurrent start/stop operations are safe. 
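The stop handoff used throughout the manager relies on Go's send-only channel close pattern: StartController returns a chan<- struct{}, and the manager stops the controller by closing it. A minimal, self-contained illustration of a worker honoring that contract (the tick loop and timings are invented for the example):

```go
package main

import (
	"fmt"
	"time"
)

// startWorker launches a worker goroutine and returns the send-only stop
// handle, mirroring the ControllerStarter contract: the caller stops the
// worker by closing the returned channel.
func startWorker() chan<- struct{} {
	stopCh := make(chan struct{})
	go func() {
		ticker := time.NewTicker(10 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				fmt.Println("worker: stop signaled, exiting")
				return
			case <-ticker.C:
				// Reconciliation work would happen here.
			}
		}
	}()
	return stopCh
}

func main() {
	stop := startWorker()
	time.Sleep(50 * time.Millisecond)
	close(stop) // closing a send-only channel is legal and stops the worker
	time.Sleep(50 * time.Millisecond)
}
```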
+func TestManagerConcurrentStartStop(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + const numGoroutines = 10 + const numProviderConfigs = 5 + + // Create test ProviderConfigs + providerConfigs := make([]*providerconfig.ProviderConfig, numProviderConfigs) + for i := 0; i < numProviderConfigs; i++ { + pc := createTestProviderConfig(fmt.Sprintf("pc-%d", i)) + providerConfigs[i] = pc + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create ProviderConfig: %v", err) + } + } + + var wg sync.WaitGroup + wg.Add(numGoroutines) + + // Concurrently start and stop controllers + for i := 0; i < numGoroutines; i++ { + go func(idx int) { + defer wg.Done() + pc := providerConfigs[idx%numProviderConfigs] + + // Try to start + _ = manager.StartControllersForProviderConfig(pc) + + // Try to stop + manager.StopControllersForProviderConfig(pc) + }(i) + } + + wg.Wait() + + // Should not panic and should handle concurrent operations gracefully + t.Log("Concurrent operations completed without panic") +} + +// TestManagerConcurrentStartSameProviderConfig verifies concurrent start attempts on the same ProviderConfig +// only result in a single controller start. +func TestManagerConcurrentStartSameProviderConfig(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + pc := createTestProviderConfig("shared-pc") + + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create ProviderConfig: %v", err) + } + + const numGoroutines = 50 + var wg sync.WaitGroup + wg.Add(numGoroutines) + + for i := 0; i < numGoroutines; i++ { + go func() { + defer wg.Done() + err := manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Errorf("StartControllersForProviderConfig failed: %v", err) + } + }() + } + + wg.Wait() + + startCount := mockStarter.getStartCountForKey(pc.Name) + if startCount != 1 { + t.Fatalf("Expected exactly one controller start, got %d", startCount) + } + + updatedPC, err := client.CloudV1().ProviderConfigs().Get(ctx, pc.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to retrieve ProviderConfig: %v", err) + } + manager.StopControllersForProviderConfig(updatedPC) +} + +// TestLatestPCWithCleanupFinalizerGetFailure verifies that when API Get fails, +// latestPCWithCleanupFinalizer returns a copy with the finalizer without duplicates. 
+func TestLatestPCWithCleanupFinalizerGetFailure(t *testing.T) { + logger, _ := ktesting.NewTestContext(t) + // Use an empty clientset so Get will fail + client := providerconfigclient.NewSimpleClientset() + mockStarter := newMockControllerStarter() + + finalizerName := "test-finalizer" + manager := newManager( + client, + finalizerName, + mockStarter, + logger, + ) + + testCases := []struct { + name string + initialFinalizers []string + expectedFinalizers []string + }{ + { + name: "no existing finalizers", + initialFinalizers: nil, + expectedFinalizers: []string{finalizerName}, + }, + { + name: "has other finalizers", + initialFinalizers: []string{"other-finalizer"}, + expectedFinalizers: []string{"other-finalizer", finalizerName}, + }, + { + name: "already has cleanup finalizer", + initialFinalizers: []string{finalizerName}, + expectedFinalizers: []string{finalizerName}, + }, + { + name: "has both cleanup and other finalizers", + initialFinalizers: []string{"other-finalizer", finalizerName}, + expectedFinalizers: []string{"other-finalizer", finalizerName}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + pc := &providerconfig.ProviderConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pc", + Finalizers: tc.initialFinalizers, + }, + } + + // Call latestPCWithCleanupFinalizer - Get will fail since PC doesn't exist + result := manager.latestPCWithCleanupFinalizer(pc) + + // Verify result has the expected finalizers + if len(result.Finalizers) != len(tc.expectedFinalizers) { + t.Errorf("expected %d finalizers, got %d: %v", len(tc.expectedFinalizers), len(result.Finalizers), result.Finalizers) + } + + for _, expectedFinalizer := range tc.expectedFinalizers { + if !slices.Contains(result.Finalizers, expectedFinalizer) { + t.Errorf("expected finalizer %s not found in result: %v", expectedFinalizer, result.Finalizers) + } + } + + // Verify no duplicate finalizers + seen := make(map[string]bool) + for _, f := range result.Finalizers { + if seen[f] { + t.Errorf("duplicate finalizer found: %s", f) + } + seen[f] = true + } + }) + } +} + +// failOnceMockStarter is a mock ControllerStarter that fails on the first attempt +// and succeeds on subsequent attempts for each ProviderConfig. +type failOnceMockStarter struct { + mu sync.Mutex + attemptCounts map[string]int + startedControllers map[string]chan<- struct{} +} + +func newFailOnceMockStarter() *failOnceMockStarter { + return &failOnceMockStarter{ + attemptCounts: make(map[string]int), + startedControllers: make(map[string]chan<- struct{}), + } +} + +func (m *failOnceMockStarter) StartController(pc *providerconfig.ProviderConfig) (chan<- struct{}, error) { + m.mu.Lock() + defer m.mu.Unlock() + + m.attemptCounts[pc.Name]++ + + // Fail on first attempt + if m.attemptCounts[pc.Name] == 1 { + return nil, fmt.Errorf("mock start failure on first attempt") + } + + // Succeed on subsequent attempts + stopCh := make(chan struct{}) + m.startedControllers[pc.Name] = stopCh + return stopCh, nil +} + +func (m *failOnceMockStarter) isStarted(key string) bool { + m.mu.Lock() + defer m.mu.Unlock() + _, started := m.startedControllers[key] + return started +} + +// TestManagerConcurrentStartWithFailureDoesNotLeakControllers verifies that when +// concurrent Start calls race with a failing Start, we don't leak running controllers. +// This is a regression test for the race condition where unlock happened before delete, +// allowing another goroutine to start a controller that would then be orphaned. 
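For orientation before the test body, here are the two orderings at stake, expressed against the ControllerMap sketched earlier; buggyCleanup and fixedCleanup are illustrative names, not functions in this patch, and both assume cs.mu is held on entry:

```go
package framework

// buggyCleanup shows the pre-fix ordering: the entry lock is released before
// the map entry is removed, leaving a window in which a concurrent Start can
// GetOrCreate the same stale entry, start a controller on it, and then have
// the delete orphan that controller.
func buggyCleanup(m *ControllerMap, key string, cs *controllerSet) {
	cs.mu.Unlock()
	m.DeleteIfSame(key, cs) // too late: another Start may already own cs
}

// fixedCleanup shows the post-fix ordering used in manager.go: delete while
// still holding cs.mu, and only if the map still points at this exact cs,
// so no concurrent Start can interleave between the two steps.
func fixedCleanup(m *ControllerMap, key string, cs *controllerSet) {
	_ = m.DeleteIfSame(key, cs)
	cs.mu.Unlock()
}
```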
+func TestManagerConcurrentStartWithFailureDoesNotLeakControllers(t *testing.T) { + ctx := context.TODO() + logger, _ := ktesting.NewTestContext(t) + client := providerconfigclient.NewSimpleClientset() + mockStarter := newFailOnceMockStarter() + + manager := newManager( + client, + "test-finalizer", + mockStarter, + logger, + ) + + pc := createTestProviderConfig("race-test-pc") + + _, err := client.CloudV1().ProviderConfigs().Create(ctx, pc, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Failed to create ProviderConfig: %v", err) + } + + // Launch two concurrent Start attempts. + // Due to the failOnceMockStarter, the first attempt will fail. + // The key test is that we don't leak controllers when racing start/failure. + var wg sync.WaitGroup + wg.Add(2) + + errors := make([]error, 2) + + go func() { + defer wg.Done() + errors[0] = manager.StartControllersForProviderConfig(pc) + }() + + go func() { + defer wg.Done() + errors[1] = manager.StartControllersForProviderConfig(pc) + }() + + wg.Wait() + + // Due to the race and revalidation, both might fail/abort in this specific scenario. + // What matters is that if a controller was started, it's properly tracked. + // If both failed, we should be able to start successfully now. + if errors[0] != nil && errors[1] != nil { + // Both failed/aborted - this can happen due to revalidation race. + // Try starting again, should succeed now. + err = manager.StartControllersForProviderConfig(pc) + if err != nil { + t.Fatalf("Expected final start to succeed after concurrent failures, got: %v", err) + } + } + + // Critical assertion: verify controller state is consistent + isStarted := mockStarter.isStarted(pc.Name) + cs, exists := manager.controllers.Get(providerConfigKey(pc)) + + // Both should agree: either controller is running and tracked, or neither + if isStarted && !exists { + t.Fatal("Controller was started but not tracked in manager - this indicates a leaked controller (THE BUG)") + } + if !isStarted && exists && cs.stopCh != nil { + t.Fatal("Controller is tracked as running but was never started - inconsistent state") + } + if isStarted && exists && cs.stopCh == nil { + t.Fatal("Controller was started and tracked but stopCh is nil - inconsistent state") + } + + // If controller is running, verify it's properly tracked + if isStarted { + if !exists { + t.Fatal("Started controller is not tracked") + } + if cs.stopCh == nil { + t.Fatal("Started controller has nil stopCh") + } + } + + // Clean up if controller is running + if isStarted { + manager.StopControllersForProviderConfig(pc) + + // Verify proper cleanup + _, exists = manager.controllers.Get(providerConfigKey(pc)) + if exists { + t.Error("Controller entry still exists after stop") + } + } +} diff --git a/pkg/multiproject/manager/controller_map.go b/pkg/multiproject/manager/controller_map.go deleted file mode 100644 index b75cae2367..0000000000 --- a/pkg/multiproject/manager/controller_map.go +++ /dev/null @@ -1,45 +0,0 @@ -package manager - -import "sync" - -// ControllerMap is a thread-safe map for storing ControllerSet instances. -// It uses fine-grained locking to allow concurrent operations on different keys. -type ControllerMap struct { - mu sync.Mutex - data map[string]*controllerSet -} - -// NewControllerMap creates a new thread-safe ControllerMap. -func NewControllerMap() *ControllerMap { - return &ControllerMap{ - data: make(map[string]*controllerSet), - } -} - -// Get retrieves a ControllerSet for the given key. 
-// Returns the ControllerSet and a boolean indicating whether it exists. -func (cm *ControllerMap) Get(key string) (*controllerSet, bool) { - cm.mu.Lock() - defer cm.mu.Unlock() - cs, exists := cm.data[key] - return cs, exists -} - -// Set stores a ControllerSet for the given key. -func (cm *ControllerMap) Set(key string, cs *controllerSet) { - cm.mu.Lock() - defer cm.mu.Unlock() - cm.data[key] = cs -} - -// Delete removes and returns a ControllerSet for the given key. -// Returns the ControllerSet and a boolean indicating whether it existed. -func (cm *ControllerMap) Delete(key string) (*controllerSet, bool) { - cm.mu.Lock() - defer cm.mu.Unlock() - cs, exists := cm.data[key] - if exists { - delete(cm.data, key) - } - return cs, exists -} diff --git a/pkg/multiproject/manager/manager.go b/pkg/multiproject/manager/manager.go deleted file mode 100644 index 2f96cf5dea..0000000000 --- a/pkg/multiproject/manager/manager.go +++ /dev/null @@ -1,208 +0,0 @@ -package manager - -import ( - "context" - "fmt" - - networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" - nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" - "k8s.io/ingress-gce/pkg/multiproject/finalizer" - "k8s.io/ingress-gce/pkg/multiproject/gce" - multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" - "k8s.io/ingress-gce/pkg/multiproject/neg" - syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" - "k8s.io/ingress-gce/pkg/neg/syncers/labels" - providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" - svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - "k8s.io/ingress-gce/pkg/utils/namer" - "k8s.io/klog/v2" -) - -// startNEGController is a package-level variable to allow tests to stub the -// actual NEG controller startup routine. -var startNEGController = neg.StartNEGController - -// ProviderConfigControllersManager coordinates lifecycle of controllers scoped to -// a single ProviderConfig. It ensures per-ProviderConfig controller startup is -// idempotent, adds/removes the NEG cleanup finalizer, and wires a stop channel -// for clean shutdown. -type ProviderConfigControllersManager struct { - controllers *ControllerMap - - logger klog.Logger - providerConfigClient providerconfigclient.Interface - // Base informers shared across all ProviderConfigs - informers *multiprojectinformers.InformerSet - kubeClient kubernetes.Interface - svcNegClient svcnegclient.Interface - eventRecorderClient kubernetes.Interface - networkClient networkclient.Interface - nodetopologyClient nodetopologyclient.Interface - kubeSystemUID types.UID - clusterNamer *namer.Namer - l4Namer *namer.L4Namer - lpConfig labels.PodLabelPropagationConfig - gceCreator gce.GCECreator - globalStopCh <-chan struct{} - syncerMetrics *syncMetrics.SyncerMetrics -} - -// controllerSet holds controller-specific resources for a ProviderConfig. -type controllerSet struct { - stopCh chan<- struct{} -} - -// NewProviderConfigControllerManager constructs a new ProviderConfigControllersManager. -// It does not start any controllers until StartControllersForProviderConfig is invoked. 
-func NewProviderConfigControllerManager( - kubeClient kubernetes.Interface, - informers *multiprojectinformers.InformerSet, - providerConfigClient providerconfigclient.Interface, - svcNegClient svcnegclient.Interface, - networkClient networkclient.Interface, - nodetopologyClient nodetopologyclient.Interface, - eventRecorderClient kubernetes.Interface, - kubeSystemUID types.UID, - clusterNamer *namer.Namer, - l4Namer *namer.L4Namer, - lpConfig labels.PodLabelPropagationConfig, - gceCreator gce.GCECreator, - globalStopCh <-chan struct{}, - logger klog.Logger, - syncerMetrics *syncMetrics.SyncerMetrics, -) *ProviderConfigControllersManager { - return &ProviderConfigControllersManager{ - controllers: NewControllerMap(), - logger: logger, - providerConfigClient: providerConfigClient, - informers: informers, - kubeClient: kubeClient, - svcNegClient: svcNegClient, - eventRecorderClient: eventRecorderClient, - networkClient: networkClient, - nodetopologyClient: nodetopologyClient, - kubeSystemUID: kubeSystemUID, - clusterNamer: clusterNamer, - l4Namer: l4Namer, - lpConfig: lpConfig, - gceCreator: gceCreator, - globalStopCh: globalStopCh, - syncerMetrics: syncerMetrics, - } -} - -func providerConfigKey(pc *providerconfig.ProviderConfig) string { - return pc.Name -} - -// StartControllersForProviderConfig ensures finalizers are present and starts -// the controllers associated with the given ProviderConfig. The call is -// idempotent per ProviderConfig: concurrent or repeated calls for the same -// ProviderConfig will only start controllers once. -func (pccm *ProviderConfigControllersManager) StartControllersForProviderConfig(pc *providerconfig.ProviderConfig) error { - logger := pccm.logger.WithValues("providerConfigId", pc.Name) - - pcKey := providerConfigKey(pc) - - logger.Info("Starting controllers for provider config") - - // Check if controller already exists - if _, exists := pccm.controllers.Get(pcKey); exists { - logger.Info("Controllers for provider config already exist, skipping start") - return nil - } - - err := finalizer.EnsureProviderConfigNEGCleanupFinalizer(pc, pccm.providerConfigClient, logger) - if err != nil { - return fmt.Errorf("failed to ensure NEG cleanup finalizer for project %s: %w", pcKey, err) - } - - cloud, err := pccm.gceCreator.GCEForProviderConfig(pc, logger) - if err != nil { - // If GCE client creation fails after finalizer was added, roll it back. - pccm.rollbackFinalizerOnStartFailure(pc, logger, err) - return fmt.Errorf("failed to create GCE client for provider config %+v: %w", pc, err) - } - - negControllerStopCh, err := startNEGController( - pccm.informers, - pccm.kubeClient, - pccm.eventRecorderClient, - pccm.svcNegClient, - pccm.networkClient, - pccm.nodetopologyClient, - pccm.kubeSystemUID, - pccm.clusterNamer, - pccm.l4Namer, - pccm.lpConfig, - cloud, - pccm.globalStopCh, - logger, - pc, - pccm.syncerMetrics, - ) - if err != nil { - // If startup fails, make a best-effort to roll back the finalizer. 
- pccm.rollbackFinalizerOnStartFailure(pc, logger, err) - return fmt.Errorf("failed to start NEG controller for project %s: %w", pcKey, err) - } - - pccm.controllers.Set(pcKey, &controllerSet{ - stopCh: negControllerStopCh, - }) - - logger.Info("Started controllers for provider config") - return nil -} - -func (pccm *ProviderConfigControllersManager) StopControllersForProviderConfig(pc *providerconfig.ProviderConfig) { - logger := pccm.logger.WithValues("providerConfigId", pc.Name) - - csKey := providerConfigKey(pc) - - // Remove controller from map using minimal lock scope - cs, exists := pccm.controllers.Delete(csKey) - if !exists { - logger.Info("Controllers for provider config do not exist") - return - } - - // Perform cleanup operations without holding the lock - close(cs.stopCh) - - pccm.controllers.Delete(csKey) - - // Ensure we remove the finalizer from the latest object state. - pcLatest := pccm.latestPCWithCleanupFinalizer(pc) - err := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pcLatest, pccm.providerConfigClient, logger) - if err != nil { - logger.Error(err, "failed to delete NEG cleanup finalizer for project") - } - logger.Info("Stopped controllers for provider config") -} - -// rollbackFinalizerOnStartFailure removes the NEG cleanup finalizer after a -// start failure so that ProviderConfig deletion is not blocked. -func (pccm *ProviderConfigControllersManager) rollbackFinalizerOnStartFailure(pc *providerconfig.ProviderConfig, logger klog.Logger, cause error) { - pcLatest := pccm.latestPCWithCleanupFinalizer(pc) - if err := finalizer.DeleteProviderConfigNEGCleanupFinalizer(pcLatest, pccm.providerConfigClient, logger); err != nil { - logger.Error(err, "failed to clean up NEG finalizer after start failure", "originalError", cause) - } -} - -// latestPCWithCleanupFinalizer returns the latest ProviderConfig from the API server. -// If the Get fails, it returns a local copy of the provided pc with the cleanup -// finalizer appended to ensure a subsequent delete attempt is not a no-op. 
-func (pccm *ProviderConfigControllersManager) latestPCWithCleanupFinalizer(pc *providerconfig.ProviderConfig) *providerconfig.ProviderConfig { - pcLatest, err := pccm.providerConfigClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - pcCopy := pc.DeepCopy() - pcCopy.Finalizers = append(pcCopy.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) - return pcCopy - } - return pcLatest -} diff --git a/pkg/multiproject/manager/manager_test.go b/pkg/multiproject/manager/manager_test.go deleted file mode 100644 index d343f37322..0000000000 --- a/pkg/multiproject/manager/manager_test.go +++ /dev/null @@ -1,280 +0,0 @@ -package manager - -import ( - "context" - "errors" - "slices" - "sync" - "testing" - - networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" - nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/fake" - cloudgce "k8s.io/cloud-provider-gcp/providers/gce" - providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" - "k8s.io/ingress-gce/pkg/multiproject/finalizer" - multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce" - multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" - syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" - "k8s.io/ingress-gce/pkg/neg/syncers/labels" - providerconfigclientfake "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake" - svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" - svcnegfake "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned/fake" - "k8s.io/ingress-gce/pkg/utils/namer" - klog "k8s.io/klog/v2" - ktesting "k8s.io/klog/v2/ktesting" -) - -// stubbed start function handle -type startCall struct { - pcName string -} - -// failingGCECreator is a test double that fails GCE client creation. 
-type failingGCECreator struct{} - -func (f failingGCECreator) GCEForProviderConfig(_ *providerconfig.ProviderConfig, _ klog.Logger) (*cloudgce.Cloud, error) { - return nil, errors.New("boom") -} - -func makePC(name string) *providerconfig.ProviderConfig { - return &providerconfig.ProviderConfig{ - ObjectMeta: metav1.ObjectMeta{Name: name}, - Spec: providerconfig.ProviderConfigSpec{ - ProjectID: "test-project", - ProjectNumber: 123, - NetworkConfig: providerconfig.ProviderNetworkConfig{ - Network: "net-1", - SubnetInfo: providerconfig.ProviderConfigSubnetInfo{Subnetwork: "sub-1"}, - }, - }, - } -} - -func newManagerForTest(t *testing.T, svcNeg svcnegclient.Interface) (*ProviderConfigControllersManager, *providerconfigclientfake.Clientset) { - t.Helper() - kubeClient := fake.NewSimpleClientset() - pcClient := providerconfigclientfake.NewSimpleClientset() - informers := multiprojectinformers.NewInformerSet(kubeClient, svcNeg, networkclient.Interface(nil), nodetopologyclient.Interface(nil), metav1.Duration{}) - logger, _ := ktesting.NewTestContext(t) - - rootNamer := namer.NewNamer("clusteruid", "", logger) - kubeSystemUID := types.UID("uid") - gceCreator := multiprojectgce.NewGCEFake() - lpCfg := labels.PodLabelPropagationConfig{} - globalStop := make(chan struct{}) - t.Cleanup(func() { close(globalStop) }) - - mgr := NewProviderConfigControllerManager( - kubeClient, - informers, - pcClient, - svcNeg, - networkclient.Interface(nil), - nodetopologyclient.Interface(nil), - kubeClient, // event recorder - kubeSystemUID, - rootNamer, - namer.NewL4Namer(string(kubeSystemUID), rootNamer), - lpCfg, - gceCreator, - globalStop, - logger, - syncMetrics.FakeSyncerMetrics(), - ) - return mgr, pcClient -} - -func createProviderConfig(t *testing.T, pcClient *providerconfigclientfake.Clientset, name string) *providerconfig.ProviderConfig { - t.Helper() - pc := makePC(name) - _, err := pcClient.CloudV1().ProviderConfigs().Create(context.Background(), pc, metav1.CreateOptions{}) - if err != nil { - t.Fatalf("create pc: %v", err) - } - return pc -} - -// TestStart_AddsFinalizer_AndIsIdempotent verifies that starting controllers -// for a ProviderConfig adds the NEG cleanup finalizer and that repeated starts -// are idempotent (the underlying controller is launched only once). 
-func TestStart_AddsFinalizer_AndIsIdempotent(t *testing.T) { - var calls []startCall - orig := startNEGController - startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, - _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, - _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, pc *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { - calls = append(calls, startCall{pcName: pc.Name}) - return make(chan struct{}), nil - } - t.Cleanup(func() { startNEGController = orig }) - - mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) - pc := createProviderConfig(t, pcClient, "pc-1") - - err := mgr.StartControllersForProviderConfig(pc) - if err != nil { - t.Fatalf("StartControllersForProviderConfig error: %v", err) - } - - got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("get pc: %v", err) - } - if !slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { - t.Fatalf("expected finalizer to be added, got %v", got.Finalizers) - } - - err = mgr.StartControllersForProviderConfig(pc) - if err != nil { - t.Fatalf("second start returned error: %v", err) - } - - if len(calls) != 1 || calls[0].pcName != pc.Name { - t.Fatalf("unexpected start calls: %#v", calls) - } - -} - -// TestStart_FailureCleansFinalizer verifies that when starting controllers -// fails, the added finalizer is rolled back and removed from the object. -func TestStart_FailureCleansFinalizer(t *testing.T) { - orig := startNEGController - startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, - _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, - _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, _ *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { - return nil, errors.New("boom") - } - t.Cleanup(func() { startNEGController = orig }) - - mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) - pc := createProviderConfig(t, pcClient, "pc-err") - - err := mgr.StartControllersForProviderConfig(pc) - if err == nil { - t.Fatalf("expected error from StartControllersForProviderConfig") - } - - got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("get pc: %v", err) - } - if slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { - t.Fatalf("expected finalizer removed on failure, got %v", got.Finalizers) - } - -} - -// TestStart_GCEClientError_CleansFinalizer verifies that if the GCE client -// creation fails after adding the finalizer, the finalizer is rolled back so -// deletion is not blocked unnecessarily. -func TestStart_GCEClientError_CleansFinalizer(t *testing.T) { - mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) - pc := createProviderConfig(t, pcClient, "pc-gce-err") - - // Inject failing GCE creator. 
- mgr.gceCreator = failingGCECreator{} - - err := mgr.StartControllersForProviderConfig(pc) - if err == nil { - t.Fatalf("expected error from StartControllersForProviderConfig when GCEForProviderConfig fails") - } - - got, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("get pc: %v", err) - } - if slices.Contains(got.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { - t.Fatalf("expected finalizer removed on GCE client failure, got %v", got.Finalizers) - } -} - -// TestStop_ClosesChannel_AndRemovesFinalizer verifies that stopping a -// ProviderConfig's controllers closes the stop channel and removes the -// cleanup finalizer from the ProviderConfig resource. -func TestStop_ClosesChannel_AndRemovesFinalizer(t *testing.T) { - var ch chan struct{} - orig := startNEGController - startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, - _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, - _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, _ *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { - ch = make(chan struct{}) - return ch, nil - } - t.Cleanup(func() { startNEGController = orig }) - - mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) - pc := createProviderConfig(t, pcClient, "pc-stop") - - err := mgr.StartControllersForProviderConfig(pc) - if err != nil { - t.Fatalf("start: %v", err) - } - gotBefore, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("get pc: %v", err) - } - if !slices.Contains(gotBefore.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { - t.Fatalf("expected finalizer before stop") - } - - mgr.StopControllersForProviderConfig(pc) - - // Channel should be closed (non-blocking read succeeds) - select { - case <-ch: - // ok - default: - t.Fatalf("expected stop channel to be closed") - } - - gotAfter, err := pcClient.CloudV1().ProviderConfigs().Get(context.Background(), pc.Name, metav1.GetOptions{}) - if err != nil { - t.Fatalf("get pc: %v", err) - } - if slices.Contains(gotAfter.Finalizers, finalizer.ProviderConfigNEGCleanupFinalizer) { - t.Fatalf("expected finalizer removed after stop, got %v", gotAfter.Finalizers) - } - - // Second stop should not panic - mgr.StopControllersForProviderConfig(pc) - -} - -// TestConcurrentStart_IsSingleShot verifies that concurrent calls to start -// controllers for the same ProviderConfig result in a single start. 
-func TestConcurrentStart_IsSingleShot(t *testing.T) { - var mu sync.Mutex - var calls []startCall - orig := startNEGController - startNEGController = func(_ *multiprojectinformers.InformerSet, _ kubernetes.Interface, _ kubernetes.Interface, _ svcnegclient.Interface, - _ networkclient.Interface, _ nodetopologyclient.Interface, _ types.UID, _ *namer.Namer, _ *namer.L4Namer, - _ labels.PodLabelPropagationConfig, _ *cloudgce.Cloud, _ <-chan struct{}, _ klog.Logger, pc *providerconfig.ProviderConfig, _ *syncMetrics.SyncerMetrics) (chan<- struct{}, error) { - mu.Lock() - defer mu.Unlock() - calls = append(calls, startCall{pcName: pc.Name}) - return make(chan struct{}), nil - } - t.Cleanup(func() { startNEGController = orig }) - - mgr, pcClient := newManagerForTest(t, svcnegfake.NewSimpleClientset()) - pc := createProviderConfig(t, pcClient, "pc-concurrent") - - var wg sync.WaitGroup - const n = 10 - wg.Add(n) - for range n { - go func() { - defer wg.Done() - _ = mgr.StartControllersForProviderConfig(pc) - }() - } - wg.Wait() - - if len(calls) != 1 { - t.Fatalf("expected single start call, got %d", len(calls)) - } -} diff --git a/pkg/multiproject/informerset/informerset.go b/pkg/multiproject/neg/informerset/informerset.go similarity index 99% rename from pkg/multiproject/informerset/informerset.go rename to pkg/multiproject/neg/informerset/informerset.go index 8c7eb64986..23dcb4cda8 100644 --- a/pkg/multiproject/informerset/informerset.go +++ b/pkg/multiproject/neg/informerset/informerset.go @@ -13,7 +13,7 @@ import ( informernetworking "k8s.io/client-go/informers/networking/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" - "k8s.io/ingress-gce/pkg/multiproject/filteredinformer" + "k8s.io/ingress-gce/pkg/multiproject/common/filteredinformer" svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" informersvcneg "k8s.io/ingress-gce/pkg/svcneg/client/informers/externalversions/svcneg/v1beta1" "k8s.io/ingress-gce/pkg/utils" diff --git a/pkg/multiproject/informerset/informerset_test.go b/pkg/multiproject/neg/informerset/informerset_test.go similarity index 100% rename from pkg/multiproject/informerset/informerset_test.go rename to pkg/multiproject/neg/informerset/informerset_test.go diff --git a/pkg/multiproject/neg/neg.go b/pkg/multiproject/neg/neg.go index 1507d4d8fc..a0d165e9f9 100644 --- a/pkg/multiproject/neg/neg.go +++ b/pkg/multiproject/neg/neg.go @@ -11,7 +11,7 @@ import ( "k8s.io/cloud-provider-gcp/providers/gce" providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" "k8s.io/ingress-gce/pkg/flags" - multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/neg/informerset" "k8s.io/ingress-gce/pkg/neg" "k8s.io/ingress-gce/pkg/neg/metrics" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" diff --git a/pkg/multiproject/neg/neg_test.go b/pkg/multiproject/neg/neg_test.go index deb0356e32..87fc99c3ed 100644 --- a/pkg/multiproject/neg/neg_test.go +++ b/pkg/multiproject/neg/neg_test.go @@ -12,8 +12,8 @@ import ( k8sfake "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/cache" providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" - multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce" - multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" + multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/common/gce" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/neg/informerset" "k8s.io/ingress-gce/pkg/neg" 
"k8s.io/ingress-gce/pkg/neg/metrics" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" diff --git a/pkg/multiproject/neg/starter.go b/pkg/multiproject/neg/starter.go new file mode 100644 index 0000000000..f9c8ce244d --- /dev/null +++ b/pkg/multiproject/neg/starter.go @@ -0,0 +1,106 @@ +package neg + +import ( + "fmt" + + networkclient "github.com/GoogleCloudPlatform/gke-networking-api/client/network/clientset/versioned" + nodetopologyclient "github.com/GoogleCloudPlatform/gke-networking-api/client/nodetopology/clientset/versioned" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + providerconfig "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" + "k8s.io/ingress-gce/pkg/multiproject/common/gce" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/neg/informerset" + syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" + "k8s.io/ingress-gce/pkg/neg/syncers/labels" + svcnegclient "k8s.io/ingress-gce/pkg/svcneg/client/clientset/versioned" + "k8s.io/ingress-gce/pkg/utils/namer" + "k8s.io/klog/v2" +) + +// NEGControllerStarter implements framework.ControllerStarter for NEG controllers. +// It encapsulates all NEG-specific dependencies and startup logic. +type NEGControllerStarter struct { + informers *multiprojectinformers.InformerSet + kubeClient kubernetes.Interface + svcNegClient svcnegclient.Interface + networkClient networkclient.Interface + nodetopologyClient nodetopologyclient.Interface + eventRecorderClient kubernetes.Interface + kubeSystemUID types.UID + clusterNamer *namer.Namer + l4Namer *namer.L4Namer + lpConfig labels.PodLabelPropagationConfig + gceCreator gce.GCECreator + globalStopCh <-chan struct{} + logger klog.Logger + syncerMetrics *syncMetrics.SyncerMetrics +} + +// NewNEGControllerStarter creates a new NEG controller starter with the given dependencies. +func NewNEGControllerStarter( + informers *multiprojectinformers.InformerSet, + kubeClient kubernetes.Interface, + svcNegClient svcnegclient.Interface, + networkClient networkclient.Interface, + nodetopologyClient nodetopologyclient.Interface, + eventRecorderClient kubernetes.Interface, + kubeSystemUID types.UID, + clusterNamer *namer.Namer, + l4Namer *namer.L4Namer, + lpConfig labels.PodLabelPropagationConfig, + gceCreator gce.GCECreator, + globalStopCh <-chan struct{}, + logger klog.Logger, + syncerMetrics *syncMetrics.SyncerMetrics, +) *NEGControllerStarter { + return &NEGControllerStarter{ + informers: informers, + kubeClient: kubeClient, + svcNegClient: svcNegClient, + networkClient: networkClient, + nodetopologyClient: nodetopologyClient, + eventRecorderClient: eventRecorderClient, + kubeSystemUID: kubeSystemUID, + clusterNamer: clusterNamer, + l4Namer: l4Namer, + lpConfig: lpConfig, + gceCreator: gceCreator, + globalStopCh: globalStopCh, + logger: logger, + syncerMetrics: syncerMetrics, + } +} + +// StartController implements framework.ControllerStarter. +// It creates a GCE client for the ProviderConfig and starts the NEG controller. 
+func (s *NEGControllerStarter) StartController(pc *providerconfig.ProviderConfig) (chan<- struct{}, error) { + logger := s.logger.WithValues("providerConfigId", pc.Name) + + cloud, err := s.gceCreator.GCEForProviderConfig(pc, logger) + if err != nil { + return nil, fmt.Errorf("failed to create GCE client for provider config %+v: %w", pc, err) + } + + negControllerStopCh, err := StartNEGController( + s.informers, + s.kubeClient, + s.eventRecorderClient, + s.svcNegClient, + s.networkClient, + s.nodetopologyClient, + s.kubeSystemUID, + s.clusterNamer, + s.l4Namer, + s.lpConfig, + cloud, + s.globalStopCh, + logger, + pc, + s.syncerMetrics, + ) + if err != nil { + return nil, fmt.Errorf("failed to start NEG controller: %w", err) + } + + return negControllerStopCh, nil +} diff --git a/pkg/multiproject/start/start.go b/pkg/multiproject/start/start.go index 1d911bea77..f2d9342429 100644 --- a/pkg/multiproject/start/start.go +++ b/pkg/multiproject/start/start.go @@ -17,10 +17,11 @@ import ( "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/ingress-gce/pkg/flags" _ "k8s.io/ingress-gce/pkg/klog" - pccontroller "k8s.io/ingress-gce/pkg/multiproject/controller" - "k8s.io/ingress-gce/pkg/multiproject/gce" - multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/informerset" - "k8s.io/ingress-gce/pkg/multiproject/manager" + "k8s.io/ingress-gce/pkg/multiproject/common/finalizer" + "k8s.io/ingress-gce/pkg/multiproject/common/gce" + "k8s.io/ingress-gce/pkg/multiproject/framework" + "k8s.io/ingress-gce/pkg/multiproject/neg" + multiprojectinformers "k8s.io/ingress-gce/pkg/multiproject/neg/informerset" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" "k8s.io/ingress-gce/pkg/neg/syncers/labels" providerconfigclient "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned" @@ -169,10 +170,9 @@ func Start( return } - manager := manager.NewProviderConfigControllerManager( - kubeClient, + negStarter := neg.NewNEGControllerStarter( informers, - providerConfigClient, + kubeClient, svcNegClient, networkClient, nodeTopologyClient, @@ -186,7 +186,6 @@ func Start( logger, syncerMetrics, ) - logger.V(1).Info("Initialized ProviderConfig controller manager") // Create ProviderConfig informer providerConfigInformer := providerconfiginformers.NewSharedInformerFactory(providerConfigClient, flags.F.ResyncPeriod).Cloud().V1().ProviderConfigs().Informer() @@ -202,8 +201,16 @@ func Start( } logger.Info("Provider config informer synced successfully") - pcController := pccontroller.NewProviderConfigController(manager, providerConfigInformer, stopCh, logger) + fw := framework.New( + providerConfigClient, + providerConfigInformer, + finalizer.ProviderConfigNEGCleanupFinalizer, + negStarter, + stopCh, + logger, + ) + logger.V(1).Info("Running ProviderConfig controller") - pcController.Run() + fw.Run() logger.V(1).Info("ProviderConfig controller stopped") } diff --git a/pkg/multiproject/start/start_test.go b/pkg/multiproject/start/start_test.go index c59dc44702..08fb415cdc 100644 --- a/pkg/multiproject/start/start_test.go +++ b/pkg/multiproject/start/start_test.go @@ -25,9 +25,9 @@ import ( "k8s.io/ingress-gce/pkg/annotations" providerconfigv1 "k8s.io/ingress-gce/pkg/apis/providerconfig/v1" "k8s.io/ingress-gce/pkg/flags" - "k8s.io/ingress-gce/pkg/multiproject/finalizer" - multiprojectgce "k8s.io/ingress-gce/pkg/multiproject/gce" - "k8s.io/ingress-gce/pkg/multiproject/testutil" + "k8s.io/ingress-gce/pkg/multiproject/common/finalizer" + multiprojectgce 
"k8s.io/ingress-gce/pkg/multiproject/common/gce" + "k8s.io/ingress-gce/pkg/multiproject/common/testutil" syncMetrics "k8s.io/ingress-gce/pkg/neg/metrics/metricscollector" negtypes "k8s.io/ingress-gce/pkg/neg/types" pcclientfake "k8s.io/ingress-gce/pkg/providerconfig/client/clientset/versioned/fake"