Skip to content
Closed
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 34 additions & 11 deletions cns/service/main.go
Original file line number | Diff line number | Diff line change
Expand Up
@@ -1609,23 +1609,45 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
// Start the Manager which starts the reconcile loop.
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
managerErrCh := make(chan error, 1)
go func() {
logger.Printf("Starting controller-manager.")
for {
if err := manager.Start(ctx); err != nil {
logger.Errorf("Failed to start controller-manager: %v", err)
// retry to start the request controller
// inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else {
logger.Printf("Stopped controller-manager.")
return
}
time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff
// Add timeout for controller startup
managerStartTimeout := 5 * time.Minute
startManagerCtx, startManagerCancel := context.WithTimeout(ctx, managerStartTimeout)
defer startManagerCancel()

if err := manager.Start(startManagerCtx); err != nil {
logger.Errorf("Failed to start controller-manager: %v", err)
managerErrCh <- err
return
}
logger.Printf("Stopped controller-manager.")
managerErrCh <- nil
}()
logger.Printf("Initialized controller-manager.")

// Check if manager startup failed before proceeding
select {
case managerErr := <-managerErrCh:
if managerErr != nil {
return errors.Wrap(managerErr, "controller-manager failed to start")
}
case <-time.After(100 * time.Millisecond):
// Continue if no immediate error
}

for {
// Check if manager failed during startup or runtime
select {
case managerErr := <-managerErrCh:
if managerErr != nil {
return errors.Wrap(managerErr, "controller-manager failed")
}
default:
// Continue with normal flow if no manager error
}

logger.Printf("Waiting for NodeNetworkConfig reconciler to start.")
// wait for the Reconciler to run once on a NNC that was made for this Node.
// the nncReadyCtx has a timeout of 15 minutes, after which we will consider
Expand All
@@ -1634,6 +1656,7 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
if started, err := nncReconciler.Started(nncReadyCtx); !started {
logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
nncReconcilerStartFailures.Inc()
cancel()
continue
}
logger.Printf("NodeNetworkConfig reconciler has started.")
Expand Down