diff --git a/pkg/cluster/condition.go b/pkg/cluster/condition.go index b9c377f02de..56e47dfe73c 100644 --- a/pkg/cluster/condition.go +++ b/pkg/cluster/condition.go @@ -6,6 +6,8 @@ package cluster import ( "context" "errors" + "slices" + "strings" "time" corev1 "k8s.io/api/core/v1" @@ -22,6 +24,8 @@ const workerMachineRoleLabel = "machine.openshift.io/cluster-api-machine-role=wo const workerNodeRoleLabel = "node-role.kubernetes.io/worker" const phaseRunning = "Running" +var clusterOperatorsToRequireSettled = []string{"kube-controller-manager", "kube-apiserver", "kube-scheduler", "console", "authentication"} + // condition functions should return an error only if it's not able to be retried // if a condition function encounters a error when retrying it should return false, nil. @@ -33,39 +37,6 @@ func (m *manager) apiServersReady(ctx context.Context) (bool, error) { return clusteroperators.IsOperatorAvailable(apiserver), nil } -// apiServersReadyAfterCertificateConfig provides a more robust check for apiserver readiness -// after certificate configuration. It waits for the operator to be available and not progressing, -// and also ensures that any new revisions triggered by certificate changes have completed. 
-func (m *manager) apiServersReadyAfterCertificateConfig(ctx context.Context) (bool, error) { - apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{}) - if err != nil { - return false, nil - } - - // First check if the operator is available and not progressing - if !clusteroperators.IsOperatorAvailable(apiserver) { - m.log.Infof("kube-apiserver operator not yet available: %s", clusteroperators.OperatorStatusText(apiserver)) - return false, nil - } - - // Additional check: ensure that all conditions are stable - // This helps catch cases where the operator reports as available but is still - // processing certificate-related revisions - for _, condition := range apiserver.Status.Conditions { - if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue { - m.log.Infof("kube-apiserver operator still progressing: %s", clusteroperators.OperatorStatusText(apiserver)) - return false, nil - } - if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue { - m.log.Infof("kube-apiserver operator degraded: %s", clusteroperators.OperatorStatusText(apiserver)) - return false, nil - } - } - - m.log.Infof("kube-apiserver operator is ready: %s", clusteroperators.OperatorStatusText(apiserver)) - return true, nil -} - func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) { machines, err := m.maocli.MachineV1beta1().Machines("openshift-machine-api").List(ctx, metav1.ListOptions{ LabelSelector: workerMachineRoleLabel, @@ -194,3 +165,31 @@ func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, er timeSinceLastSync := time.Since(timestamp) return timeSinceLastSync.Minutes() < 5, nil } + +// Check if all ClusterOperators have settled (i.e. are available and not +// progressing). 
+func (m *manager) clusterOperatorsHaveSettled(ctx context.Context) (bool, error) { + coList := &configv1.ClusterOperatorList{} + + err := m.ch.List(ctx, coList) + if err != nil { + // Be resilient to failures as kube-apiserver might drop connections while it's reconciling + m.log.Errorf("failure listing cluster operators, retrying: %s", err.Error()) + return false, nil + } + + allSettled := true + + // Only check the COs we care about to prevent added ones in new OpenShift + // versions perhaps tripping us up later + for _, co := range coList.Items { + if slices.Contains(clusterOperatorsToRequireSettled, strings.ToLower(co.Name)) { + if !clusteroperators.IsOperatorAvailable(&co) { + allSettled = false + m.log.Warnf("ClusterOperator not yet settled: %s", clusteroperators.OperatorStatusText(&co)) + } + } + } + + return allSettled, nil +} diff --git a/pkg/cluster/condition_test.go b/pkg/cluster/condition_test.go index d2c8d3f25a0..0eb463a97c8 100644 --- a/pkg/cluster/condition_test.go +++ b/pkg/cluster/condition_test.go @@ -412,81 +412,3 @@ func TestAroCredentialsRequestReconciled(t *testing.T) { }) } } - -func TestApiServersReadyAfterCertificateConfig(t *testing.T) { - ctx := context.Background() - - for _, tt := range []struct { - name string - availableCondition configv1.ConditionStatus - progressingCondition configv1.ConditionStatus - degradedCondition configv1.ConditionStatus - want bool - }{ - { - name: "Available, not progressing, not degraded - ready", - availableCondition: configv1.ConditionTrue, - progressingCondition: configv1.ConditionFalse, - degradedCondition: configv1.ConditionFalse, - want: true, - }, - { - name: "Available but still progressing - not ready", - availableCondition: configv1.ConditionTrue, - progressingCondition: configv1.ConditionTrue, - degradedCondition: configv1.ConditionFalse, - want: false, - }, - { - name: "Available but degraded - not ready", - availableCondition: configv1.ConditionTrue, - progressingCondition: 
configv1.ConditionFalse, - degradedCondition: configv1.ConditionTrue, - want: false, - }, - { - name: "Not available - not ready", - availableCondition: configv1.ConditionFalse, - progressingCondition: configv1.ConditionFalse, - degradedCondition: configv1.ConditionFalse, - want: false, - }, - } { - t.Run(tt.name, func(t *testing.T) { - configcli := configfake.NewSimpleClientset(&configv1.ClusterOperator{ - ObjectMeta: metav1.ObjectMeta{ - Name: "kube-apiserver", - }, - Status: configv1.ClusterOperatorStatus{ - Conditions: []configv1.ClusterOperatorStatusCondition{ - { - Type: configv1.OperatorAvailable, - Status: tt.availableCondition, - }, - { - Type: configv1.OperatorProgressing, - Status: tt.progressingCondition, - }, - { - Type: configv1.OperatorDegraded, - Status: tt.degradedCondition, - }, - }, - }, - }) - - m := &manager{ - log: logrus.NewEntry(logrus.StandardLogger()), - configcli: configcli, - } - - result, err := m.apiServersReadyAfterCertificateConfig(ctx) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if result != tt.want { - t.Errorf("Result was %v, wanted %v", result, tt.want) - } - }) - } -} diff --git a/pkg/cluster/install.go b/pkg/cluster/install.go index a20dc6dcda9..c7d171e8fee 100644 --- a/pkg/cluster/install.go +++ b/pkg/cluster/install.go @@ -495,10 +495,6 @@ func (m *manager) Install(ctx context.Context) error { steps.Action(m.configureAPIServerCertificate), - // Use a more robust check after certificate configuration to handle race conditions - // where the apiserver is still processing certificate-related revisions - steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true), + steps.Condition(m.apiServersReady, 30*time.Minute, true), steps.Condition(m.minimumWorkerNodesReady, 30*time.Minute, true), - // Additional check after worker nodes are ready to ensure certificate revisions have fully propagated - steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true), 
steps.Condition(m.operatorConsoleExists, 30*time.Minute, true), steps.Action(m.updateConsoleBranding), steps.Condition(m.operatorConsoleReady, 20*time.Minute, true), @@ -513,8 +511,8 @@ func (m *manager) Install(ctx context.Context) error { steps.Action(m.configureDefaultStorageClass), steps.Action(m.removeAzureFileCSIStorageClass), steps.Action(m.disableOperatorReconciliation), - // Final check before finishing installation to ensure apiserver is fully stable - steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true), + // Ensure that the cluster operators have settled + steps.Condition(m.clusterOperatorsHaveSettled, 30*time.Minute, true), steps.Action(m.finishInstallation), }, }