65 changes: 32 additions & 33 deletions pkg/cluster/condition.go
@@ -6,6 +6,8 @@ package cluster
import (
"context"
"errors"
"slices"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
@@ -22,6 +24,8 @@ const workerMachineRoleLabel = "machine.openshift.io/cluster-api-machine-role=wo
const workerNodeRoleLabel = "node-role.kubernetes.io/worker"
const phaseRunning = "Running"

var clusterOperatorsToRequireSettled = []string{"kube-controller-manager", "kube-apiserver", "kube-scheduler", "console", "authentication"}

// condition functions should return an error only if the failure is not retryable;
// if a condition function encounters an error that can be retried, it should return false, nil.

@@ -33,39 +37,6 @@ func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
return clusteroperators.IsOperatorAvailable(apiserver), nil
}

// apiServersReadyAfterCertificateConfig provides a more robust check for apiserver readiness
// after certificate configuration. It waits for the operator to be available and not progressing,
// and also ensures that any new revisions triggered by certificate changes have completed.
func (m *manager) apiServersReadyAfterCertificateConfig(ctx context.Context) (bool, error) {
apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
if err != nil {
return false, nil
}

// First check if the operator is available and not progressing
if !clusteroperators.IsOperatorAvailable(apiserver) {
m.log.Infof("kube-apiserver operator not yet available: %s", clusteroperators.OperatorStatusText(apiserver))
return false, nil
}

// Additional check: ensure that all conditions are stable
// This helps catch cases where the operator reports as available but is still
// processing certificate-related revisions
for _, condition := range apiserver.Status.Conditions {
if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
m.log.Infof("kube-apiserver operator still progressing: %s", clusteroperators.OperatorStatusText(apiserver))
return false, nil
}
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
m.log.Infof("kube-apiserver operator degraded: %s", clusteroperators.OperatorStatusText(apiserver))
return false, nil
}
}

m.log.Infof("kube-apiserver operator is ready: %s", clusteroperators.OperatorStatusText(apiserver))
return true, nil
}

func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
machines, err := m.maocli.MachineV1beta1().Machines("openshift-machine-api").List(ctx, metav1.ListOptions{
LabelSelector: workerMachineRoleLabel,
@@ -194,3 +165,31 @@ func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, er
timeSinceLastSync := time.Since(timestamp)
return timeSinceLastSync.Minutes() < 5, nil
}

// clusterOperatorsHaveSettled checks whether the ClusterOperators we require to
// be settled (see clusterOperatorsToRequireSettled) are available and not
// progressing.
func (m *manager) clusterOperatorsHaveSettled(ctx context.Context) (bool, error) {
coList := &configv1.ClusterOperatorList{}

err := m.ch.List(ctx, coList)
if err != nil {
// Be resilient to failures as kube-apiserver might drop connections while it's reconciling
m.log.Errorf("failure listing cluster operators, retrying: %s", err.Error())
return false, nil
}

allSettled := true

// Only check the ClusterOperators we care about, so that operators added in
// newer OpenShift versions can't trip us up later
for _, co := range coList.Items {
if slices.Contains(clusterOperatorsToRequireSettled, strings.ToLower(co.Name)) {
if !clusteroperators.IsOperatorAvailable(&co) {
allSettled = false
m.log.Warnf("ClusterOperator not yet settled: %s", clusteroperators.OperatorStatusText(&co))
}
}
}

return allSettled, nil
}
78 changes: 0 additions & 78 deletions pkg/cluster/condition_test.go
@@ -412,81 +412,3 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
})
}
}

func TestApiServersReadyAfterCertificateConfig(t *testing.T) {
Collaborator:
Can we include a test for the new function clusterOperatorsHaveSettled or just modify this one?
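
A table-driven test along the lines of the sketch below could cover clusterOperatorsHaveSettled. It assumes m.ch can be backed by a controller-runtime fake client registered with the config.openshift.io scheme; if ch is a clienthelper wrapper in this repo, the fake would need to be wrapped with the appropriate constructor, so treat the setup as illustrative rather than exact.

```go
// Assumed imports beyond those already in this file:
//   "k8s.io/apimachinery/pkg/runtime"
//   "sigs.k8s.io/controller-runtime/pkg/client"
//   "sigs.k8s.io/controller-runtime/pkg/client/fake"
func TestClusterOperatorsHaveSettled(t *testing.T) {
	ctx := context.Background()

	scheme := runtime.NewScheme()
	_ = configv1.AddToScheme(scheme)

	// Helper to build a ClusterOperator with the given Available status and
	// Progressing=False.
	newCO := func(name string, available configv1.ConditionStatus) *configv1.ClusterOperator {
		return &configv1.ClusterOperator{
			ObjectMeta: metav1.ObjectMeta{Name: name},
			Status: configv1.ClusterOperatorStatus{
				Conditions: []configv1.ClusterOperatorStatusCondition{
					{Type: configv1.OperatorAvailable, Status: available},
					{Type: configv1.OperatorProgressing, Status: configv1.ConditionFalse},
				},
			},
		}
	}

	for _, tt := range []struct {
		name string
		cos  []client.Object
		want bool
	}{
		{
			name: "required operators available - settled",
			cos:  []client.Object{newCO("console", configv1.ConditionTrue), newCO("authentication", configv1.ConditionTrue)},
			want: true,
		},
		{
			name: "required operator unavailable - not settled",
			cos:  []client.Object{newCO("console", configv1.ConditionTrue), newCO("kube-apiserver", configv1.ConditionFalse)},
			want: false,
		},
		{
			name: "operator outside the required list unavailable - still settled",
			cos:  []client.Object{newCO("console", configv1.ConditionTrue), newCO("some-new-operator", configv1.ConditionFalse)},
			want: true,
		},
	} {
		t.Run(tt.name, func(t *testing.T) {
			fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(tt.cos...).Build()

			m := &manager{
				log: logrus.NewEntry(logrus.StandardLogger()),
				// Assumption: ch accepts a controller-runtime client directly;
				// wrap fakeClient with the repo's clienthelper constructor if not.
				ch: fakeClient,
			}

			settled, err := m.clusterOperatorsHaveSettled(ctx)
			if err != nil {
				t.Errorf("unexpected error: %v", err)
			}
			if settled != tt.want {
				t.Errorf("got settled=%v, wanted %v", settled, tt.want)
			}
		})
	}
}
```

The third case exercises the behaviour the slices.Contains check exists to guarantee: operators outside clusterOperatorsToRequireSettled are ignored even when they are unavailable.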

ctx := context.Background()

for _, tt := range []struct {
name string
availableCondition configv1.ConditionStatus
progressingCondition configv1.ConditionStatus
degradedCondition configv1.ConditionStatus
want bool
}{
{
name: "Available, not progressing, not degraded - ready",
availableCondition: configv1.ConditionTrue,
progressingCondition: configv1.ConditionFalse,
degradedCondition: configv1.ConditionFalse,
want: true,
},
{
name: "Available but still progressing - not ready",
availableCondition: configv1.ConditionTrue,
progressingCondition: configv1.ConditionTrue,
degradedCondition: configv1.ConditionFalse,
want: false,
},
{
name: "Available but degraded - not ready",
availableCondition: configv1.ConditionTrue,
progressingCondition: configv1.ConditionFalse,
degradedCondition: configv1.ConditionTrue,
want: false,
},
{
name: "Not available - not ready",
availableCondition: configv1.ConditionFalse,
progressingCondition: configv1.ConditionFalse,
degradedCondition: configv1.ConditionFalse,
want: false,
},
} {
t.Run(tt.name, func(t *testing.T) {
configcli := configfake.NewSimpleClientset(&configv1.ClusterOperator{
ObjectMeta: metav1.ObjectMeta{
Name: "kube-apiserver",
},
Status: configv1.ClusterOperatorStatus{
Conditions: []configv1.ClusterOperatorStatusCondition{
{
Type: configv1.OperatorAvailable,
Status: tt.availableCondition,
},
{
Type: configv1.OperatorProgressing,
Status: tt.progressingCondition,
},
{
Type: configv1.OperatorDegraded,
Status: tt.degradedCondition,
},
},
},
})

m := &manager{
log: logrus.NewEntry(logrus.StandardLogger()),
configcli: configcli,
}

result, err := m.apiServersReadyAfterCertificateConfig(ctx)
if err != nil {
t.Errorf("Unexpected error: %v", err)
}
if result != tt.want {
t.Errorf("Result was %v, wanted %v", result, tt.want)
}
})
}
}
8 changes: 3 additions & 5 deletions pkg/cluster/install.go
@@ -495,10 +495,8 @@ func (m *manager) Install(ctx context.Context) error {
steps.Action(m.configureAPIServerCertificate),
// Wait for the apiserver to report available again after certificate configuration
steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true),
steps.Condition(m.apiServersReady, 30*time.Minute, true),
steps.Condition(m.minimumWorkerNodesReady, 30*time.Minute, true),
// Additional check after worker nodes are ready to ensure certificate revisions have fully propagated
steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true),
steps.Condition(m.operatorConsoleExists, 30*time.Minute, true),
steps.Action(m.updateConsoleBranding),
steps.Condition(m.operatorConsoleReady, 20*time.Minute, true),
@@ -513,8 +511,8 @@
steps.Action(m.configureDefaultStorageClass),
steps.Action(m.removeAzureFileCSIStorageClass),
steps.Action(m.disableOperatorReconciliation),
// Final check before finishing installation to ensure apiserver is fully stable
steps.Condition(m.apiServersReadyAfterCertificateConfig, 30*time.Minute, true),
// Ensure that the cluster operators have settled
steps.Condition(m.clusterOperatorsHaveSettled, 30*time.Minute, true),
steps.Action(m.finishInstallation),
},
}