Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions api/v1alpha1/etcdcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,124 @@ type EtcdClusterStatus struct {
// +listType=map
// +listMapKey=type
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`

// Recovery captures the state of an in-progress (or last attempted) automatic
// disaster recovery from quorum loss. It is managed entirely by the operator's
// quorum-loss recovery state machine and is nil when no recovery has ever been
// attempted.
// +optional
Recovery *RecoveryStatus `json:"recovery,omitempty"`
}

// RecoveryPhase names one stage of the automatic quorum-loss recovery state
// machine. Stages are visited in a strict order and each one is idempotent, which
// lets the controller pick recovery back up safely after it restarts or hits a
// transient error.
type RecoveryPhase string

// The recovery stages, listed in the order the state machine visits them.
const (
	// RecoveryPhaseDetecting: a candidate quorum-loss event has been observed but
	// is not yet confirmed as sustained; it may still turn out to be a transient
	// blip that heals on its own before the grace window elapses.
	RecoveryPhaseDetecting = RecoveryPhase("Detecting")

	// RecoveryPhaseRebuilding: sustained quorum loss has been confirmed and the
	// controller is rebuilding a single-member cluster from a surviving member
	// using --force-new-cluster.
	RecoveryPhaseRebuilding = RecoveryPhase("Rebuilding")

	// RecoveryPhaseScalingOut: the single-member cluster is healthy again and the
	// remaining members are being re-added one at a time through the normal
	// learner-add path.
	RecoveryPhaseScalingOut = RecoveryPhase("ScalingOut")

	// RecoveryPhaseCompleted: the cluster is back at its desired size and has
	// quorum.
	RecoveryPhaseCompleted = RecoveryPhase("Completed")
)

// RecoveryStatus records the progress of the quorum-loss recovery state machine.
//
// All fields are optional. SurvivorOrdinal is intentionally serialized without
// omitempty: the only ordinal chosen today is 0, and omitempty would drop that
// zero value from the JSON, so the survivor choice this field exists to make
// auditable would never actually appear in the object's status.
//
// The field-level comments below are mirrored as the descriptions in the
// generated CRD schema; regenerate the manifests whenever they are edited.
type RecoveryStatus struct {
	// Phase is the current stage of the recovery state machine.
	// +optional
	Phase RecoveryPhase `json:"phase,omitempty"`

	// SurvivorOrdinal is the StatefulSet pod ordinal whose data directory was
	// chosen as the surviving source of truth for the rebuild. It is always 0
	// today (the operator keeps ordinal-0's PVC) but is recorded explicitly so
	// the choice is auditable and future survivor-selection policies remain
	// backward compatible.
	// +optional
	SurvivorOrdinal int32 `json:"survivorOrdinal"`

	// DetectedTime is the first time a sustained-quorum-loss candidate was
	// observed. It anchors the grace window used to distinguish true quorum loss
	// from transient single-member failures.
	// +optional
	DetectedTime *metav1.Time `json:"detectedTime,omitempty"`

	// LastTransitionTime is the time the recovery phase last changed.
	// +optional
	LastTransitionTime *metav1.Time `json:"lastTransitionTime,omitempty"`

	// Message is a human-readable description of the current recovery step.
	// +optional
	Message string `json:"message,omitempty"`

	// Attempts is the number of times the operator has committed to a destructive
	// rebuild for this cluster (i.e. entered the Rebuilding phase from Detecting).
	// It is a monotonically increasing counter that survives across recoveries and
	// is never reset, giving operators a durable signal of how often this cluster
	// has needed disaster recovery — repeated recoveries usually point at an
	// underlying infrastructure problem rather than a one-off event.
	// +optional
	Attempts int32 `json:"attempts,omitempty"`

	// DataLoss records the data-loss accounting for the most recent rebuild.
	//
	// Quorum-loss recovery via --force-new-cluster is NOT a lossless operation:
	// the rebuilt cluster retains only the writes that were committed to the
	// surviving member's local data directory. Any write that a now-destroyed
	// majority had committed but had not yet replicated to the survivor is GONE.
	// This field surfaces that fact explicitly (alongside a Warning Event, the
	// DataLossPossible condition, and structured logs) so the loss is auditable
	// and never silent. It is nil until a rebuild from a survivor completes.
	// +optional
	DataLoss *DataLossInfo `json:"dataLoss,omitempty"`
}

// DataLossInfo captures what the operator knows about the data retained by — and
// therefore the data potentially lost during — a force-new-cluster rebuild.
//
// The operator cannot enumerate exactly which keys were lost (the members that
// held the un-replicated writes are gone), so this records the provable lower
// bound on retained state: the survivor's identity and its last committed
// revision. Everything the destroyed majority committed beyond SurvivorRevision
// is unrecoverable.
//
// Every field is optional and tagged omitempty, so a zero value is simply left
// out of the serialized status. The field-level comments below are mirrored as
// the descriptions in the generated CRD schema; regenerate the manifests
// whenever they are edited.
//
// NOTE(review): RaftIndex is declared uint64, while the generated CRD schema
// publishes it as an integer with format int64. Kubernetes API conventions
// discourage unsigned integer fields; consider int64 in a future API revision —
// confirm against the callers that populate this field before changing it.
type DataLossInfo struct {
// SurvivorMemberID is the hex-encoded etcd member ID of the survivor whose
// data directory was used to rebuild the cluster.
// +optional
SurvivorMemberID string `json:"survivorMemberID,omitempty"`

// SurvivorRevision is the key-value store revision present on the survivor at
// the moment the single-member cluster came back healthy. It is the highest
// revision guaranteed to be retained; any revision the lost majority committed
// above this value did not survive the rebuild.
// +optional
SurvivorRevision int64 `json:"survivorRevision,omitempty"`

// RaftIndex is the survivor's raft committed index at rebuild time, recorded
// for forensic correlation with member logs.
// +optional
RaftIndex uint64 `json:"raftIndex,omitempty"`

// RecoveredTime is when the rebuilt single-member cluster was confirmed
// healthy and this accounting was captured.
// +optional
RecoveredTime *metav1.Time `json:"recoveredTime,omitempty"`

// Message is a human-readable, operator-facing summary of the data-loss
// situation, e.g. "recovered with possible data loss; rebuilt from member
// <id> at revision <r>".
// +optional
Message string `json:"message,omitempty"`
}

// MemberStatus defines the observed state of a single etcd member.
Expand Down
54 changes: 53 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 92 additions & 1 deletion config/crd/bases/operator.etcd.io_etcdclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.20.1
controller-gen.kubebuilder.io/version: v0.21.0
name: etcdclusters.operator.etcd.io
spec:
group: operator.etcd.io
Expand Down Expand Up @@ -1332,6 +1332,97 @@ spec:
This reflects the .status.readyReplicas of the underlying StatefulSet.
format: int32
type: integer
recovery:
description: |-
Recovery captures the state of an in-progress (or last attempted) automatic
disaster recovery from quorum loss. It is managed entirely by the operator's
quorum-loss recovery state machine and is nil when no recovery has ever been
attempted.
properties:
attempts:
description: |-
Attempts is the number of times the operator has committed to a destructive
rebuild for this cluster (i.e. entered the Rebuilding phase from Detecting).
It is a monotonically increasing counter that survives across recoveries and
is never reset, giving operators a durable signal of how often this cluster
has needed disaster recovery — repeated recoveries usually point at an
underlying infrastructure problem rather than a one-off event.
format: int32
type: integer
dataLoss:
description: |-
DataLoss records the data-loss accounting for the most recent rebuild.

Quorum-loss recovery via --force-new-cluster is NOT a lossless operation:
the rebuilt cluster retains only the writes that were committed to the
surviving member's local data directory. Any write that a now-destroyed
majority had committed but had not yet replicated to the survivor is GONE.
This field surfaces that fact explicitly (alongside a Warning Event, the
DataLossPossible condition, and structured logs) so the loss is auditable
and never silent. It is nil until a rebuild from a survivor completes.
properties:
message:
description: |-
Message is a human-readable, operator-facing summary of the data-loss
situation, e.g. "recovered with possible data loss; rebuilt from member
<id> at revision <r>".
type: string
raftIndex:
description: |-
RaftIndex is the survivor's raft committed index at rebuild time, recorded
for forensic correlation with member logs.
format: int64
type: integer
recoveredTime:
description: |-
RecoveredTime is when the rebuilt single-member cluster was confirmed
healthy and this accounting was captured.
format: date-time
type: string
survivorMemberID:
description: |-
SurvivorMemberID is the hex-encoded etcd member ID of the survivor whose
data directory was used to rebuild the cluster.
type: string
survivorRevision:
description: |-
SurvivorRevision is the key-value store revision present on the survivor at
the moment the single-member cluster came back healthy. It is the highest
revision guaranteed to be retained; any revision the lost majority committed
above this value did not survive the rebuild.
format: int64
type: integer
type: object
detectedTime:
description: |-
DetectedTime is the first time a sustained-quorum-loss candidate was
observed. It anchors the grace window used to distinguish true quorum loss
from transient single-member failures.
format: date-time
type: string
lastTransitionTime:
description: LastTransitionTime is the time the recovery phase
last changed.
format: date-time
type: string
message:
description: Message is a human-readable description of the current
recovery step.
type: string
phase:
description: Phase is the current stage of the recovery state
machine.
type: string
survivorOrdinal:
description: |-
SurvivorOrdinal is the StatefulSet pod ordinal whose data directory was
chosen as the surviving source of truth for the rebuild. It is always 0
today (the operator keeps ordinal-0's PVC) but is recorded explicitly so
the choice is auditable and future survivor-selection policies remain
backward compatible.
format: int32
type: integer
type: object
type: object
type: object
served: true
Expand Down
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ rules:
- list
- patch
- update
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
- apiGroups:
- apps
resources:
Expand Down
39 changes: 37 additions & 2 deletions internal/controller/etcdcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ type EtcdClusterReconciler struct {
Scheme *runtime.Scheme
Recorder events.EventRecorder
ImageRegistry string

// clusterHealthFn probes the health of the given endpoints. It defaults to
// etcdutils.ClusterHealth and exists as a seam so the recovery state machine's
// survivor-health gate can be unit-tested without a live etcd. Resolved lazily
// via clusterHealth; nil means "use the real implementation".
clusterHealthFn func(eps []string) ([]etcdutils.EpHealth, error)
}

// clusterHealth reports the health of the given endpoints. With no override
// installed in clusterHealthFn (the production configuration) it calls
// etcdutils.ClusterHealth directly; otherwise it delegates to the injected
// function, which is how tests stand in for a live etcd.
func (r *EtcdClusterReconciler) clusterHealth(eps []string) ([]etcdutils.EpHealth, error) {
	if r.clusterHealthFn == nil {
		// No seam configured: use the real implementation.
		return etcdutils.ClusterHealth(eps)
	}
	return r.clusterHealthFn(eps)
}

// reconcileState holds all transient data for a single reconciliation loop.
Expand All @@ -68,6 +83,9 @@ type reconcileState struct {
// +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
// Quorum-loss recovery reads the survivor pod (cached client => list+watch) to
// confirm it exists before arming the irreversible --force-new-cluster rebuild.
// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;patch;get;list;update
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;patch;update;delete
// +kubebuilder:rbac:groups="cert-manager.io",resources=certificates,verbs=get;list;watch;create;patch;update;delete
Expand Down Expand Up @@ -105,8 +123,25 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return res, err
}

if err = r.performHealthChecks(ctx, state); err != nil {
return ctrl.Result{}, err
healthErr := r.performHealthChecks(ctx, state)

// Quorum-loss recovery gate. A failed health check on a multi-member cluster
// can mean the cluster has permanently lost quorum (a majority of members are
// gone) and cannot self-heal. maybeRecoverQuorum inspects the observed member
// health, and — only on sustained, true quorum loss — drives an idempotent
// disaster-recovery state machine (rebuild-from-survivor + re-add members).
// While it owns the reconcile (handled=true) we requeue and skip normal
// scaling so the two paths never fight. See quorum_recovery.go.
if handled, requeueAfter, recErr := r.maybeRecoverQuorum(ctx, state, state.memberHealth, healthErr); handled || recErr != nil {
return ctrl.Result{RequeueAfter: requeueAfter}, recErr
}

// During recovery scale-out the recovery state machine delegates membership
// re-adds to reconcileClusterState below; a transient per-member health error
// (e.g. a freshly added learner not yet caught up) must NOT short-circuit that
// path, or recovery would stall. Outside recovery, a health error is fatal.
if healthErr != nil && !recoveryActive(state.cluster) {
return ctrl.Result{}, healthErr
}

return r.reconcileClusterState(ctx, state)
Expand Down
Loading