etcd-io · xrl · Jun 18, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/api/v1alpha1/etcdcluster_types.go b/api/v1alpha1/etcdcluster_types.go
@@ -185,6 +185,124 @@ type EtcdClusterStatus struct {
 	// +listType=map
 	// +listMapKey=type
 	Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
+
+	// Recovery captures the state of an in-progress (or last attempted) automatic
+	// disaster recovery from quorum loss. It is managed entirely by the operator's
+	// quorum-loss recovery state machine and is nil when no recovery has ever been
+	// attempted.
+	// +optional
+	Recovery *RecoveryStatus `json:"recovery,omitempty"`
+}
+
+// RecoveryPhase names one stage of the automatic quorum-loss recovery state
+// machine (Detecting -> Rebuilding -> ScalingOut -> Completed). The phase is kept
+// in status so the controller can resume safely after a restart or transient error.
+type RecoveryPhase string
+
+const (
+	// RecoveryPhaseDetecting means the controller has observed a candidate
+	// quorum-loss event but has not yet confirmed it is sustained (it may still
+	// be a transient blip that self-heals before the grace window elapses).
+	RecoveryPhaseDetecting RecoveryPhase = "Detecting"
+	// RecoveryPhaseRebuilding means sustained quorum loss was confirmed and the
+	// controller is rebuilding a single-member cluster from a surviving member
+	// using --force-new-cluster.
+	RecoveryPhaseRebuilding RecoveryPhase = "Rebuilding"
+	// RecoveryPhaseScalingOut means the single-member cluster is healthy again
+	// and the controller is re-adding the remaining members one at a time via
+	// the normal learner-add path.
+	RecoveryPhaseScalingOut RecoveryPhase = "ScalingOut"
+	// RecoveryPhaseCompleted means the cluster was restored to its desired size
+	// and quorum.
+	RecoveryPhaseCompleted RecoveryPhase = "Completed"
+)
+
+// RecoveryStatus records the progress of the quorum-loss recovery state machine.
+type RecoveryStatus struct {
+	// Phase is the current stage of the recovery state machine.
+	// +optional
+	Phase RecoveryPhase `json:"phase,omitempty"`
+
+	// SurvivorOrdinal is the StatefulSet pod ordinal whose data directory was
+	// chosen as the surviving source of truth for the rebuild. It is always 0
+	// today (the operator keeps ordinal-0's PVC) but is recorded explicitly so
+	// the choice is auditable and future survivor-selection policies remain
+	// backward compatible.
+	// +optional
+	SurvivorOrdinal int32 `json:"survivorOrdinal"` // no omitempty: ordinal 0 is the zero value and must still be serialized
+
+	// DetectedTime is the first time a sustained-quorum-loss candidate was
+	// observed. It anchors the grace window used to distinguish true quorum loss
+	// from transient single-member failures.
+	// +optional
+	DetectedTime *metav1.Time `json:"detectedTime,omitempty"`
+
+	// LastTransitionTime is the time the recovery phase last changed.
+	// +optional
+	LastTransitionTime *metav1.Time `json:"lastTransitionTime,omitempty"`
+
+	// Message is a human-readable description of the current recovery step.
+	// +optional
+	Message string `json:"message,omitempty"`
+
+	// Attempts is the number of times the operator has committed to a destructive
+	// rebuild for this cluster (i.e. entered the Rebuilding phase from Detecting).
+	// It is a monotonically increasing counter that survives across recoveries and
+	// is never reset, giving operators a durable signal of how often this cluster
+	// has needed disaster recovery — repeated recoveries usually point at an
+	// underlying infrastructure problem rather than a one-off event.
+	// +optional
+	Attempts int32 `json:"attempts,omitempty"`
+
+	// DataLoss records the data-loss accounting for the most recent rebuild.
+	//
+	// Quorum-loss recovery via --force-new-cluster is NOT a lossless operation:
+	// the rebuilt cluster retains only the writes that were committed to the
+	// surviving member's local data directory. Any write that a now-destroyed
+	// majority had committed but had not yet replicated to the survivor is GONE.
+	// This field surfaces that fact explicitly (alongside a Warning Event, the
+	// DataLossPossible condition, and structured logs) so the loss is auditable
+	// and never silent. It is nil until a rebuild from a survivor completes.
+	// +optional
+	DataLoss *DataLossInfo `json:"dataLoss,omitempty"`
+}
+
+// DataLossInfo captures what the operator knows about the data retained by — and
+// therefore the data potentially lost during — a force-new-cluster rebuild.
+//
+// The operator cannot enumerate exactly which keys were lost (the members that
+// held the un-replicated writes are gone), so this records the provable lower
+// bound on retained state: the survivor's identity and its last committed
+// revision. Everything the destroyed majority committed beyond SurvivorRevision
+// is unrecoverable.
+type DataLossInfo struct {
+	// SurvivorMemberID is the hex-encoded etcd member ID of the survivor whose
+	// data directory was used to rebuild the cluster.
+	// +optional
+	SurvivorMemberID string `json:"survivorMemberID,omitempty"`
+
+	// SurvivorRevision is the key-value store revision present on the survivor at
+	// the moment the single-member cluster came back healthy. It is the highest
+	// revision guaranteed to be retained; any revision the lost majority committed
+	// above this value did not survive the rebuild.
+	// +optional
+	SurvivorRevision int64 `json:"survivorRevision,omitempty"`
+
+	// RaftIndex is the survivor's raft committed index at rebuild time, recorded
+	// for forensic correlation with member logs.
+	// +optional
+	RaftIndex uint64 `json:"raftIndex,omitempty"` // NOTE(review): the generated CRD schema declares this as format int64; confirm values above math.MaxInt64 cannot occur
+
+	// RecoveredTime is when the rebuilt single-member cluster was confirmed
+	// healthy and this accounting was captured.
+	// +optional
+	RecoveredTime *metav1.Time `json:"recoveredTime,omitempty"`
+
+	// Message is a human-readable, operator-facing summary of the data-loss
+	// situation, e.g. "recovered with possible data loss; rebuilt from member
+	// <id> at revision <r>".
+	// +optional
+	Message string `json:"message,omitempty"`
 }
 
 // MemberStatus defines the observed state of a single etcd member.

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/config/crd/bases/operator.etcd.io_etcdclusters.yaml b/config/crd/bases/operator.etcd.io_etcdclusters.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.20.1
+    controller-gen.kubebuilder.io/version: v0.21.0
   name: etcdclusters.operator.etcd.io
 spec:
   group: operator.etcd.io
@@ -1332,6 +1332,97 @@ spec:
                   This reflects the .status.readyReplicas of the underlying StatefulSet.
                 format: int32
                 type: integer
+              recovery:
+                description: |-
+                  Recovery captures the state of an in-progress (or last attempted) automatic
+                  disaster recovery from quorum loss. It is managed entirely by the operator's
+                  quorum-loss recovery state machine and is nil when no recovery has ever been
+                  attempted.
+                properties:
+                  attempts:
+                    description: |-
+                      Attempts is the number of times the operator has committed to a destructive
+                      rebuild for this cluster (i.e. entered the Rebuilding phase from Detecting).
+                      It is a monotonically increasing counter that survives across recoveries and
+                      is never reset, giving operators a durable signal of how often this cluster
+                      has needed disaster recovery — repeated recoveries usually point at an
+                      underlying infrastructure problem rather than a one-off event.
+                    format: int32
+                    type: integer
+                  dataLoss:
+                    description: |-
+                      DataLoss records the data-loss accounting for the most recent rebuild.
+
+                      Quorum-loss recovery via --force-new-cluster is NOT a lossless operation:
+                      the rebuilt cluster retains only the writes that were committed to the
+                      surviving member's local data directory. Any write that a now-destroyed
+                      majority had committed but had not yet replicated to the survivor is GONE.
+                      This field surfaces that fact explicitly (alongside a Warning Event, the
+                      DataLossPossible condition, and structured logs) so the loss is auditable
+                      and never silent. It is nil until a rebuild from a survivor completes.
+                    properties:
+                      message:
+                        description: |-
+                          Message is a human-readable, operator-facing summary of the data-loss
+                          situation, e.g. "recovered with possible data loss; rebuilt from member
+                          <id> at revision <r>".
+                        type: string
+                      raftIndex:
+                        description: |-
+                          RaftIndex is the survivor's raft committed index at rebuild time, recorded
+                          for forensic correlation with member logs.
+                        format: int64
+                        type: integer
+                      recoveredTime:
+                        description: |-
+                          RecoveredTime is when the rebuilt single-member cluster was confirmed
+                          healthy and this accounting was captured.
+                        format: date-time
+                        type: string
+                      survivorMemberID:
+                        description: |-
+                          SurvivorMemberID is the hex-encoded etcd member ID of the survivor whose
+                          data directory was used to rebuild the cluster.
+                        type: string
+                      survivorRevision:
+                        description: |-
+                          SurvivorRevision is the key-value store revision present on the survivor at
+                          the moment the single-member cluster came back healthy. It is the highest
+                          revision guaranteed to be retained; any revision the lost majority committed
+                          above this value did not survive the rebuild.
+                        format: int64
+                        type: integer
+                    type: object
+                  detectedTime:
+                    description: |-
+                      DetectedTime is the first time a sustained-quorum-loss candidate was
+                      observed. It anchors the grace window used to distinguish true quorum loss
+                      from transient single-member failures.
+                    format: date-time
+                    type: string
+                  lastTransitionTime:
+                    description: LastTransitionTime is the time the recovery phase
+                      last changed.
+                    format: date-time
+                    type: string
+                  message:
+                    description: Message is a human-readable description of the current
+                      recovery step.
+                    type: string
+                  phase:
+                    description: Phase is the current stage of the recovery state
+                      machine.
+                    type: string
+                  survivorOrdinal:
+                    description: |-
+                      SurvivorOrdinal is the StatefulSet pod ordinal whose data directory was
+                      chosen as the surviving source of truth for the rebuild. It is always 0
+                      today (the operator keeps ordinal-0's PVC) but is recorded explicitly so
+                      the choice is auditable and future survivor-selection policies remain
+                      backward compatible.
+                    format: int32
+                    type: integer
+                type: object
             type: object
         type: object
     served: true

diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml
@@ -28,6 +28,14 @@ rules:
   - list
   - patch
   - update
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  verbs:
+  - get
+  - list
+  - watch
 - apiGroups:
   - apps
   resources:

diff --git a/internal/controller/etcdcluster_controller.go b/internal/controller/etcdcluster_controller.go
@@ -50,6 +50,21 @@ type EtcdClusterReconciler struct {
 	Scheme        *runtime.Scheme
 	Recorder      events.EventRecorder
 	ImageRegistry string
+
+	// clusterHealthFn probes the health of the given endpoints. It defaults to
+	// etcdutils.ClusterHealth and exists as a seam so the recovery state machine's
+	// survivor-health gate can be unit-tested without a live etcd. Resolved lazily
+	// via clusterHealth; nil means "use the real implementation".
+	clusterHealthFn func(eps []string) ([]etcdutils.EpHealth, error)
+}
+
+// clusterHealth reports the health of eps. It calls etcdutils.ClusterHealth
+// unless a clusterHealthFn override has been set, in which case that is used.
+func (r *EtcdClusterReconciler) clusterHealth(eps []string) ([]etcdutils.EpHealth, error) {
+	if r.clusterHealthFn == nil {
+		return etcdutils.ClusterHealth(eps)
+	}
+	return r.clusterHealthFn(eps)
 }
 
 // reconcileState holds all transient data for a single reconciliation loop.
@@ -68,6 +83,9 @@ type reconcileState struct {
 // +kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
+// Quorum-loss recovery gets the survivor pod to confirm it exists before arming the
+// irreversible --force-new-cluster rebuild; the cached client also needs list+watch.
+// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch
 // +kubebuilder:rbac:groups="",resources=events,verbs=create;patch;get;list;update
 // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;patch;update;delete
 // +kubebuilder:rbac:groups="cert-manager.io",resources=certificates,verbs=get;list;watch;create;patch;update;delete
@@ -105,8 +123,25 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		return res, err
 	}
 
-	if err = r.performHealthChecks(ctx, state); err != nil {
-		return ctrl.Result{}, err
+	healthErr := r.performHealthChecks(ctx, state)
+
+	// Quorum-loss recovery gate. A failed health check on a multi-member cluster
+	// can mean the cluster has permanently lost quorum (a majority of members are
+	// gone) and cannot self-heal. maybeRecoverQuorum inspects the observed member
+	// health, and — only on sustained, true quorum loss — drives an idempotent
+	// disaster-recovery state machine (rebuild-from-survivor + re-add members).
+	// While it owns the reconcile (handled=true) we requeue and skip normal
+	// scaling so the two paths never fight. See quorum_recovery.go.
+	if handled, requeueAfter, recErr := r.maybeRecoverQuorum(ctx, state, state.memberHealth, healthErr); handled || recErr != nil {
+		return ctrl.Result{RequeueAfter: requeueAfter}, recErr
+	}
+
+	// During recovery scale-out the recovery state machine delegates membership
+	// re-adds to reconcileClusterState below; a transient per-member health error
+	// (e.g. a freshly added learner not yet caught up) must NOT short-circuit that
+	// path, or recovery would stall. Outside recovery, a health error is fatal.
+	if healthErr != nil && !recoveryActive(state.cluster) {
+		return ctrl.Result{}, healthErr
 	}
 
 	return r.reconcileClusterState(ctx, state)