etcd-io · xrl · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/README.md b/README.md
@@ -51,6 +51,14 @@ make deploy IMG=<some-registry>/etcd-operator:tag
 > **NOTE**: If you encounter RBAC errors, you may need to grant yourself cluster-admin
 privileges or be logged in as admin.
 
+**Tune the manager (optional):**
+
+The manager exposes process-level flags, including `--max-concurrent-reconciles`
+(reconcile worker pool, default `5`) and `--etcd-cpu-request` (CPU request on the
+etcd container, default `50m`, which makes etcd Burstable instead of BestEffort).
+See [docs/operator-flags.md](docs/operator-flags.md) for details and tuning
+guidance.
+
 **Create instances of your solution**
 You can apply the samples (examples) from the config/sample:
 

diff --git a/cmd/main.go b/cmd/main.go
@@ -64,6 +64,8 @@ func main() {
 	var probeAddr string
 	var secureMetrics bool
 	var enableHTTP2 bool
+	var maxConcurrentReconciles int
+	var etcdCPURequest string
 	var tlsOpts []func(*tls.Config)
 	flag.StringVar(&imageRegistry, "image-registry", "gcr.io/etcd-development/etcd",
 		"The container registry to pull etcd images from. Defaults to gcr.io/etcd-development/etcd.")
@@ -77,6 +79,18 @@ func main() {
 		"If set, the metrics endpoint is served securely via HTTPS. Use --metrics-secure=false to use HTTP instead.")
 	flag.BoolVar(&enableHTTP2, "enable-http2", false,
 		"If set, HTTP/2 will be enabled for the metrics and webhook servers")
+	flag.IntVar(&maxConcurrentReconciles, "max-concurrent-reconciles", 5,
+		"Number of reconcile workers run in parallel. Each EtcdCluster is keyed on its own "+
+			"workqueue entry, so a single cluster is never reconciled by two workers at once; this "+
+			"only parallelizes distinct clusters. Reconciles are relatively heavy/long-running, so a "+
+			"small pool (default 5) improves multi-cluster throughput. Going higher increases "+
+			"simultaneous apiserver and managed-etcd load, so tune it for your fleet. A value <= 0 "+
+			"falls back to controller-runtime's default of 1.")
+	flag.StringVar(&etcdCPURequest, "etcd-cpu-request", controller.DefaultEtcdCPURequest,
+		"CPU request set on the etcd container. A request (not a limit) lifts the etcd pod from "+
+			"BestEffort to Burstable QoS and raises its cpu.shares floor without ever throttling "+
+			"etcd. Set to \"\" or \"0\" to apply no request (original BestEffort behavior) so the "+
+			"effect can be A/B-measured. Defaults to "+controller.DefaultEtcdCPURequest+".")
 	opts := zap.Options{
 		Development: true,
 	}
@@ -85,6 +99,11 @@ func main() {
 
 	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
 
+	// Apply the etcd CPU request knob to the controller package. This is an
+	// operator-wide tuning lever (identical for every cluster), so it is a flag
+	// rather than a CRD field. See controller.EtcdCPURequest.
+	controller.EtcdCPURequest = etcdCPURequest
+
 	// if the enable-http2 flag is false (the default), http/2 should be disabled
 	// due to its vulnerabilities. More specifically, disabling http/2 will
 	// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
@@ -151,9 +170,10 @@ func main() {
 	}
 
 	if err = (&controller.EtcdClusterReconciler{
-		Client:        mgr.GetClient(),
-		Scheme:        mgr.GetScheme(),
-		ImageRegistry: imageRegistry,
+		Client:                  mgr.GetClient(),
+		Scheme:                  mgr.GetScheme(),
+		ImageRegistry:           imageRegistry,
+		MaxConcurrentReconciles: maxConcurrentReconciles,
 	}).SetupWithManager(mgr); err != nil {
 		setupLog.Error(err, "unable to create controller", "controller", "EtcdCluster")
 		os.Exit(1)

diff --git a/docs/operator-flags.md b/docs/operator-flags.md
@@ -0,0 +1,75 @@
+# Operator Flags
+
+These flags configure the etcd-operator **manager process** itself (the
+controller), as opposed to per-cluster settings expressed on the `EtcdCluster`
+custom resource. They are passed as command-line arguments to the operator
+binary (see `cmd/main.go`).
+
+## `--max-concurrent-reconciles`
+
+- **Type:** int
+- **Default:** `5`
+
+Number of reconcile workers the controller runs in parallel.
+controller-runtime's own default is `1`.
+
+Each `EtcdCluster` is reconciled on its own workqueue key — the queue dedups by
+namespaced name — so a single cluster is **never** reconciled by two workers at
+the same time. Concurrency therefore only ever parallelizes work across
+**distinct** clusters; it does not introduce intra-cluster races.
+
+A reconcile in this operator is relatively heavy and long-running: it patches a
+StatefulSet, issues member-list and health RPCs against the managed etcd
+cluster, and may perform certificate work. With a single worker, one slow
+cluster blocks progress on every other cluster. A small pool (default `5`)
+meaningfully improves throughput when many clusters need attention at once
+(operator restart, mass upgrade, node churn).
+
+The cost of a **larger** pool is more simultaneous load on the apiserver and on
+the managed etcd clusters. Wise operators running large fleets should tune this
+value for their environment; operators with a handful of clusters can leave it
+at the default.
+
+A value `<= 0` falls back to controller-runtime's default of a single worker,
+which stays behaviorally safe.
+
+```sh
+# Widen to 10 workers for a large fleet.
+manager --max-concurrent-reconciles=10
+```
+
+## `--etcd-cpu-request`
+
+- **Type:** string (Kubernetes resource quantity)
+- **Default:** `50m`
+
+CPU **request** (never a limit) set on the etcd container of every managed
+cluster.
+
+Without any request, the etcd pod lands in the **BestEffort** QoS class. That
+gives its cgroup the kernel-floor `cpu.shares` of `2` and makes it the first
+workload the kubelet evicts under node memory pressure. Setting even a tiny CPU
+request lifts the pod to **Burstable**, raises `cpu.shares` to ~`51` (a
+scheduling floor that only matters under contention), and — because it is a
+request and not a limit — **never throttles** etcd.
+
+`50m` is deliberately tiny: it is a scheduling floor, not a reservation. It is
+expressed as a controller-level flag rather than a CRD field because it is an
+operator tuning lever that is identical for every cluster, which keeps it out of
+the `EtcdCluster` API.
+
+Set the value to an empty string (`""`) or `"0"` to apply **no** request,
+restoring the original BestEffort behavior. This makes the effect easy to
+A/B-measure and tune per fleet. A malformed quantity is treated the same as
+unset (no request) so a typo can never wedge cluster creation.
+
+```sh
+# Default: 50m request, Burstable QoS.
+manager --etcd-cpu-request=50m
+
+# Opt out: no request, BestEffort QoS (original behavior).
+manager --etcd-cpu-request=""
+
+# Larger floor for busy clusters.
+manager --etcd-cpu-request=200m
+```
diff --git a/internal/controller/etcdcluster_controller.go b/internal/controller/etcdcluster_controller.go
@@ -32,6 +32,7 @@ import (
 	"k8s.io/client-go/tools/events"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	controllerruntime "sigs.k8s.io/controller-runtime/pkg/controller"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 
 	ecv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1"
@@ -50,6 +51,22 @@ type EtcdClusterReconciler struct {
 	Scheme        *runtime.Scheme
 	Recorder      events.EventRecorder
 	ImageRegistry string
+
+	// MaxConcurrentReconciles is the number of reconcile workers the controller
+	// runs in parallel. controller-runtime's default is 1.
+	//
+	// Each EtcdCluster is reconciled on its own workqueue key (the queue dedups
+	// by namespaced name), so a single cluster is NEVER reconciled by two workers
+	// at once; concurrency only ever parallelizes DISTINCT clusters. Because a
+	// reconcile here is relatively heavy and long-running (StatefulSet patches,
+	// member-list/health RPCs against managed etcd, certificate work), a small
+	// pool meaningfully improves throughput when many clusters need attention at
+	// the same time. The cost of a larger pool is more simultaneous apiserver and
+	// managed-etcd load, so wise operators should tune this for their fleet.
+	//
+	// A value <= 0 falls back to controller-runtime's default of 1, which stays
+	// behaviorally safe.
+	MaxConcurrentReconciles int
 }
 
 // reconcileState holds all transient data for a single reconciliation loop.
@@ -599,5 +616,15 @@ func (r *EtcdClusterReconciler) SetupWithManager(mgr ctrl.Manager) error {
 		setupLog.Info("cert-manager CRDs not detected, only auto provider will be available. Restart the controller after cert-manager CRDs are installed")
 	}
 
+	// Widen the reconcile worker pool when configured. A value <= 0 leaves
+	// controller-runtime at its default of a single worker. See the doc comment
+	// on EtcdClusterReconciler.MaxConcurrentReconciles for why a small pool is a
+	// safe default for this operator.
+	if r.MaxConcurrentReconciles > 0 {
+		builder = builder.WithOptions(controllerruntime.Options{
+			MaxConcurrentReconciles: r.MaxConcurrentReconciles,
+		})
+	}
+
 	return builder.Complete(r)
 }
diff --git a/internal/controller/etcdcluster_controller_test.go b/internal/controller/etcdcluster_controller_test.go
@@ -26,9 +26,13 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/controller-runtime/pkg/config"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
 
 	ecv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1"
 )
@@ -571,3 +575,51 @@ func TestBootstrapStatefulSet(t *testing.T) {
 		assert.Equal(t, storedCM.Data, fetchedCM.Data)
 	})
 }
+
+// TestSetupWithManagerThreadsMaxConcurrentReconciles verifies that the
+// --max-concurrent-reconciles knob is actually threaded through
+// SetupWithManager into controller-runtime's builder. A real manager is built
+// from the envtest rest.Config so the builder's option plumbing is exercised
+// end-to-end; the controller is never started.
+func TestSetupWithManagerThreadsMaxConcurrentReconciles(t *testing.T) {
+	if restCfg == nil {
+		t.Skip("envtest rest config unavailable; KUBEBUILDER_ASSETS not set")
+	}
+
+	testScheme := runtime.NewScheme()
+	require.NoError(t, clientgoscheme.AddToScheme(testScheme))
+	require.NoError(t, ecv1alpha1.AddToScheme(testScheme))
+
+	tests := []struct {
+		name                    string
+		maxConcurrentReconciles int
+	}{
+		{name: "widened pool", maxConcurrentReconciles: 5},
+		{name: "explicit single worker", maxConcurrentReconciles: 1},
+		{name: "non-positive falls back to default safely", maxConcurrentReconciles: 0},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mgr, err := ctrl.NewManager(restCfg, ctrl.Options{
+				Scheme:  testScheme,
+				Metrics: metricsserver.Options{BindAddress: "0"},
+				// Subtests register the same controller name on distinct managers;
+				// skip the global metric-name uniqueness guard so they can coexist.
+				Controller: config.Controller{SkipNameValidation: ptr.To(true)},
+			})
+			require.NoError(t, err)
+
+			r := &EtcdClusterReconciler{
+				Client:                  mgr.GetClient(),
+				Scheme:                  mgr.GetScheme(),
+				MaxConcurrentReconciles: tt.maxConcurrentReconciles,
+			}
+			// SetupWithManager must accept the configured pool size and register
+			// the controller without error for every value, including the
+			// non-positive fallback path.
+			require.NoError(t, r.SetupWithManager(mgr))
+			assert.Equal(t, tt.maxConcurrentReconciles, r.MaxConcurrentReconciles)
+		})
+	}
+}
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
@@ -25,6 +25,7 @@ import (
 	"testing"
 
 	"k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/client-go/rest"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/envtest"
@@ -37,6 +38,8 @@ import (
 
 var (
 	k8sClient client.Client
+	// restCfg is the envtest rest.Config, exposed so tests can build a manager.
+	restCfg *rest.Config
 )
 
 func TestMain(m *testing.M) {
@@ -67,6 +70,7 @@ func TestMain(m *testing.M) {
 	if cfg == nil {
 		logger.Fatalf("Test environment started with nil config")
 	}
+	restCfg = cfg
 
 	ctrl.SetLogger(zap.New())
 

diff --git a/internal/controller/utils.go b/internal/controller/utils.go
@@ -37,15 +37,57 @@ import (
 const (
 	etcdDataDir = "/var/lib/etcd"
 	volumeName  = "etcd-data"
+
+	// DefaultEtcdCPURequest is the default CPU *request* applied to the etcd
+	// container. With no request the etcd pod lands in the BestEffort QoS class,
+	// which gives its cgroup the kernel-floor cpu.shares of 2 and makes it the
+	// first thing the kubelet evicts under node pressure. A request lifts the pod
+	// to Burstable, raises cpu.shares to ~51 (a scheduling floor that only bites
+	// under contention), and — because it is a request, not a limit — never
+	// throttles etcd. 50m is deliberately tiny: it is a floor, not a reservation.
+	DefaultEtcdCPURequest = "50m"
 )
 
+// EtcdCPURequest is the CPU request applied to the etcd container, settable from
+// the operator's --etcd-cpu-request flag. Defaults to DefaultEtcdCPURequest.
+//
+// An empty string or "0" disables the request entirely, restoring the original
+// BestEffort behavior so the effect can be A/B-measured and tuned per fleet.
+// This is intentionally a controller-level knob rather than a CRD field: it is
+// an operator tuning lever, identical for every cluster, so keeping it out of
+// the API avoids a CRD change.
+var EtcdCPURequest = DefaultEtcdCPURequest
+
 type etcdClusterState string
 
 const (
 	etcdClusterStateNew      etcdClusterState = "new"
 	etcdClusterStateExisting etcdClusterState = "existing"
 )
 
+// etcdContainerResources builds the ResourceRequirements for the etcd container.
+//
+// When EtcdCPURequest is a non-empty, non-zero quantity it sets a CPU *request*
+// (never a limit) so the pod is Burstable rather than BestEffort. An empty or
+// "0" value yields zero-valued requirements, preserving the original BestEffort
+// behavior. A malformed quantity is treated the same as unset so a bad flag can
+// never wedge cluster creation.
+func etcdContainerResources() corev1.ResourceRequirements {
+	if EtcdCPURequest == "" || EtcdCPURequest == "0" {
+		return corev1.ResourceRequirements{}
+	}
+	qty, err := resource.ParseQuantity(EtcdCPURequest)
+	if err != nil || qty.IsZero() {
+		log.Printf("invalid --etcd-cpu-request %q, leaving etcd container without a CPU request: %v", EtcdCPURequest, err)
+		return corev1.ResourceRequirements{}
+	}
+	return corev1.ResourceRequirements{
+		Requests: corev1.ResourceList{
+			corev1.ResourceCPU: qty,
+		},
+	}
+}
+
 func reconcileStatefulSet(ctx context.Context, logger logr.Logger, ec *ecv1alpha1.EtcdCluster, c client.Client, replicas int32, scheme *runtime.Scheme) (*appsv1.StatefulSet, error) {
 
 	// prepare/update configmap for StatefulSet
@@ -143,10 +185,11 @@ func createOrPatchStatefulSet(ctx context.Context, logger logr.Logger, ec *ecv1a
 	podSpec := corev1.PodSpec{
 		Containers: []corev1.Container{
 			{
-				Name:    "etcd",
-				Command: []string{"/usr/local/bin/etcd"},
-				Args:    createArgs(ec.Name, ec.Spec.EtcdOptions),
-				Image:   fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, ec.Spec.Version),
+				Name:      "etcd",
+				Command:   []string{"/usr/local/bin/etcd"},
+				Args:      createArgs(ec.Name, ec.Spec.EtcdOptions),
+				Image:     fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, ec.Spec.Version),
+				Resources: etcdContainerResources(),
 				Env: []corev1.EnvVar{
 					{
 						Name: "POD_NAME",