From b230879ce0ace94cf2924914b59d844e994e1415 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Thu, 18 Jun 2026 19:24:26 -0400 Subject: [PATCH 1/6] feat: implement and register EtcdCluster admission webhooks The manager wires a webhook server but never registered any webhook, so malformed EtcdCluster specs were only (partially) caught at reconcile time with no synchronous, actionable feedback at apply time (see #380). Add a validating + defaulting webhook for EtcdCluster and register it via SetupWebhookWithManager in cmd/main.go (skippable with ENABLE_WEBHOOKS=false). Validations (each rejection carries a crisp, actionable message): - size: must be >= 1 and odd (etcd quorum); message names the next valid sizes. - version: must be valid semver; on update, must be a supported single-minor, non-downgrade transition (reuses the controller's ordered version table). - tls: provider must be one of {auto, cert-manager}; the matching providerCfg block must be present/complete; auto must not carry cert-manager config. - storageSpec: immutable once set. Defaulting: - tls.provider defaults to "auto" when a TLS block is present without a provider. Generate config/webhook + config/certmanager (cert-manager serving cert) and enable the WEBHOOK/CERTMANAGER kustomize sections, including CA injection. Tested: unit tests assert the exact rejection message text; envtest registers the webhook with a test manager and applies invalid CRs to assert rejection + message; a new test/e2e/webhook_test.go authors live apply-invalid assertions. Refs #380 Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- api/v1alpha1/etcdcluster_webhook.go | 376 ++++++++++++++++++++++ api/v1alpha1/etcdcluster_webhook_test.go | 345 ++++++++++++++++++++ api/v1alpha1/webhook_envtest_test.go | 270 ++++++++++++++++ cmd/main.go | 11 + config/certmanager/certificate.yaml | 32 ++ config/certmanager/kustomization.yaml | 5 + config/certmanager/kustomizeconfig.yaml | 8 + config/default/kustomization.yaml | 239 ++++++-------- config/default/manager_webhook_patch.yaml | 25 ++ config/webhook/kustomization.yaml | 6 + config/webhook/kustomizeconfig.yaml | 22 ++ config/webhook/manifests.yaml | 52 +++ config/webhook/service.yaml | 12 + test/e2e/webhook_test.go | 138 ++++++++ 14 files changed, 1408 insertions(+), 133 deletions(-) create mode 100644 api/v1alpha1/etcdcluster_webhook.go create mode 100644 api/v1alpha1/etcdcluster_webhook_test.go create mode 100644 api/v1alpha1/webhook_envtest_test.go create mode 100644 config/certmanager/certificate.yaml create mode 100644 config/certmanager/kustomization.yaml create mode 100644 config/certmanager/kustomizeconfig.yaml create mode 100644 config/default/manager_webhook_patch.yaml create mode 100644 config/webhook/kustomization.yaml create mode 100644 config/webhook/kustomizeconfig.yaml create mode 100644 config/webhook/manifests.yaml create mode 100644 config/webhook/service.yaml create mode 100644 test/e2e/webhook_test.go diff --git a/api/v1alpha1/etcdcluster_webhook.go b/api/v1alpha1/etcdcluster_webhook.go new file mode 100644 index 00000000..2b42bab0 --- /dev/null +++ b/api/v1alpha1/etcdcluster_webhook.go @@ -0,0 +1,376 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/coreos/go-semver/semver" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" + ctrl "sigs.k8s.io/controller-runtime" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + etcdversions "go.etcd.io/etcd/api/v3/version" +) + +// etcdclusterlog is the logger used by the EtcdCluster webhooks. Every admission +// decision is logged with enough structured context (name, namespace, the +// offending field, and the user-facing remediation) that an operator reading the +// webhook-server logs can reconstruct exactly why a request was accepted, +// defaulted, or rejected. +var etcdclusterlog = logf.Log.WithName("etcdcluster-webhook") + +// Known TLS providers. Kept in one place so the validator's accepted-set and the +// defaulter agree, and so the error message can enumerate the valid choices. +const ( + tlsProviderAuto = "auto" + tlsProviderCertManager = "cert-manager" +) + +// knownTLSProviders is the canonical, lower-cased set of providers the operator +// understands, in a deterministic order for stable error messages. +var knownTLSProviders = []string{tlsProviderAuto, tlsProviderCertManager} + +// SetupWebhookWithManager registers the validating and defaulting webhooks for +// EtcdCluster with the manager's webhook server. This is the single entry point +// called from cmd/main.go; previously the webhook server was started but nothing +// was ever registered with it (see issue #380). +func (r *EtcdCluster) SetupWebhookWithManager(mgr ctrl.Manager) error { + etcdclusterlog.Info("registering EtcdCluster admission webhooks (validating + defaulting)") + return ctrl.NewWebhookManagedBy(mgr, r). + WithCustomValidator(&EtcdClusterCustomValidator{}). + WithCustomDefaulter(&EtcdClusterCustomDefaulter{}). + Complete() +} + +// +kubebuilder:webhook:path=/mutate-operator-etcd-io-v1alpha1-etcdcluster,mutating=true,failurePolicy=fail,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=metcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 + +// EtcdClusterCustomDefaulter applies defaults to EtcdCluster objects on admission. +// +kubebuilder:object:generate=false +type EtcdClusterCustomDefaulter struct{} + +var _ admission.Defaulter[runtime.Object] = &EtcdClusterCustomDefaulter{} + +// Default implements webhook.CustomDefaulter. +func (d *EtcdClusterCustomDefaulter) Default(_ context.Context, obj runtime.Object) error { + ec, ok := obj.(*EtcdCluster) + if !ok { + return fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace) + applyEtcdClusterDefaults(ec, log) + return nil +} + +// applyEtcdClusterDefaults mutates ec in place to fill in sensible defaults. It is +// pure (aside from the supplied logger) so it can be unit tested directly. +func applyEtcdClusterDefaults(ec *EtcdCluster, log logr) { + // When a TLS block is present but no provider is named, the operator falls + // back to the "auto" provider (matches the comment on TLSCertificate.Provider). + // Materializing it here means downstream code and validation see an explicit, + // canonical value instead of "". + if ec.Spec.TLS != nil && strings.TrimSpace(ec.Spec.TLS.Provider) == "" { + ec.Spec.TLS.Provider = tlsProviderAuto + if log != nil { + log.Info("defaulting spec.tls.provider to \"auto\" (no provider specified)") + } + } +} + +// +kubebuilder:webhook:path=/validate-operator-etcd-io-v1alpha1-etcdcluster,mutating=false,failurePolicy=fail,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=vetcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 + +// EtcdClusterCustomValidator validates EtcdCluster objects on admission. +// +kubebuilder:object:generate=false +type EtcdClusterCustomValidator struct{} + +var _ admission.Validator[runtime.Object] = &EtcdClusterCustomValidator{} + +// etcdClusterGroupKind is used when building aggregated field errors into an +// apierrors.NewInvalid status so the API server returns a structured 422. +var etcdClusterGroupKind = schema.GroupKind{Group: GroupVersion.Group, Kind: "EtcdCluster"} + +// ValidateCreate implements webhook.CustomValidator. +func (v *EtcdClusterCustomValidator) ValidateCreate( + _ context.Context, obj runtime.Object, +) (admission.Warnings, error) { + ec, ok := obj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace, "operation", "create") + log.Info("validating EtcdCluster on create") + + errs := validateEtcdClusterSpec(&ec.Spec) + return finish(log, ec, errs) +} + +// ValidateUpdate implements webhook.CustomValidator. +func (v *EtcdClusterCustomValidator) ValidateUpdate( + _ context.Context, oldObj, newObj runtime.Object, +) (admission.Warnings, error) { + ec, ok := newObj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object for the new object but got %T", newObj) + } + oldEc, ok := oldObj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object for the old object but got %T", oldObj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace, "operation", "update") + log.Info("validating EtcdCluster on update") + + errs := validateEtcdClusterSpec(&ec.Spec) + errs = append(errs, validateEtcdClusterUpdate(&oldEc.Spec, &ec.Spec)...) + return finish(log, ec, errs) +} + +// ValidateDelete implements webhook.CustomValidator. Deletion is always allowed. +func (v *EtcdClusterCustomValidator) ValidateDelete( + _ context.Context, obj runtime.Object, +) (admission.Warnings, error) { + if _, ok := obj.(*EtcdCluster); !ok { + return nil, fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + return nil, nil +} + +// finish converts an accumulated field.ErrorList into either nil (accept) or a +// structured apierrors.NewInvalid (reject), logging the outcome either way. +func finish(log logr, ec *EtcdCluster, errs field.ErrorList) (admission.Warnings, error) { + if len(errs) == 0 { + log.Info("EtcdCluster admitted") + return nil, nil + } + // Log each rejection reason individually so the actionable detail is greppable. + for _, e := range errs { + log.Info("EtcdCluster rejected", "field", e.Field, "reason", e.Detail) + } + return nil, apierrors.NewInvalid(etcdClusterGroupKind, ec.Name, errs) +} + +// --------------------------------------------------------------------------- +// Pure validation helpers. These take only the spec(s) and return field errors +// with crisp, actionable Detail strings. They are unit-tested directly, including +// exact message text. +// --------------------------------------------------------------------------- + +// validateEtcdClusterSpec runs all create-time (and update-time) spec invariants. +func validateEtcdClusterSpec(spec *EtcdClusterSpec) field.ErrorList { + errs := make(field.ErrorList, 0, 3) + specPath := field.NewPath("spec") + + errs = append(errs, validateSize(spec.Size, specPath.Child("size"))...) + errs = append(errs, validateVersionFormat(spec.Version, specPath.Child("version"))...) + errs = append(errs, validateTLS(spec.TLS, specPath.Child("tls"))...) + + return errs +} + +// validateSize enforces size >= 1 and an odd member count. etcd forms a quorum of +// (n/2)+1; an even cluster has the same fault tolerance as the next-lower odd size +// while being strictly more likely to lose quorum, so even sizes are rejected. +func validateSize(size int, path *field.Path) field.ErrorList { + if size < 1 { + return field.ErrorList{field.Invalid(path, size, + fmt.Sprintf("size must be at least 1; got %d. Set spec.size to a positive odd number (e.g. 1, 3, or 5).", size))} + } + if size%2 == 0 { + return field.ErrorList{field.Invalid(path, size, + fmt.Sprintf("size must be an odd number so the cluster can form a majority quorum; got %d. "+ + "An even-sized etcd cluster tolerates no more failures than the next-smaller odd size while being more "+ + "likely to lose quorum. Use %d or %d instead.", size, size-1, size+1))} + } + return nil +} + +// validateVersionFormat ensures spec.version is non-empty and parses as semver. +func validateVersionFormat(version string, path *field.Path) field.ErrorList { + if strings.TrimSpace(version) == "" { + return field.ErrorList{field.Required(path, + "version is required; set spec.version to a semver etcd image tag such as \"3.6.1\".")} + } + if _, err := semver.NewVersion(version); err != nil { + return field.ErrorList{field.Invalid(path, version, + fmt.Sprintf("version %q is not a valid semantic version (expected MAJOR.MINOR.PATCH, e.g. \"3.6.1\"): %v.", + version, err))} + } + return nil +} + +// validateTLS checks that the TLS surface is internally coherent: the provider is +// one the operator understands, and the provider-specific config block required by +// that provider is present and complete. +func validateTLS(tls *TLSCertificate, path *field.Path) field.ErrorList { + if tls == nil { + return nil + } + var errs field.ErrorList + + // An empty provider is defaulted to "auto" by the defaulting webhook; treat it + // as "auto" here too so validation is correct even if the defaulter is bypassed + // (e.g. in envtest where only the validating webhook is registered). + provider := strings.TrimSpace(tls.Provider) + if provider == "" { + provider = tlsProviderAuto + } + + switch provider { + case tlsProviderAuto: + // auto provider self-generates certs; cert-manager config must not be set. + if tls.ProviderCfg.CertManagerCfg != nil { + errs = append(errs, field.Invalid(path.Child("providerCfg").Child("certManagerCfg"), + "", + "providerCfg.certManagerCfg must not be set when provider is \"auto\"; "+ + "either remove providerCfg.certManagerCfg or set provider to \"cert-manager\".")) + } + case tlsProviderCertManager: + cm := tls.ProviderCfg.CertManagerCfg + if cm == nil { + errs = append(errs, field.Required(path.Child("providerCfg").Child("certManagerCfg"), + "providerCfg.certManagerCfg is required when provider is \"cert-manager\"; "+ + "supply issuerKind and issuerName.")) + break + } + if strings.TrimSpace(cm.IssuerName) == "" { + errs = append(errs, field.Required(path.Child("providerCfg").Child("certManagerCfg").Child("issuerName"), + "issuerName is required for the cert-manager provider; set it to the name of an Issuer or ClusterIssuer.")) + } + if k := strings.TrimSpace(cm.IssuerKind); k != "" && k != "Issuer" && k != "ClusterIssuer" { + errs = append(errs, field.NotSupported( + path.Child("providerCfg").Child("certManagerCfg").Child("issuerKind"), + cm.IssuerKind, []string{"Issuer", "ClusterIssuer"})) + } + default: + errs = append(errs, field.NotSupported(path.Child("provider"), tls.Provider, knownTLSProviders)) + } + + return errs +} + +// validateEtcdClusterUpdate enforces transition invariants between the old and new +// spec: immutable fields and the supported etcd upgrade path. +func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorList { + var errs field.ErrorList + specPath := field.NewPath("spec") + + // storageSpec is immutable once set: changing the PVC template after the + // StatefulSet exists cannot be reconciled in place and risks data loss. + if oldSpec.StorageSpec != nil && newSpec.StorageSpec == nil { + errs = append(errs, field.Invalid(specPath.Child("storageSpec"), nil, + "storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster.")) + } else if oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && *oldSpec.StorageSpec != *newSpec.StorageSpec { + errs = append(errs, field.Invalid(specPath.Child("storageSpec"), newSpec.StorageSpec, + "storageSpec is immutable and cannot be changed once set; revert spec.storageSpec to its original value or recreate the cluster.")) + } + + // Upgrade-path check. Only meaningful when both versions parse and actually + // differ; format errors are already surfaced by validateVersionFormat. + errs = append(errs, validateUpgradePath(oldSpec.Version, newSpec.Version, specPath.Child("version"))...) + + return errs +} + +// validateUpgradePath rejects unsupported version transitions (skip-minor +// upgrades and downgrades), reusing the same ordered version table the controller +// uses at reconcile time. A no-op (equal versions) or an unparseable version is +// not reported here (the latter is handled by validateVersionFormat). +func validateUpgradePath(current, target string, path *field.Path) field.ErrorList { + if current == target { + return nil + } + if _, err := semver.NewVersion(current); err != nil { + return nil // old object had a bad version; nothing actionable to add here. + } + if _, err := semver.NewVersion(target); err != nil { + return nil // format error already reported by validateVersionFormat. + } + + if err := checkUpgradePath(etcdversions.AllVersions, current, target); err != nil { + return field.ErrorList{field.Invalid(path, target, + fmt.Sprintf("%v. etcd only supports sequential single-minor upgrades and forbids downgrades; "+ + "upgrade one minor version at a time (current %q -> target %q).", err, current, target))} + } + return nil +} + +// checkUpgradePath mirrors the controller's validateEtcdUpgradePath ordering logic +// but is local to the api package (controller internals are not importable here) +// and returns only the actionable error. supportedVersions must be ascending. +func checkUpgradePath(supportedVersions []semver.Version, current, target string) error { + currentVer, err := semver.NewVersion(current) + if err != nil { + return fmt.Errorf("failed to parse current version %s: %w", current, err) + } + targetVer, err := semver.NewVersion(target) + if err != nil { + return fmt.Errorf("failed to parse target version %s: %w", target, err) + } + + currentIdx, targetIdx := -1, -1 + for idx, v := range supportedVersions { + if v.Major == currentVer.Major && v.Minor == currentVer.Minor { + currentIdx = idx + } + if v.Major == targetVer.Major && v.Minor == targetVer.Minor { + targetIdx = idx + } + } + + switch { + case currentIdx == -1: + return fmt.Errorf("unknown current version %s (supported minor lines: %s)", + currentVer, supportedMinorLines(supportedVersions)) + case targetIdx == -1: + return fmt.Errorf("unknown target version %s (supported minor lines: %s)", + targetVer, supportedMinorLines(supportedVersions)) + case currentIdx > targetIdx || (currentIdx == targetIdx && currentVer.Patch > targetVer.Patch): + return fmt.Errorf("downgrading from version %s to version %s is not allowed", currentVer, targetVer) + case targetIdx > currentIdx+1: + return fmt.Errorf("upgrading from version %s to version %s is not allowed (skips a minor version)", + currentVer, targetVer) + } + return nil +} + +// supportedMinorLines renders the supported MAJOR.MINOR lines for error messages. +func supportedMinorLines(versions []semver.Version) string { + seen := make(map[string]struct{}, len(versions)) + var lines []string + for _, v := range versions { + key := fmt.Sprintf("%d.%d", v.Major, v.Minor) + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + lines = append(lines, key) + } + sort.Strings(lines) + return strings.Join(lines, ", ") +} + +// logr is the minimal logging surface applyEtcdClusterDefaults needs; it lets the +// unit tests pass nil while production code passes a real logr.Logger. +type logr interface { + Info(msg string, keysAndValues ...any) +} diff --git a/api/v1alpha1/etcdcluster_webhook_test.go b/api/v1alpha1/etcdcluster_webhook_test.go new file mode 100644 index 00000000..c1659558 --- /dev/null +++ b/api/v1alpha1/etcdcluster_webhook_test.go @@ -0,0 +1,345 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "context" + "errors" + "strings" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +// causeMessages flattens the aggregated field errors carried by an +// apierrors.NewInvalid status into ": " strings so tests can +// assert on the exact user-facing rejection text. +func causeMessages(t *testing.T, err error) []string { + t.Helper() + if err == nil { + return nil + } + var status apierrors.APIStatus + if !errors.As(err, &status) { + t.Fatalf("expected an apierrors status error, got %T: %v", err, err) + } + var msgs []string + if status.Status().Details != nil { + for _, c := range status.Status().Details.Causes { + msgs = append(msgs, c.Field+": "+c.Message) + } + } + return msgs +} + +func newCluster(size int, version string) *EtcdCluster { + return &EtcdCluster{Spec: EtcdClusterSpec{Size: size, Version: version}} +} + +func TestValidateCreate_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + cluster *EtcdCluster + wantOK bool + wantField string + wantMessage string // exact, fully-formed message text + }{ + { + name: "valid odd size with semver version", + cluster: newCluster(3, "3.6.1"), + wantOK: true, + }, + { + name: "even size is rejected with quorum guidance", + cluster: newCluster(4, "3.6.1"), + wantField: "spec.size", + wantMessage: "Invalid value: 4: size must be an odd number so the cluster can form a majority quorum; got 4. An even-sized etcd cluster tolerates no more failures than the next-smaller odd size while being more likely to lose quorum. Use 3 or 5 instead.", + }, + { + name: "zero size is rejected", + cluster: newCluster(0, "3.6.1"), + wantField: "spec.size", + wantMessage: "Invalid value: 0: size must be at least 1; got 0. Set spec.size to a positive odd number (e.g. 1, 3, or 5).", + }, + { + name: "empty version is rejected as required", + cluster: newCluster(1, ""), + wantField: "spec.version", + wantMessage: "Required value: version is required; set spec.version to a semver etcd image tag such as \"3.6.1\".", + }, + { + name: "non-semver version is rejected", + cluster: newCluster(1, "latest"), + wantField: "spec.version", + wantMessage: "Invalid value: \"latest\": version \"latest\" is not a valid semantic version (expected MAJOR.MINOR.PATCH, e.g. \"3.6.1\"): latest is not in dotted-tri format.", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateCreate(context.Background(), tc.cluster) + if tc.wantOK { + if err != nil { + t.Fatalf("expected admission to succeed, got error: %v", err) + } + return + } + msgs := causeMessages(t, err) + joined := strings.Join(msgs, " | ") + found := false + for _, m := range msgs { + if m == tc.wantField+": "+tc.wantMessage { + found = true + break + } + } + if !found { + t.Fatalf("did not find expected cause.\n want: %s: %s\n got: %s", + tc.wantField, tc.wantMessage, joined) + } + }) + } +} + +func TestValidateTLS_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + tls *TLSCertificate + wantOK bool + wantMessage string + }{ + { + name: "auto provider with no config is valid", + tls: &TLSCertificate{Provider: "auto"}, + wantOK: true, + }, + { + name: "empty provider is treated as auto and is valid", + tls: &TLSCertificate{}, + wantOK: true, + }, + { + name: "unknown provider lists the supported choices", + tls: &TLSCertificate{Provider: "vault"}, + wantMessage: "spec.tls.provider: Unsupported value: \"vault\": supported values: \"auto\", \"cert-manager\"", + }, + { + name: "cert-manager provider without config block", + tls: &TLSCertificate{Provider: "cert-manager"}, + wantMessage: "spec.tls.providerCfg.certManagerCfg: Required value: providerCfg.certManagerCfg is required when provider is \"cert-manager\"; supply issuerKind and issuerName.", + }, + { + name: "cert-manager provider missing issuerName", + tls: &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerKind: "Issuer"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg.issuerName: Required value: issuerName is required for the cert-manager provider; set it to the name of an Issuer or ClusterIssuer.", + }, + { + name: "cert-manager provider with bad issuerKind", + tls: &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerKind: "Bogus", IssuerName: "ca"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg.issuerKind: Unsupported value: \"Bogus\": supported values: \"Issuer\", \"ClusterIssuer\"", + }, + { + name: "auto provider must not carry cert-manager config", + tls: &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerName: "ca"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg: Invalid value: \"\": providerCfg.certManagerCfg must not be set when provider is \"auto\"; either remove providerCfg.certManagerCfg or set provider to \"cert-manager\".", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c := newCluster(1, "3.6.1") + c.Spec.TLS = tc.tls + _, err := v.ValidateCreate(context.Background(), c) + if tc.wantOK { + if err != nil { + t.Fatalf("expected admission to succeed, got: %v", err) + } + return + } + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if m == tc.wantMessage { + found = true + } + } + if !found { + t.Fatalf("did not find expected TLS cause.\n want: %s\n got: %s", + tc.wantMessage, strings.Join(msgs, " | ")) + } + }) + } +} + +func TestValidateUpdate_UpgradePathAndImmutability(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + old *EtcdCluster + updated *EtcdCluster + wantOK bool + wantMessage string + }{ + { + name: "same version no-op is allowed", + old: newCluster(3, "3.6.1"), + updated: newCluster(3, "3.6.2"), + wantOK: true, + }, + { + name: "single minor upgrade is allowed", + old: newCluster(3, "3.5.1"), + updated: newCluster(3, "3.6.1"), + wantOK: true, + }, + { + name: "skip-minor upgrade is rejected", + old: newCluster(3, "3.5.1"), + updated: newCluster(3, "3.7.1"), + wantMessage: "spec.version: Invalid value: \"3.7.1\": upgrading from version 3.5.1 to version 3.7.1 is not allowed (skips a minor version). etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"3.5.1\" -> target \"3.7.1\").", + }, + { + name: "downgrade is rejected", + old: newCluster(3, "3.6.1"), + updated: newCluster(3, "3.5.1"), + wantMessage: "spec.version: Invalid value: \"3.5.1\": downgrading from version 3.6.1 to version 3.5.1 is not allowed. etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"3.6.1\" -> target \"3.5.1\").", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateUpdate(context.Background(), tc.old, tc.updated) + if tc.wantOK { + if err != nil { + t.Fatalf("expected update to succeed, got: %v", err) + } + return + } + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if m == tc.wantMessage { + found = true + } + } + if !found { + t.Fatalf("did not find expected upgrade cause.\n want: %s\n got: %s", + tc.wantMessage, strings.Join(msgs, " | ")) + } + }) + } +} + +func TestValidateUpdate_StorageSpecImmutable(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + withStorage := func() *EtcdCluster { + c := newCluster(3, "3.6.1") + c.Spec.StorageSpec = &StorageSpec{StorageClassName: "standard"} + return c + } + + t.Run("removing storageSpec is rejected", func(t *testing.T) { + _, err := v.ValidateUpdate(context.Background(), withStorage(), newCluster(3, "3.6.1")) + want := "spec.storageSpec: Invalid value: null: storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster." + assertCause(t, err, want) + }) + + t.Run("changing storageSpec is rejected", func(t *testing.T) { + updated := withStorage() + updated.Spec.StorageSpec.StorageClassName = "fast" + _, err := v.ValidateUpdate(context.Background(), withStorage(), updated) + msgs := causeMessages(t, err) + ok := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.storageSpec: Invalid value:") && + strings.Contains(m, "storageSpec is immutable and cannot be changed once set") { + ok = true + } + } + if !ok { + t.Fatalf("expected storageSpec immutability error, got: %s", strings.Join(msgs, " | ")) + } + }) + + t.Run("unchanged storageSpec is allowed", func(t *testing.T) { + if _, err := v.ValidateUpdate(context.Background(), withStorage(), withStorage()); err != nil { + t.Fatalf("expected unchanged storageSpec to be allowed, got: %v", err) + } + }) +} + +func assertCause(t *testing.T, err error, want string) { + t.Helper() + msgs := causeMessages(t, err) + for _, m := range msgs { + if m == want { + return + } + } + t.Fatalf("did not find expected cause.\n want: %s\n got: %s", want, strings.Join(msgs, " | ")) +} + +func TestDefault_TLSProvider(t *testing.T) { + d := &EtcdClusterCustomDefaulter{} + + t.Run("empty provider defaults to auto", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{} + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS.Provider != "auto" { + t.Fatalf("expected provider to default to \"auto\", got %q", c.Spec.TLS.Provider) + } + }) + + t.Run("explicit provider is preserved", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{Provider: "cert-manager"} + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS.Provider != "cert-manager" { + t.Fatalf("expected provider preserved, got %q", c.Spec.TLS.Provider) + } + }) + + t.Run("nil TLS is left untouched", func(t *testing.T) { + c := newCluster(3, "3.6.1") + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS != nil { + t.Fatalf("expected TLS to stay nil") + } + }) +} diff --git a/api/v1alpha1/webhook_envtest_test.go b/api/v1alpha1/webhook_envtest_test.go new file mode 100644 index 00000000..be059a02 --- /dev/null +++ b/api/v1alpha1/webhook_envtest_test.go @@ -0,0 +1,270 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1_test + +import ( + "context" + "crypto/tls" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + + operatorv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1" +) + +// This file exercises the validating + defaulting webhooks against a real +// (envtest) API server: the generated webhook configurations are installed, a +// manager serves the webhook endpoints over the envtest-provisioned serving +// cert, and invalid EtcdCluster objects are applied to assert they are rejected +// with the exact, actionable messages produced by the webhook. + +var ( + whCfg *rest.Config + whEnv *envtest.Environment + whClient client.Client + whTestNS = "default" + whStarted bool +) + +func TestMain(m *testing.M) { + if os.Getenv("KUBEBUILDER_ASSETS") == "" { + // Without envtest binaries we cannot stand up an API server; the pure-unit + // tests in this package still run via the normal `go test` path. Skipping + // here keeps `go vet` / `go build` green on machines without assets. + fmt.Fprintln(os.Stderr, "KUBEBUILDER_ASSETS not set; skipping webhook envtest setup") + os.Exit(m.Run()) + } + + logf.SetLogger(zap.New(zap.UseDevMode(true))) + + envTestK8sVersion := os.Getenv("ENVTEST_K8S_VERSION") + if envTestK8sVersion == "" { + envTestK8sVersion = "1.31.0" + } + + whEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + BinaryAssetsDirectory: filepath.Join( + "..", "..", "bin", "k8s", + fmt.Sprintf("%s-%s-%s", envTestK8sVersion, runtime.GOOS, runtime.GOARCH), + ), + WebhookInstallOptions: envtest.WebhookInstallOptions{ + Paths: []string{filepath.Join("..", "..", "config", "webhook", "manifests.yaml")}, + }, + } + + var err error + whCfg, err = whEnv.Start() + if err != nil { + fmt.Fprintf(os.Stderr, "failed to start webhook envtest: %v\n", err) + os.Exit(1) + } + whStarted = true + + if err := operatorv1alpha1.AddToScheme(scheme.Scheme); err != nil { + fmt.Fprintf(os.Stderr, "failed to add scheme: %v\n", err) + os.Exit(1) + } + + whClient, err = client.New(whCfg, client.Options{Scheme: scheme.Scheme}) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create client: %v\n", err) + os.Exit(1) + } + + // Start a manager serving the webhooks on the envtest-provisioned cert. + wo := whEnv.WebhookInstallOptions + mgr, err := manager.New(whCfg, manager.Options{ + Scheme: scheme.Scheme, + Metrics: server.Options{BindAddress: "0"}, + WebhookServer: webhook.NewServer(webhook.Options{ + Host: wo.LocalServingHost, + Port: wo.LocalServingPort, + CertDir: wo.LocalServingCertDir, + TLSOpts: []func(*tls.Config){func(c *tls.Config) { c.MinVersion = tls.VersionTLS12 }}, + }), + }) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create manager: %v\n", err) + os.Exit(1) + } + if err := (&operatorv1alpha1.EtcdCluster{}).SetupWebhookWithManager(mgr); err != nil { + fmt.Fprintf(os.Stderr, "failed to register webhook: %v\n", err) + os.Exit(1) + } + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + if err := mgr.Start(ctx); err != nil { + fmt.Fprintf(os.Stderr, "manager exited: %v\n", err) + } + }() + + // Wait for the webhook server to accept connections before running tests. + if err := waitForWebhookServer(wo.LocalServingHost, wo.LocalServingPort); err != nil { + fmt.Fprintf(os.Stderr, "webhook server never became ready: %v\n", err) + cancel() + _ = whEnv.Stop() + os.Exit(1) + } + + code := m.Run() + + cancel() + if err := whEnv.Stop(); err != nil { + fmt.Fprintf(os.Stderr, "failed to stop envtest: %v\n", err) + } + os.Exit(code) +} + +func waitForWebhookServer(host string, port int) error { + addr := fmt.Sprintf("%s:%d", host, port) + dialer := &tls.Dialer{Config: &tls.Config{InsecureSkipVerify: true}} //nolint:gosec // test-only + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + conn, err := dialer.Dial("tcp", addr) + if err == nil { + _ = conn.Close() + return nil + } + time.Sleep(250 * time.Millisecond) + } + return fmt.Errorf("timed out dialing %s", addr) +} + +func skipIfNoEnvtest(t *testing.T) { + t.Helper() + if !whStarted { + t.Skip("webhook envtest not started (KUBEBUILDER_ASSETS unset)") + } +} + +func mkCluster(name string, size int, version string) *operatorv1alpha1.EtcdCluster { + return &operatorv1alpha1.EtcdCluster{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: whTestNS}, + Spec: operatorv1alpha1.EtcdClusterSpec{Size: size, Version: version}, + } +} + +func TestEnvtest_RejectsEvenSize(t *testing.T) { + skipIfNoEnvtest(t) + err := whClient.Create(context.Background(), mkCluster("even-size", 2, "3.6.1")) + if err == nil { + t.Fatal("expected even size to be rejected by the validating webhook") + } + if !strings.Contains(err.Error(), "size must be an odd number") { + t.Fatalf("unexpected rejection message: %v", err) + } + if !strings.Contains(err.Error(), "Use 1 or 3 instead") { + t.Fatalf("expected actionable remediation in message, got: %v", err) + } +} + +func TestEnvtest_RejectsBadVersion(t *testing.T) { + skipIfNoEnvtest(t) + err := whClient.Create(context.Background(), mkCluster("bad-version", 3, "not-semver")) + if err == nil { + t.Fatal("expected non-semver version to be rejected") + } + if !strings.Contains(err.Error(), "is not a valid semantic version") { + t.Fatalf("unexpected rejection message: %v", err) + } +} + +func TestEnvtest_RejectsUnknownTLSProvider(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("bad-tls", 3, "3.6.1") + c.Spec.TLS = &operatorv1alpha1.TLSCertificate{Provider: "vault"} + err := whClient.Create(context.Background(), c) + if err == nil { + t.Fatal("expected unknown TLS provider to be rejected") + } + if !strings.Contains(err.Error(), "supported values: \"auto\", \"cert-manager\"") { + t.Fatalf("unexpected rejection message: %v", err) + } +} + +func TestEnvtest_AcceptsValidAndDefaultsTLSProvider(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("valid-defaults", 3, "3.6.1") + c.Spec.TLS = &operatorv1alpha1.TLSCertificate{} // empty provider -> defaulted to "auto" + if err := whClient.Create(context.Background(), c); err != nil { + t.Fatalf("expected valid cluster to be admitted, got: %v", err) + } + t.Cleanup(func() { _ = whClient.Delete(context.Background(), c) }) + + var got operatorv1alpha1.EtcdCluster + if err := whClient.Get(context.Background(), + client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + if got.Spec.TLS == nil || got.Spec.TLS.Provider != "auto" { + t.Fatalf("expected defaulting webhook to set tls.provider=auto, got: %+v", got.Spec.TLS) + } +} + +func TestEnvtest_RejectsSkipMinorUpgrade(t *testing.T) { + skipIfNoEnvtest(t) + ctx := context.Background() + c := mkCluster("upgrade-guard", 3, "3.5.1") + if err := whClient.Create(ctx, c); err != nil { + t.Fatalf("failed to create initial cluster: %v", err) + } + t.Cleanup(func() { _ = whClient.Delete(ctx, c) }) + + var got operatorv1alpha1.EtcdCluster + if err := whClient.Get(ctx, client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + got.Spec.Version = "3.7.1" // skips 3.6 + err := whClient.Update(ctx, &got) + if err == nil { + t.Fatal("expected skip-minor upgrade to be rejected on update") + } + if !strings.Contains(err.Error(), "skips a minor version") { + t.Fatalf("unexpected upgrade rejection message: %v", err) + } + + // A single-minor upgrade must be accepted. + if err := whClient.Get(ctx, client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to re-read cluster: %v", err) + } + got.Spec.Version = "3.6.1" + if err := whClient.Update(ctx, &got); err != nil { + t.Fatalf("expected single-minor upgrade to be accepted, got: %v", err) + } +} + +// keep ctrl import used even if the manager construction changes shape. +var _ = ctrl.Log diff --git a/cmd/main.go b/cmd/main.go index a04983ec..af1ee171 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -158,6 +158,17 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "EtcdCluster") os.Exit(1) } + + // Register the EtcdCluster validating + defaulting admission webhooks. The + // webhook server is already wired above; this is the registration that was + // previously missing (see issue #380). Skip when ENABLE_WEBHOOKS=false, which + // is convenient for local `make run` without serving certs. + if os.Getenv("ENABLE_WEBHOOKS") != "false" { + if err = (&operatorv1alpha1.EtcdCluster{}).SetupWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "EtcdCluster") + os.Exit(1) + } + } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/config/certmanager/certificate.yaml b/config/certmanager/certificate.yaml new file mode 100644 index 00000000..b2d10ccf --- /dev/null +++ b/config/certmanager/certificate.yaml @@ -0,0 +1,32 @@ +# The following manifests contain a self-signed issuer CR and a certificate CR. +# More document can be found at https://docs.cert-manager.io +# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/name: etcd-operator + app.kubernetes.io/managed-by: kustomize + name: selfsigned-issuer + namespace: system +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/name: etcd-operator + app.kubernetes.io/managed-by: kustomize + name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml + namespace: system +spec: + # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize + # replacements in the config/default/kustomization.yaml file. + dnsNames: + - SERVICE_NAME.SERVICE_NAMESPACE.svc + - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: webhook-server-cert diff --git a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml new file mode 100644 index 00000000..bebea5a5 --- /dev/null +++ b/config/certmanager/kustomization.yaml @@ -0,0 +1,5 @@ +resources: +- certificate.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/certmanager/kustomizeconfig.yaml b/config/certmanager/kustomizeconfig.yaml new file mode 100644 index 00000000..cf6f89e8 --- /dev/null +++ b/config/certmanager/kustomizeconfig.yaml @@ -0,0 +1,8 @@ +# This configuration is for teaching kustomize how to update name ref substitution +nameReference: +- kind: Issuer + group: cert-manager.io + fieldSpecs: + - kind: Certificate + group: cert-manager.io + path: spec/issuerRef/name diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 792ef45c..c3b738d0 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ../manager # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- ../webhook +- ../webhook # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -#- ../certmanager +- ../certmanager # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. #- ../prometheus # [METRICS] Expose the controller manager metrics service. @@ -43,135 +43,108 @@ patches: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- path: manager_webhook_patch.yaml +- path: manager_webhook_patch.yaml + target: + kind: Deployment -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. -# Uncomment the following replacements to add the cert-manager CA injection annotations -#replacements: -# - source: # Uncomment the following block if you have any webhook -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.name # Name of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 0 -# create: true -# - source: -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.namespace # Namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true +# [CERTMANAGER] To enable cert-manager, the following replacements wire the +# cert-manager CA into the webhook configurations and the serving Certificate's +# dnsNames so the API server trusts the webhook server's TLS cert. +replacements: +# Inject the webhook Service name/namespace into the serving Certificate dnsNames. +- source: + kind: Service + version: v1 + name: webhook-service + fieldPath: .metadata.name + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 0 + create: true +- source: + kind: Service + version: v1 + name: webhook-service + fieldPath: .metadata.namespace + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 1 + create: true +# Inject the serving Certificate into the ValidatingWebhookConfiguration CA annotation. +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.namespace + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.name + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true +# Inject the serving Certificate into the MutatingWebhookConfiguration CA annotation. +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.namespace + targets: + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.name + targets: + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml new file mode 100644 index 00000000..39a7ffe0 --- /dev/null +++ b/config/default/manager_webhook_patch.yaml @@ -0,0 +1,25 @@ +# This patch mounts the webhook server's serving certificate (provisioned by +# cert-manager into the webhook-server-cert secret) into the manager container +# and exposes the webhook port. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: webhook-certs + readOnly: true + volumes: + - name: webhook-certs + secret: + secretName: webhook-server-cert diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml new file mode 100644 index 00000000..9cf26134 --- /dev/null +++ b/config/webhook/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- manifests.yaml +- service.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml new file mode 100644 index 00000000..206316e5 --- /dev/null +++ b/config/webhook/kustomizeconfig.yaml @@ -0,0 +1,22 @@ +# the following config is for teaching kustomize where to look at when substituting nameReference. +# It requires kustomize v2.1.0 or newer to work properly. +nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + - kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + +namespace: +- kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true +- kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml new file mode 100644 index 00000000..d2c926a0 --- /dev/null +++ b/config/webhook/manifests.yaml @@ -0,0 +1,52 @@ +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /mutate-operator-etcd-io-v1alpha1-etcdcluster + failurePolicy: Fail + name: metcdcluster-v1alpha1.kb.io + rules: + - apiGroups: + - operator.etcd.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - etcdclusters + sideEffects: None +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: validating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-operator-etcd-io-v1alpha1-etcdcluster + failurePolicy: Fail + name: vetcdcluster-v1alpha1.kb.io + rules: + - apiGroups: + - operator.etcd.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - etcdclusters + sideEffects: None diff --git a/config/webhook/service.yaml b/config/webhook/service.yaml new file mode 100644 index 00000000..56767793 --- /dev/null +++ b/config/webhook/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: webhook-service + namespace: system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: controller-manager diff --git a/test/e2e/webhook_test.go b/test/e2e/webhook_test.go new file mode 100644 index 00000000..00be46d3 --- /dev/null +++ b/test/e2e/webhook_test.go @@ -0,0 +1,138 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "strings" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/e2e-framework/pkg/envconf" + "sigs.k8s.io/e2e-framework/pkg/features" + + ecv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1" +) + +// newWebhookTestCluster builds an EtcdCluster with the given spec parameters. +// It is local to this file so the suite does not depend on helpers added by +// other in-flight e2e PRs. +func newWebhookTestCluster(name string, size int, version string) *ecv1alpha1.EtcdCluster { + return &ecv1alpha1.EtcdCluster{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "operator.etcd.io/v1alpha1", + Kind: "EtcdCluster", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: ecv1alpha1.EtcdClusterSpec{Size: size, Version: version}, + } +} + +// TestAdmissionWebhooks verifies that the validating and defaulting admission +// webhooks are registered and enforce the documented invariants at apply time. +// Each invalid apply must be rejected synchronously with an actionable message. +func TestAdmissionWebhooks(t *testing.T) { + feature := features.New("admission-webhooks") + + feature.Assess("rejects even cluster size with quorum guidance", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + err := c.Resources().Create(ctx, newWebhookTestCluster("wh-even", 2, etcdVersion)) + if err == nil { + t.Fatal("expected even-sized EtcdCluster to be rejected by the validating webhook") + } + if !strings.Contains(err.Error(), "size must be an odd number") { + t.Fatalf("expected quorum guidance in rejection, got: %v", err) + } + return ctx + }) + + feature.Assess("rejects non-semver version", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + err := c.Resources().Create(ctx, newWebhookTestCluster("wh-badver", 3, "not-a-version")) + if err == nil { + t.Fatal("expected non-semver version to be rejected") + } + if !strings.Contains(err.Error(), "is not a valid semantic version") { + t.Fatalf("expected semver guidance in rejection, got: %v", err) + } + return ctx + }) + + feature.Assess("rejects unknown TLS provider", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-badtls", 3, etcdVersion) + cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{Provider: "vault"} + err := c.Resources().Create(ctx, cluster) + if err == nil { + t.Fatal("expected unknown TLS provider to be rejected") + } + if !strings.Contains(err.Error(), "auto") || !strings.Contains(err.Error(), "cert-manager") { + t.Fatalf("expected supported-provider list in rejection, got: %v", err) + } + return ctx + }) + + feature.Assess("admits valid cluster and defaults tls.provider to auto", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-valid", 3, etcdVersion) + cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{} // empty -> defaulted + if err := c.Resources().Create(ctx, cluster); err != nil { + t.Fatalf("expected valid cluster to be admitted, got: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, cluster) }() + + var got ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + if got.Spec.TLS == nil || got.Spec.TLS.Provider != "auto" { + t.Fatalf("expected defaulting webhook to set tls.provider=auto, got: %+v", got.Spec.TLS) + } + return ctx + }) + + feature.Assess("rejects skip-minor upgrade on update", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-upgrade", 3, "3.5.1") + if err := c.Resources().Create(ctx, cluster); err != nil { + t.Fatalf("failed to create initial cluster: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, cluster) }() + + var got ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + got.Spec.Version = "3.7.1" // skips 3.6 + if err := c.Resources().Update(ctx, &got); err == nil { + t.Fatal("expected skip-minor upgrade to be rejected on update") + } else if !strings.Contains(err.Error(), "skips a minor version") { + t.Fatalf("expected skip-minor guidance in rejection, got: %v", err) + } + return ctx + }) + + _ = testEnv.Test(t, feature.Feature()) +} From 2204b9c381d23ad4ebfd5b174d0a98a3802993ed Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Thu, 18 Jun 2026 20:01:17 -0400 Subject: [PATCH 2/6] fix(webhook): compare storageSpec by value and scope update validation to changed fields Three correctness fixes to the EtcdCluster admission webhooks: C1: storageSpec immutability used struct `!=` on a type embedding resource.Quantity fields. `==`/`!=` compares the Quantity's cached string and *inf.Dec pointer, not the numeric value, so a re-spelled but numerically identical quantity (e.g. 1Gi vs 1024Mi) falsely tripped "storageSpec is immutable" under failurePolicy: Fail. Replace with apiequality.Semantic.DeepEqual (apimachinery, already a direct dep). Verified empirically: 1Gi == 1024Mi is false under `==` but true under Semantic.DeepEqual; 1Gi vs 2Gi remains unequal so immutability holds. C2: the storage immutability test only set StorageClassName, leaving both Quantities zero-valued, which is exactly the case where the buggy `==` happens to agree. Populate VolumeSizeRequest in withStorage() and add two cases: 1Gi vs 1024Mi must be allowed (C1 regression guard) and 1Gi vs 2Gi must still be rejected. C3: ValidateUpdate re-ran the full create-time spec validation, so with failurePolicy: Fail and no objectSelector a pre-existing even-sized or unknown-version cluster (the CRD historically only enforced size >= 1) was rejected on every spec edit, with no way to remediate. Only re-check the size and version-format invariants when those fields actually change; always re-validate TLS coherence and the transition (immutability + upgrade-path) checks. ValidateCreate is unchanged. Adds a test asserting legacy even-sized clusters accept unrelated edits and can be remediated, while a changed-but-still-even size is still rejected. Also documents that validateUpgradePath checks the declared spec transition only; the controller remains authoritative against the live image. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- api/v1alpha1/etcdcluster_webhook.go | 36 ++++++++-- api/v1alpha1/etcdcluster_webhook_test.go | 89 +++++++++++++++++++++++- 2 files changed, 118 insertions(+), 7 deletions(-) diff --git a/api/v1alpha1/etcdcluster_webhook.go b/api/v1alpha1/etcdcluster_webhook.go index 2b42bab0..24cacfd1 100644 --- a/api/v1alpha1/etcdcluster_webhook.go +++ b/api/v1alpha1/etcdcluster_webhook.go @@ -23,6 +23,7 @@ import ( "strings" "github.com/coreos/go-semver/semver" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" @@ -140,8 +141,7 @@ func (v *EtcdClusterCustomValidator) ValidateUpdate( log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace, "operation", "update") log.Info("validating EtcdCluster on update") - errs := validateEtcdClusterSpec(&ec.Spec) - errs = append(errs, validateEtcdClusterUpdate(&oldEc.Spec, &ec.Spec)...) + errs := validateEtcdClusterUpdate(&oldEc.Spec, &ec.Spec) return finish(log, ec, errs) } @@ -175,7 +175,8 @@ func finish(log logr, ec *EtcdCluster, errs field.ErrorList) (admission.Warnings // exact message text. // --------------------------------------------------------------------------- -// validateEtcdClusterSpec runs all create-time (and update-time) spec invariants. +// validateEtcdClusterSpec runs all create-time spec invariants. On update only the +// subset of these that changed is re-checked; see validateEtcdClusterUpdate. func validateEtcdClusterSpec(spec *EtcdClusterSpec) field.ErrorList { errs := make(field.ErrorList, 0, 3) specPath := field.NewPath("spec") @@ -268,18 +269,35 @@ func validateTLS(tls *TLSCertificate, path *field.Path) field.ErrorList { return errs } -// validateEtcdClusterUpdate enforces transition invariants between the old and new -// spec: immutable fields and the supported etcd upgrade path. +// validateEtcdClusterUpdate runs all update-time invariants between the old and +// new spec. Unlike ValidateCreate it does NOT re-run the full create-time spec +// validation: the CRD historically only enforced "size >= 1", so pre-existing +// clusters with an even size or an unknown-but-well-formed version are legal on +// the API server. With failurePolicy: Fail and no objectSelector, re-validating +// those invariants on every update would wedge unrelated edits (metadata, +// annotations, TLS) to legacy clusters. We therefore only enforce the size and +// version-format invariants when the offending field actually changed — keeping +// the guardrail for new values while letting legacy clusters be remediated. TLS +// coherence is cheap and self-contained, so it is always re-validated. func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorList { var errs field.ErrorList specPath := field.NewPath("spec") + if newSpec.Size != oldSpec.Size { + errs = append(errs, validateSize(newSpec.Size, specPath.Child("size"))...) + } + if newSpec.Version != oldSpec.Version { + errs = append(errs, validateVersionFormat(newSpec.Version, specPath.Child("version"))...) + } + errs = append(errs, validateTLS(newSpec.TLS, specPath.Child("tls"))...) + // storageSpec is immutable once set: changing the PVC template after the // StatefulSet exists cannot be reconciled in place and risks data loss. if oldSpec.StorageSpec != nil && newSpec.StorageSpec == nil { errs = append(errs, field.Invalid(specPath.Child("storageSpec"), nil, "storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster.")) - } else if oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && *oldSpec.StorageSpec != *newSpec.StorageSpec { + } else if oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && + !apiequality.Semantic.DeepEqual(oldSpec.StorageSpec, newSpec.StorageSpec) { errs = append(errs, field.Invalid(specPath.Child("storageSpec"), newSpec.StorageSpec, "storageSpec is immutable and cannot be changed once set; revert spec.storageSpec to its original value or recreate the cluster.")) } @@ -295,6 +313,12 @@ func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorLis // upgrades and downgrades), reusing the same ordered version table the controller // uses at reconcile time. A no-op (equal versions) or an unparseable version is // not reported here (the latter is handled by validateVersionFormat). +// +// Note: this validates the *declared* spec transition (old spec.version -> +// new spec.version), not the actual running version. The controller compares the +// live StatefulSet image against the spec at reconcile time and remains +// authoritative; if a prior upgrade stalled before convergence, a skip-minor edit +// could pass this declared-transition check yet still be caught by the controller. func validateUpgradePath(current, target string, path *field.Path) field.ErrorList { if current == target { return nil diff --git a/api/v1alpha1/etcdcluster_webhook_test.go b/api/v1alpha1/etcdcluster_webhook_test.go index c1659558..9a860617 100644 --- a/api/v1alpha1/etcdcluster_webhook_test.go +++ b/api/v1alpha1/etcdcluster_webhook_test.go @@ -23,6 +23,7 @@ import ( "testing" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" ) // causeMessages flattens the aggregated field errors carried by an @@ -258,12 +259,64 @@ func TestValidateUpdate_UpgradePathAndImmutability(t *testing.T) { } } +// TestValidateUpdate_LegacyClustersRemediable guards C3: pre-existing clusters +// whose values predate the create-time invariants (the CRD historically only +// enforced size >= 1) must not be frozen on update. Unchanged-but-non-conforming +// values are tolerated; a metadata-only edit must be allowed; but the invariant is +// still enforced the moment the offending field is actually changed. +func TestValidateUpdate_LegacyClustersRemediable(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + t.Run("even-sized legacy cluster: unrelated edit is allowed", func(t *testing.T) { + // size 4 would fail ValidateCreate, but it already exists; an unrelated + // (TLS) edit must not be rejected for the pre-existing even size. + old := newCluster(4, "3.6.1") + updated := newCluster(4, "3.6.1") + updated.Spec.TLS = &TLSCertificate{Provider: "auto"} + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected unrelated edit on legacy even-sized cluster to be allowed, got: %v", err) + } + }) + + t.Run("changing size to another even value is still rejected", func(t *testing.T) { + old := newCluster(4, "3.6.1") + updated := newCluster(6, "3.6.1") + _, err := v.ValidateUpdate(ctx, old, updated) + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.size: Invalid value:") && + strings.Contains(m, "size must be an odd number") { + found = true + } + } + if !found { + t.Fatalf("expected even-size rejection when size is changed, got: %s", strings.Join(msgs, " | ")) + } + }) + + t.Run("remediating size to a valid odd value is allowed", func(t *testing.T) { + old := newCluster(4, "3.6.1") + updated := newCluster(3, "3.6.1") + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected remediation 4 -> 3 to be allowed, got: %v", err) + } + }) +} + func TestValidateUpdate_StorageSpecImmutable(t *testing.T) { v := &EtcdClusterCustomValidator{} + // withStorage populates a non-zero VolumeSizeRequest on purpose: a zero-valued + // Quantity has an empty string cache and nil *inf.Dec, so struct == happens to + // agree with the numeric value and would mask the comparison bug C1 fixes. withStorage := func() *EtcdCluster { c := newCluster(3, "3.6.1") - c.Spec.StorageSpec = &StorageSpec{StorageClassName: "standard"} + c.Spec.StorageSpec = &StorageSpec{ + StorageClassName: "standard", + VolumeSizeRequest: resource.MustParse("1Gi"), + } return c } @@ -295,6 +348,40 @@ func TestValidateUpdate_StorageSpecImmutable(t *testing.T) { t.Fatalf("expected unchanged storageSpec to be allowed, got: %v", err) } }) + + // Regression guard for C1: a re-spelled but numerically identical quantity + // (1Gi vs 1024Mi) must NOT trip immutability. Struct == compares the cached + // string and *inf.Dec pointer, so it sees these as different; Semantic.DeepEqual + // compares the value and sees them as equal. + t.Run("numerically-equal re-spelled quantity is allowed", func(t *testing.T) { + old := withStorage() + old.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1Gi") + updated := withStorage() + updated.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1024Mi") + if _, err := v.ValidateUpdate(context.Background(), old, updated); err != nil { + t.Fatalf("expected 1Gi vs 1024Mi to be allowed (equal value), got: %v", err) + } + }) + + // A real size change (1Gi -> 2Gi) must still be rejected: immutability holds. + t.Run("changing the quantity value is rejected", func(t *testing.T) { + old := withStorage() + old.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1Gi") + updated := withStorage() + updated.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("2Gi") + _, err := v.ValidateUpdate(context.Background(), old, updated) + msgs := causeMessages(t, err) + ok := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.storageSpec: Invalid value:") && + strings.Contains(m, "storageSpec is immutable and cannot be changed once set") { + ok = true + } + } + if !ok { + t.Fatalf("expected storageSpec immutability error for 1Gi->2Gi, got: %s", strings.Join(msgs, " | ")) + } + }) } func assertCause(t *testing.T, err error, want string) { From 34edbc5bf8e3c1d591032a0848d3fb0c36f8a38b Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Thu, 18 Jun 2026 20:22:00 -0400 Subject: [PATCH 3/6] feat(webhook): production-harden admission webhook policy, cert wiring, and spec validation Completes the admission-webhook PR by making the generated webhook configuration explicit and safe for production, verifying the cert-manager caBundle injection chain renders end to end, and closing remaining gaps in synchronous spec validation. Webhook policy (generated via pinned controller-gen markers): - failurePolicy=fail is now an explicit, documented decision. These webhooks are correctness-critical for a stateful quorum store, so they fail closed. The rule scope is exactly {operator.etcd.io/v1alpha1 etcdclusters}, so a webhook outage never blocks core cluster objects; a documented break-glass objectSelector (label etcd.operator.etcd.io/skip-webhooks=true) lets an admin bypass admission for a single CR while the operator is down. - timeoutSeconds=10 and matchPolicy=Equivalent added with rationale. cert-manager caBundle injection: - Added test/config render assertions that run `kustomize build config/default` and verify the inject-ca-from annotation points at the serving Certificate, the Certificate's dnsNames resolve to the webhook Service FQDN (no surviving placeholders), the issuerRef and mounted secret name line up, and both webhooks carry the hardened policy + break-glass selector. Validation/defaulting completeness: - validateStorageSpec mirrors the controller's reconcile-time storage invariants (volumeSizeRequest >= 1Mi and required, volumeSizeLimit >= request, supported accessModes, pvcName required for ReadWriteMany) so they are caught synchronously at apply time with actionable messages; also validated when storage is newly added on update. - commonName is rejected when it exceeds the documented 64-char X.509 CN ceiling for both auto and cert-manager providers. - New unit tests assert the exact rejection-message text; new envtest + compile-only e2e cases cover undersized storage. Deferred (intentionally not built): a conversion webhook. It is premature while only v1alpha1 exists; matchPolicy=Equivalent already guards against a future equivalent group/version slipping past the validator. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- api/v1alpha1/etcdcluster_webhook.go | 144 ++++++++++- api/v1alpha1/etcdcluster_webhook_test.go | 159 ++++++++++++ api/v1alpha1/webhook_envtest_test.go | 16 ++ config/default/kustomization.yaml | 12 + config/default/webhook_selector_patch.yaml | 26 ++ config/webhook/manifests.yaml | 4 + test/config/webhook_render_test.go | 278 +++++++++++++++++++++ test/e2e/webhook_test.go | 18 ++ 8 files changed, 648 insertions(+), 9 deletions(-) create mode 100644 config/default/webhook_selector_patch.yaml create mode 100644 test/config/webhook_render_test.go diff --git a/api/v1alpha1/etcdcluster_webhook.go b/api/v1alpha1/etcdcluster_webhook.go index 24cacfd1..1be9dad0 100644 --- a/api/v1alpha1/etcdcluster_webhook.go +++ b/api/v1alpha1/etcdcluster_webhook.go @@ -23,8 +23,10 @@ import ( "strings" "github.com/coreos/go-semver/semver" + corev1 "k8s.io/api/core/v1" apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/util/validation/field" @@ -65,7 +67,33 @@ func (r *EtcdCluster) SetupWebhookWithManager(mgr ctrl.Manager) error { Complete() } -// +kubebuilder:webhook:path=/mutate-operator-etcd-io-v1alpha1-etcdcluster,mutating=true,failurePolicy=fail,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=metcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 +// Webhook policy rationale (applies to BOTH webhooks declared in this file). +// +// - failurePolicy=fail: these webhooks are correctness-critical. Defaulting +// (provider=auto) and validation (odd size, semver, supported upgrade path, +// storageSpec immutability) protect a stateful, quorum-based datastore where a +// bad spec admitted while the webhook is unreachable can produce a cluster that +// cannot form quorum or silent data loss on a botched PVC change. We therefore +// fail closed: if the API server cannot reach the webhook, the EtcdCluster +// create/update is rejected rather than admitted unchecked. +// - Failing closed does NOT brick the cluster. The rule scope is EXACTLY +// {operator.etcd.io/v1alpha1 etcdclusters}, so a webhook outage only blocks +// mutations to EtcdCluster CRs (a narrow, operator-owned resource); core +// workloads, nodes, and every other API object stay fully editable. As an +// explicit break-glass escape hatch, config/default injects an objectSelector +// so an admin can label a single CR with +// `etcd.operator.etcd.io/skip-webhooks: "true"` to bypass admission while the +// webhook is down (see config/default/webhook_selector_patch.yaml). +// - timeoutSeconds=10: validation is pure, in-memory, and dependency-free (no +// external calls), so 10s is generous headroom that still bounds API-server +// stalls if the webhook pod is wedged. With failurePolicy=fail a timeout is a +// rejection, which is the safe direction. +// - matchPolicy=Equivalent: match the request even if it arrives via a different +// but equivalent API group/version, so a future served/stored version cannot +// silently slip past the validator. Only v1alpha1 exists today, but Equivalent +// is the safe default and costs nothing now. + +// +kubebuilder:webhook:path=/mutate-operator-etcd-io-v1alpha1-etcdcluster,mutating=true,failurePolicy=fail,matchPolicy=Equivalent,timeoutSeconds=10,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=metcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 // EtcdClusterCustomDefaulter applies defaults to EtcdCluster objects on admission. // +kubebuilder:object:generate=false @@ -99,7 +127,11 @@ func applyEtcdClusterDefaults(ec *EtcdCluster, log logr) { } } -// +kubebuilder:webhook:path=/validate-operator-etcd-io-v1alpha1-etcdcluster,mutating=false,failurePolicy=fail,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=vetcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 +// See the "Webhook policy rationale" comment above the mutating webhook marker; +// failurePolicy=fail, matchPolicy=Equivalent, and timeoutSeconds=10 are chosen on +// the same grounds for the validating webhook. + +// +kubebuilder:webhook:path=/validate-operator-etcd-io-v1alpha1-etcdcluster,mutating=false,failurePolicy=fail,matchPolicy=Equivalent,timeoutSeconds=10,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=vetcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 // EtcdClusterCustomValidator validates EtcdCluster objects on admission. // +kubebuilder:object:generate=false @@ -184,6 +216,68 @@ func validateEtcdClusterSpec(spec *EtcdClusterSpec) field.ErrorList { errs = append(errs, validateSize(spec.Size, specPath.Child("size"))...) errs = append(errs, validateVersionFormat(spec.Version, specPath.Child("version"))...) errs = append(errs, validateTLS(spec.TLS, specPath.Child("tls"))...) + errs = append(errs, validateStorageSpec(spec.StorageSpec, specPath.Child("storageSpec"))...) + + return errs +} + +// minVolumeSizeRequest mirrors the controller's reconcile-time floor +// (internal/controller/utils.go) so the webhook rejects an undersized PVC +// synchronously at apply time instead of letting the StatefulSet build fail later +// with no actionable feedback. +var minVolumeSizeRequest = resource.MustParse("1Mi") + +// validateStorageSpec enforces the same storage invariants the controller checks +// while building the StatefulSet, but at admission time so the user gets an +// immediate, actionable rejection rather than a silently failing reconcile: +// +// - volumeSizeRequest is required and must be >= 1Mi. +// - volumeSizeLimit, if set, must be >= volumeSizeRequest (a PVC whose limit is +// below its request is rejected by the API server when the StatefulSet is +// created). +// - accessModes must be one the operator handles (ReadWriteOnce, ReadWriteMany, +// or empty -> ReadWriteOnce); ReadOnlyMany and others are unsupported. +// - pvcName is required when accessModes is ReadWriteMany (the operator mounts a +// pre-provisioned shared PVC by name in that mode). +func validateStorageSpec(s *StorageSpec, path *field.Path) field.ErrorList { + if s == nil { + return nil + } + var errs field.ErrorList + + switch { + case s.VolumeSizeRequest.IsZero(): + errs = append(errs, field.Required(path.Child("volumeSizeRequest"), + "volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".")) + case s.VolumeSizeRequest.Cmp(minVolumeSizeRequest) < 0: + errs = append(errs, field.Invalid(path.Child("volumeSizeRequest"), s.VolumeSizeRequest.String(), + fmt.Sprintf("volumeSizeRequest must be at least 1Mi; got %q. Request a larger persistent volume (e.g. \"1Gi\").", + s.VolumeSizeRequest.String()))) + } + + // Only meaningful to compare the limit when both are usable values; if the + // request itself is invalid the message above is the actionable one. + if !s.VolumeSizeLimit.IsZero() && !s.VolumeSizeRequest.IsZero() && + s.VolumeSizeLimit.Cmp(s.VolumeSizeRequest) < 0 { + errs = append(errs, field.Invalid(path.Child("volumeSizeLimit"), s.VolumeSizeLimit.String(), + fmt.Sprintf("volumeSizeLimit (%q) must be greater than or equal to volumeSizeRequest (%q); "+ + "raise volumeSizeLimit or lower volumeSizeRequest.", + s.VolumeSizeLimit.String(), s.VolumeSizeRequest.String()))) + } + + switch s.AccessModes { + case "", corev1.ReadWriteOnce: + // default / single-writer: nothing else required. + case corev1.ReadWriteMany: + if strings.TrimSpace(s.PVCName) == "" { + errs = append(errs, field.Required(path.Child("pvcName"), + "pvcName is required when accessModes is \"ReadWriteMany\"; set it to the name of the "+ + "pre-provisioned shared PersistentVolumeClaim the etcd pods should mount.")) + } + default: + errs = append(errs, field.NotSupported(path.Child("accessModes"), + s.AccessModes, []string{string(corev1.ReadWriteOnce), string(corev1.ReadWriteMany)})) + } return errs } @@ -236,32 +330,39 @@ func validateTLS(tls *TLSCertificate, path *field.Path) field.ErrorList { provider = tlsProviderAuto } + cfgPath := path.Child("providerCfg") switch provider { case tlsProviderAuto: // auto provider self-generates certs; cert-manager config must not be set. if tls.ProviderCfg.CertManagerCfg != nil { - errs = append(errs, field.Invalid(path.Child("providerCfg").Child("certManagerCfg"), + errs = append(errs, field.Invalid(cfgPath.Child("certManagerCfg"), "", "providerCfg.certManagerCfg must not be set when provider is \"auto\"; "+ "either remove providerCfg.certManagerCfg or set provider to \"cert-manager\".")) } + if ac := tls.ProviderCfg.AutoCfg; ac != nil { + errs = append(errs, validateCommonName(ac.CommonName, + cfgPath.Child("autoCfg").Child("commonName"))...) + } case tlsProviderCertManager: cm := tls.ProviderCfg.CertManagerCfg if cm == nil { - errs = append(errs, field.Required(path.Child("providerCfg").Child("certManagerCfg"), + errs = append(errs, field.Required(cfgPath.Child("certManagerCfg"), "providerCfg.certManagerCfg is required when provider is \"cert-manager\"; "+ "supply issuerKind and issuerName.")) break } if strings.TrimSpace(cm.IssuerName) == "" { - errs = append(errs, field.Required(path.Child("providerCfg").Child("certManagerCfg").Child("issuerName"), + errs = append(errs, field.Required(cfgPath.Child("certManagerCfg").Child("issuerName"), "issuerName is required for the cert-manager provider; set it to the name of an Issuer or ClusterIssuer.")) } if k := strings.TrimSpace(cm.IssuerKind); k != "" && k != "Issuer" && k != "ClusterIssuer" { errs = append(errs, field.NotSupported( - path.Child("providerCfg").Child("certManagerCfg").Child("issuerKind"), + cfgPath.Child("certManagerCfg").Child("issuerKind"), cm.IssuerKind, []string{"Issuer", "ClusterIssuer"})) } + errs = append(errs, validateCommonName(cm.CommonName, + cfgPath.Child("certManagerCfg").Child("commonName"))...) default: errs = append(errs, field.NotSupported(path.Child("provider"), tls.Provider, knownTLSProviders)) } @@ -269,6 +370,24 @@ func validateTLS(tls *TLSCertificate, path *field.Path) field.ErrorList { return errs } +// maxCommonNameLen is the X.509 CommonName ceiling documented on CommonConfig: a +// CN longer than 64 characters produces an invalid CSR, so the cert never issues. +// Catching it at admission turns a silent never-ready certificate into an +// immediate, actionable rejection. +const maxCommonNameLen = 64 + +// validateCommonName enforces the documented 64-character CommonName limit. An +// empty CommonName is valid (the provider derives a default). +func validateCommonName(cn string, path *field.Path) field.ErrorList { + if len(cn) <= maxCommonNameLen { + return nil + } + return field.ErrorList{field.Invalid(path, cn, + fmt.Sprintf("commonName must be %d characters or fewer to produce a valid X.509 CSR; got %d. "+ + "Shorten commonName (the certificate provider derives a default when it is empty).", + maxCommonNameLen, len(cn)))} +} + // validateEtcdClusterUpdate runs all update-time invariants between the old and // new spec. Unlike ValidateCreate it does NOT re-run the full create-time spec // validation: the CRD historically only enforced "size >= 1", so pre-existing @@ -293,13 +412,20 @@ func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorLis // storageSpec is immutable once set: changing the PVC template after the // StatefulSet exists cannot be reconciled in place and risks data loss. - if oldSpec.StorageSpec != nil && newSpec.StorageSpec == nil { + switch { + case oldSpec.StorageSpec != nil && newSpec.StorageSpec == nil: errs = append(errs, field.Invalid(specPath.Child("storageSpec"), nil, "storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster.")) - } else if oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && - !apiequality.Semantic.DeepEqual(oldSpec.StorageSpec, newSpec.StorageSpec) { + case oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && + !apiequality.Semantic.DeepEqual(oldSpec.StorageSpec, newSpec.StorageSpec): errs = append(errs, field.Invalid(specPath.Child("storageSpec"), newSpec.StorageSpec, "storageSpec is immutable and cannot be changed once set; revert spec.storageSpec to its original value or recreate the cluster.")) + case oldSpec.StorageSpec == nil && newSpec.StorageSpec != nil: + // Adding storage to a cluster that previously had none is permitted (it is + // not yet "set"), but the new block must itself be valid — otherwise the + // content invariants would only be enforced at create time and a malformed + // storageSpec could be introduced via update. + errs = append(errs, validateStorageSpec(newSpec.StorageSpec, specPath.Child("storageSpec"))...) } // Upgrade-path check. Only meaningful when both versions parse and actually diff --git a/api/v1alpha1/etcdcluster_webhook_test.go b/api/v1alpha1/etcdcluster_webhook_test.go index 9a860617..1e26b6f1 100644 --- a/api/v1alpha1/etcdcluster_webhook_test.go +++ b/api/v1alpha1/etcdcluster_webhook_test.go @@ -430,3 +430,162 @@ func TestDefault_TLSProvider(t *testing.T) { } }) } + +func TestValidateStorageSpec_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + // withStorage returns a valid 3-node cluster we can attach a storageSpec to. + withStorage := func(s *StorageSpec) *EtcdCluster { + c := newCluster(3, "3.6.1") + c.Spec.StorageSpec = s + return c + } + + tests := []struct { + name string + storage *StorageSpec + wantOK bool + wantMessage string // exact ": " + }{ + { + name: "valid RWO storage is accepted", + storage: &StorageSpec{VolumeSizeRequest: resource.MustParse("1Gi")}, + wantOK: true, + }, + { + name: "missing volumeSizeRequest is required", + storage: &StorageSpec{StorageClassName: "standard"}, + wantMessage: "spec.storageSpec.volumeSizeRequest: Required value: volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".", + }, + { + name: "volumeSizeRequest below 1Mi is rejected", + storage: &StorageSpec{VolumeSizeRequest: resource.MustParse("512Ki")}, + wantMessage: "spec.storageSpec.volumeSizeRequest: Invalid value: \"512Ki\": volumeSizeRequest must be at least 1Mi; got \"512Ki\". Request a larger persistent volume (e.g. \"1Gi\").", + }, + { + name: "volumeSizeLimit below request is rejected", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("2Gi"), + VolumeSizeLimit: resource.MustParse("1Gi"), + }, + wantMessage: "spec.storageSpec.volumeSizeLimit: Invalid value: \"1Gi\": volumeSizeLimit (\"1Gi\") must be greater than or equal to volumeSizeRequest (\"2Gi\"); raise volumeSizeLimit or lower volumeSizeRequest.", + }, + { + name: "volumeSizeLimit equal to request is accepted", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + VolumeSizeLimit: resource.MustParse("1Gi"), + }, + wantOK: true, + }, + { + name: "unsupported accessModes lists the supported choices", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadOnlyMany", + }, + wantMessage: "spec.storageSpec.accessModes: Unsupported value: \"ReadOnlyMany\": supported values: \"ReadWriteOnce\", \"ReadWriteMany\"", + }, + { + name: "ReadWriteMany without pvcName is required", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadWriteMany", + }, + wantMessage: "spec.storageSpec.pvcName: Required value: pvcName is required when accessModes is \"ReadWriteMany\"; set it to the name of the pre-provisioned shared PersistentVolumeClaim the etcd pods should mount.", + }, + { + name: "ReadWriteMany with pvcName is accepted", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadWriteMany", + PVCName: "shared-etcd-pvc", + }, + wantOK: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateCreate(context.Background(), withStorage(tc.storage)) + if tc.wantOK { + if err != nil { + t.Fatalf("expected storageSpec to be accepted, got: %v", err) + } + return + } + assertCause(t, err, tc.wantMessage) + }) + } +} + +// TestValidateStorageSpec_OnUpdate guards the update-path handling: a storageSpec +// added to a cluster that previously had none must still be validated, while the +// immutability rule continues to protect a storageSpec that was already set. +func TestValidateStorageSpec_OnUpdate(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + t.Run("adding an invalid storageSpec on update is rejected", func(t *testing.T) { + old := newCluster(3, "3.6.1") // no storage + updated := newCluster(3, "3.6.1") + updated.Spec.StorageSpec = &StorageSpec{StorageClassName: "standard"} // missing size + _, err := v.ValidateUpdate(ctx, old, updated) + assertCause(t, err, + "spec.storageSpec.volumeSizeRequest: Required value: volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".") + }) + + t.Run("adding a valid storageSpec on update is accepted", func(t *testing.T) { + old := newCluster(3, "3.6.1") + updated := newCluster(3, "3.6.1") + updated.Spec.StorageSpec = &StorageSpec{VolumeSizeRequest: resource.MustParse("1Gi")} + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected adding a valid storageSpec to be allowed, got: %v", err) + } + }) +} + +func TestValidateCommonName_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + // 65 characters: one over the documented 64-char X.509 CN ceiling. + tooLong := strings.Repeat("a", 65) + + t.Run("auto provider commonName over 64 chars is rejected", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{AutoCfg: &ProviderAutoConfig{CommonConfig: CommonConfig{CommonName: tooLong}}}, + } + _, err := v.ValidateCreate(ctx, c) + assertCause(t, err, + "spec.tls.providerCfg.autoCfg.commonName: Invalid value: \""+tooLong+"\": commonName must be 64 characters or fewer to produce a valid X.509 CSR; got 65. Shorten commonName (the certificate provider derives a default when it is empty).") + }) + + t.Run("cert-manager provider commonName over 64 chars is rejected", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{ + IssuerName: "ca", + IssuerKind: "Issuer", + CommonConfig: CommonConfig{CommonName: tooLong}, + }}, + } + _, err := v.ValidateCreate(ctx, c) + assertCause(t, err, + "spec.tls.providerCfg.certManagerCfg.commonName: Invalid value: \""+tooLong+"\": commonName must be 64 characters or fewer to produce a valid X.509 CSR; got 65. Shorten commonName (the certificate provider derives a default when it is empty).") + }) + + t.Run("commonName exactly 64 chars is accepted", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{AutoCfg: &ProviderAutoConfig{CommonConfig: CommonConfig{CommonName: strings.Repeat("a", 64)}}}, + } + if _, err := v.ValidateCreate(ctx, c); err != nil { + t.Fatalf("expected a 64-char commonName to be accepted, got: %v", err) + } + }) +} diff --git a/api/v1alpha1/webhook_envtest_test.go b/api/v1alpha1/webhook_envtest_test.go index be059a02..5d9502d4 100644 --- a/api/v1alpha1/webhook_envtest_test.go +++ b/api/v1alpha1/webhook_envtest_test.go @@ -27,6 +27,7 @@ import ( "testing" "time" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -202,6 +203,21 @@ func TestEnvtest_RejectsBadVersion(t *testing.T) { } } +func TestEnvtest_RejectsUndersizedStorage(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("bad-storage", 3, "3.6.1") + c.Spec.StorageSpec = &operatorv1alpha1.StorageSpec{ + VolumeSizeRequest: resource.MustParse("512Ki"), // below the 1Mi floor + } + err := whClient.Create(context.Background(), c) + if err == nil { + t.Fatal("expected undersized volumeSizeRequest to be rejected by the validating webhook") + } + if !strings.Contains(err.Error(), "volumeSizeRequest must be at least 1Mi") { + t.Fatalf("unexpected storage rejection message: %v", err) + } +} + func TestEnvtest_RejectsUnknownTLSProvider(t *testing.T) { skipIfNoEnvtest(t) c := mkCluster("bad-tls", 3, "3.6.1") diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index c3b738d0..63b8c310 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -47,6 +47,18 @@ patches: target: kind: Deployment +# [WEBHOOK] Break-glass escape hatch: add an objectSelector to both fail-closed +# webhook configurations so an admin can opt a single EtcdCluster out of admission +# (label etcd.operator.etcd.io/skip-webhooks=true) while the webhook is down. +- path: webhook_selector_patch.yaml + target: + group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration +- path: webhook_selector_patch.yaml + target: + group: admissionregistration.k8s.io + kind: MutatingWebhookConfiguration + # [CERTMANAGER] To enable cert-manager, the following replacements wire the # cert-manager CA into the webhook configurations and the serving Certificate's # dnsNames so the API server trusts the webhook server's TLS cert. diff --git a/config/default/webhook_selector_patch.yaml b/config/default/webhook_selector_patch.yaml new file mode 100644 index 00000000..db279451 --- /dev/null +++ b/config/default/webhook_selector_patch.yaml @@ -0,0 +1,26 @@ +# Break-glass escape hatch for the fail-closed (failurePolicy: Fail) webhooks. +# +# Both the validating and the mutating webhook fail closed so a bad EtcdCluster +# spec cannot be admitted while the webhook server is unreachable (see the +# "Webhook policy rationale" comment in api/v1alpha1/etcdcluster_webhook.go). The +# rule scope is already narrow (only operator.etcd.io/v1alpha1 etcdclusters), so an +# operator outage never blocks core cluster objects -- but it would still block +# remediating an EtcdCluster CR while the operator is down. +# +# This objectSelector adds a per-object opt-out: an admin can label the single CR +# that needs editing with +# +# etcd.operator.etcd.io/skip-webhooks: "true" +# +# and that object (and only that object) bypasses both webhooks. The default -- +# no label -- still matches, so every EtcdCluster is validated/defaulted unless an +# operator has explicitly turned the webhooks off for break-glass. matchExpressions +# uses NotIn so the absence of the label is treated as "selected". +- op: add + path: /webhooks/0/objectSelector + value: + matchExpressions: + - key: etcd.operator.etcd.io/skip-webhooks + operator: NotIn + values: + - "true" diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index d2c926a0..b30a93d6 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -12,6 +12,7 @@ webhooks: namespace: system path: /mutate-operator-etcd-io-v1alpha1-etcdcluster failurePolicy: Fail + matchPolicy: Equivalent name: metcdcluster-v1alpha1.kb.io rules: - apiGroups: @@ -24,6 +25,7 @@ webhooks: resources: - etcdclusters sideEffects: None + timeoutSeconds: 10 --- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration @@ -38,6 +40,7 @@ webhooks: namespace: system path: /validate-operator-etcd-io-v1alpha1-etcdcluster failurePolicy: Fail + matchPolicy: Equivalent name: vetcdcluster-v1alpha1.kb.io rules: - apiGroups: @@ -50,3 +53,4 @@ webhooks: resources: - etcdclusters sideEffects: None + timeoutSeconds: 10 diff --git a/test/config/webhook_render_test.go b/test/config/webhook_render_test.go new file mode 100644 index 00000000..472a468d --- /dev/null +++ b/test/config/webhook_render_test.go @@ -0,0 +1,278 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package config_test renders the deployment kustomization and asserts the +// production-hardening properties of the admission webhook wiring that are easy +// to break silently in YAML: the fail-closed policy and timeouts, the +// break-glass objectSelector, and the cert-manager caBundle injection chain +// (inject-ca-from annotation -> serving Certificate -> webhook Service DNS -> +// mounted secret). These are verified against the actual `kustomize build` +// output rather than the hand-maintained source files so a future edit to any +// link in the chain is caught. +package config_test + +import ( + "bytes" + "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" + admissionregv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/yaml" +) + +const skipLabelKey = "etcd.operator.etcd.io/skip-webhooks" + +// repoRoot returns the repository root (two levels up from test/config). +func repoRoot(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + return filepath.Clean(filepath.Join(wd, "..", "..")) +} + +// kustomizeBin resolves a kustomize binary: the repo-local bin/kustomize that +// `make kustomize` installs, otherwise one on PATH. Returns "" when neither is +// available so the test can skip rather than fail on a bare checkout. +func kustomizeBin(t *testing.T) string { + t.Helper() + local := filepath.Join(repoRoot(t), "bin", "kustomize") + if fi, err := os.Stat(local); err == nil && !fi.IsDir() { + return local + } + if p, err := exec.LookPath("kustomize"); err == nil { + return p + } + return "" +} + +// renderDefault runs `kustomize build config/default` and splits the multi-doc +// output, returning the decoded webhook configurations and serving Certificate. +func renderDefault(t *testing.T) ( + []admissionregv1.ValidatingWebhookConfiguration, + []admissionregv1.MutatingWebhookConfiguration, + []certmanagerv1.Certificate, +) { + t.Helper() + bin := kustomizeBin(t) + if bin == "" { + t.Skip("kustomize not found (run `make kustomize`); skipping config render assertions") + } + root := repoRoot(t) + cmd := exec.Command(bin, "build", filepath.Join(root, "config", "default")) + out, err := cmd.Output() + if err != nil { + if ee, ok := err.(*exec.ExitError); ok { + t.Fatalf("kustomize build failed: %v\nstderr:\n%s", err, ee.Stderr) + } + t.Fatalf("kustomize build failed: %v", err) + } + + var ( + vwcs []admissionregv1.ValidatingWebhookConfiguration + mwcs []admissionregv1.MutatingWebhookConfiguration + certs []certmanagerv1.Certificate + ) + dec := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(out), 4096) + for { + // Decode each document to canonical JSON, peek at its kind, then unmarshal + // into the matching typed object. yaml.Unmarshal on a generic map already + // produces JSON-compatible values, so encoding/json round-trips cleanly. + var raw json.RawMessage + if err := dec.Decode(&raw); err != nil { + break // io.EOF (or a malformed trailing doc) ends the stream. + } + if len(bytes.TrimSpace(raw)) == 0 { + continue + } + var meta metav1.TypeMeta + if err := json.Unmarshal(raw, &meta); err != nil { + continue + } + switch meta.Kind { + case "ValidatingWebhookConfiguration": + var o admissionregv1.ValidatingWebhookConfiguration + mustJSON(t, raw, &o) + vwcs = append(vwcs, o) + case "MutatingWebhookConfiguration": + var o admissionregv1.MutatingWebhookConfiguration + mustJSON(t, raw, &o) + mwcs = append(mwcs, o) + case "Certificate": + var o certmanagerv1.Certificate + mustJSON(t, raw, &o) + certs = append(certs, o) + } + } + return vwcs, mwcs, certs +} + +// mustJSON unmarshals a JSON document into out, failing the test on error. +func mustJSON(t *testing.T, raw json.RawMessage, out any) { + t.Helper() + if err := json.Unmarshal(raw, out); err != nil { + t.Fatalf("decoding %T: %v", out, err) + } +} + +// assertSkipSelector verifies the break-glass objectSelector: absence of the +// skip label must still select the object (NotIn "true"). +func assertSkipSelector(t *testing.T, sel *metav1.LabelSelector, who string) { + t.Helper() + if sel == nil { + t.Fatalf("%s: expected an objectSelector (break-glass escape hatch) but it was nil", who) + } + for _, expr := range sel.MatchExpressions { + if expr.Key != skipLabelKey { + continue + } + if expr.Operator != metav1.LabelSelectorOpNotIn { + t.Fatalf("%s: objectSelector on %q must use NotIn so an unlabeled CR is still validated; got %q", + who, skipLabelKey, expr.Operator) + } + if len(expr.Values) != 1 || expr.Values[0] != "true" { + t.Fatalf("%s: objectSelector on %q must exclude value \"true\"; got %v", + who, skipLabelKey, expr.Values) + } + return + } + t.Fatalf("%s: objectSelector is missing the break-glass key %q", who, skipLabelKey) +} + +func TestWebhookConfigsAreProductionHardened(t *testing.T) { + vwcs, mwcs, _ := renderDefault(t) + + if len(vwcs) != 1 { + t.Fatalf("expected exactly one ValidatingWebhookConfiguration, got %d", len(vwcs)) + } + if len(mwcs) != 1 { + t.Fatalf("expected exactly one MutatingWebhookConfiguration, got %d", len(mwcs)) + } + + check := func(who string, wh admissionregv1.ValidatingWebhook) { + if wh.FailurePolicy == nil || *wh.FailurePolicy != admissionregv1.Fail { + t.Errorf("%s: failurePolicy must be Fail (fail closed for correctness-critical validation); got %v", + who, wh.FailurePolicy) + } + if wh.MatchPolicy == nil || *wh.MatchPolicy != admissionregv1.Equivalent { + t.Errorf("%s: matchPolicy must be Equivalent; got %v", who, wh.MatchPolicy) + } + if wh.TimeoutSeconds == nil || *wh.TimeoutSeconds != 10 { + t.Errorf("%s: timeoutSeconds must be 10; got %v", who, wh.TimeoutSeconds) + } + if wh.SideEffects == nil || *wh.SideEffects != admissionregv1.SideEffectClassNone { + t.Errorf("%s: sideEffects must be None; got %v", who, wh.SideEffects) + } + assertSkipSelector(t, wh.ObjectSelector, who) + } + + for _, wh := range vwcs[0].Webhooks { + check("validating/"+wh.Name, wh) + } + for _, wh := range mwcs[0].Webhooks { + // MutatingWebhook shares the relevant fields; adapt via a tiny shim. + check("mutating/"+wh.Name, admissionregv1.ValidatingWebhook{ + Name: wh.Name, + FailurePolicy: wh.FailurePolicy, + MatchPolicy: wh.MatchPolicy, + TimeoutSeconds: wh.TimeoutSeconds, + SideEffects: wh.SideEffects, + ObjectSelector: wh.ObjectSelector, + }) + } +} + +func TestCertManagerCABundleInjectionWiring(t *testing.T) { + vwcs, mwcs, certs := renderDefault(t) + + if len(certs) != 1 { + t.Fatalf("expected exactly one serving Certificate, got %d", len(certs)) + } + cert := certs[0] + + // The inject-ca-from annotation on each webhook config must point at the + // rendered serving Certificate as namespace/name. + wantRef := cert.Namespace + "/" + cert.Name + const injectAnnotation = "cert-manager.io/inject-ca-from" + + assertInject := func(who string, ann map[string]string) { + got, ok := ann[injectAnnotation] + if !ok { + t.Fatalf("%s: missing %s annotation; cert-manager will not inject the caBundle", who, injectAnnotation) + } + if got != wantRef { + t.Fatalf("%s: %s = %q but the serving Certificate is %q; the CA will be injected from the wrong object", + who, injectAnnotation, got, wantRef) + } + } + assertInject("ValidatingWebhookConfiguration", vwcs[0].Annotations) + assertInject("MutatingWebhookConfiguration", mwcs[0].Annotations) + + // The Certificate must be issued by the rendered self-signed Issuer and write + // the secret the manager mounts (webhook-server-cert). + if cert.Spec.SecretName != "webhook-server-cert" { + t.Errorf("serving Certificate secretName must be webhook-server-cert (mounted by manager_webhook_patch); got %q", + cert.Spec.SecretName) + } + if cert.Spec.IssuerRef.Name == "" || cert.Spec.IssuerRef.Kind != "Issuer" { + t.Errorf("serving Certificate issuerRef must reference the self-signed Issuer; got %+v", cert.Spec.IssuerRef) + } + + // dnsNames must resolve to the actual webhook Service FQDN so the served cert + // validates against the clientConfig.service the API server dials. The + // replacements substitute SERVICE_NAME/SERVICE_NAMESPACE, so no placeholder + // must survive and the svc form must be present. + var sawSvc, sawClusterLocal bool + for _, dn := range cert.Spec.DNSNames { + if dn == "SERVICE_NAME.SERVICE_NAMESPACE.svc" || dn == "SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local" { + t.Fatalf("serving Certificate dnsNames still contains an unsubstituted placeholder %q", dn) + } + if strings.HasSuffix(dn, ".svc") { + sawSvc = true + } + if strings.HasSuffix(dn, ".svc.cluster.local") { + sawClusterLocal = true + } + } + if !sawSvc || !sawClusterLocal { + t.Fatalf("serving Certificate dnsNames must include both ..svc and .svc.cluster.local; got %v", + cert.Spec.DNSNames) + } + + // Cross-check the dnsNames name/namespace against the webhook clientConfig the + // API server will dial, so a future rename of the Service can't silently break + // TLS SAN matching. + svcName := vwcs[0].Webhooks[0].ClientConfig.Service.Name + svcNS := vwcs[0].Webhooks[0].ClientConfig.Service.Namespace + wantFQDN := svcName + "." + svcNS + ".svc" + found := false + for _, dn := range cert.Spec.DNSNames { + if dn == wantFQDN { + found = true + } + } + if !found { + t.Fatalf("serving Certificate dnsNames must include the webhook Service FQDN %q so the served cert's "+ + "SAN matches the dialed service; got %v", wantFQDN, cert.Spec.DNSNames) + } +} diff --git a/test/e2e/webhook_test.go b/test/e2e/webhook_test.go index 00be46d3..4eff2e22 100644 --- a/test/e2e/webhook_test.go +++ b/test/e2e/webhook_test.go @@ -21,6 +21,7 @@ import ( "strings" "testing" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" @@ -92,6 +93,23 @@ func TestAdmissionWebhooks(t *testing.T) { return ctx }) + feature.Assess("rejects undersized storageSpec volumeSizeRequest", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-badstorage", 3, etcdVersion) + cluster.Spec.StorageSpec = &ecv1alpha1.StorageSpec{ + VolumeSizeRequest: resource.MustParse("512Ki"), // below the 1Mi floor + } + err := c.Resources().Create(ctx, cluster) + if err == nil { + t.Fatal("expected undersized volumeSizeRequest to be rejected") + } + if !strings.Contains(err.Error(), "volumeSizeRequest must be at least 1Mi") { + t.Fatalf("expected storage-size guidance in rejection, got: %v", err) + } + return ctx + }) + feature.Assess("admits valid cluster and defaults tls.provider to auto", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() From 37731591662bf874944dda58222c1bcb15ad2ee8 Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Thu, 18 Jun 2026 22:15:52 -0400 Subject: [PATCH 4/6] test(e2e): install cert-manager before deploy so webhook suite can come up The admission-webhook feature wires config/default to ../certmanager: a cert-manager Certificate/Issuer issues the webhook-server-cert secret and manager_webhook_patch.yaml mounts it into the manager pod. The e2e suite's TestMain installed cert-manager only lazily inside an individual feature test that runs AFTER setup, so 'make deploy DEPLOY_MODE=e2e' ran with no cert-manager present: the Certificate/Issuer had no CRDs to apply against, the secret never materialized, the manager pod hung in ContainerCreating, and the DeploymentAvailable wait timed out before any webhook assertion could run. Install cert-manager (InstallCertManager blocks until cert-manager-webhook is Available) as a setup step BEFORE the deploy step. Verified live: without this the manager never becomes Available; with it the controller-manager is Available ~14s after deploy and all webhook assertions pass. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- test/e2e/e2e_suite_test.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 87d18d30..0aef6099 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -98,6 +98,28 @@ func TestMain(m *testing.M) { return ctx, nil }, + // install cert-manager BEFORE deploy. + // + // The admission-webhook feature wires config/default to reference + // ../certmanager: a cert-manager Certificate/Issuer issues the + // webhook-server-cert secret, and manager_webhook_patch.yaml mounts that + // secret into the manager pod. If cert-manager is absent at deploy time the + // Certificate/Issuer manifests have no CRDs to apply against and the secret + // never materializes, so the manager pod hangs in ContainerCreating and the + // DeploymentAvailable wait below times out before a single webhook assertion + // can run. cert-manager must therefore be installed (and its own webhook + // Ready) here, not lazily inside an individual feature test that runs after + // TestMain. InstallCertManager blocks until cert-manager-webhook is Available. + func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { + log.Println("Installing cert-manager...") + if err := test_utils.InstallCertManager(); err != nil { + log.Printf("Unable to install cert-manager: %s", err) + return ctx, err + } + + return ctx, nil + }, + // set up environment func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { // create namespace From e0339147fb70598daa0253621f4258ffa6e6303e Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Thu, 18 Jun 2026 22:16:06 -0400 Subject: [PATCH 5/6] test(e2e): make webhook e2e a true, isolated proof of admission The webhook e2e fed the suite-wide etcdVersion ("v3.6.12", v-prefixed) into every case. semver.NewVersion("v3.6.12") errors, so the new version webhook rejects it: the happy-path 'admits valid cluster' assertion failed outright, and the size/tls/storage reject cases passed for the WRONG reason (the aggregated NewInvalid also carried a version error, so a substring match no longer isolated the invariant under test). Improvements: - Use a hardcoded valid semver (3.6.1) for every non-version case, matching the envtest twin, so each assertion proves exactly its one invariant. - requireWebhookRejection helper: assert apierrors.IsInvalid (a real 422 from the validating webhook, not a CRD/schema rejection), assert the exact spec. path, and assert that unrelated invariants are NOT present (isolation) -- this is the check that kills the 'passes for the wrong reason' failure mode. - Tighten message asserts to the envtest twin's exact text: 'Use 1 or 3 instead' for even size, supported values: "auto", "cert-manager" for the bad provider (catches a garbled NotSupported list a loose Contains pair would miss). - Add the break-glass objectSelector proof: a skip-webhooks-labeled even-sized cluster is admitted (bypass) while the same spec unlabeled is still rejected. This is the one behavior only a live API server can prove (render test is static, envtest has no objectSelector) and the most dangerous new knob. - Prove the positive upgrade path (3.5.1 -> 3.6.1 admitted) after the skip-minor rejection, so a reject-all-updates regression cannot hide behind the negative case. - Use size 1 for the two clusters that are actually created (still odd/valid) to cut reconcile load on the kind node. No sleeps/polling added: admission is synchronous, so a Create/Update return IS the decision. Live: all 7 sub-asserts green; suite wall-clock ~127-231s, the 8 assertions themselves run in ~0.3s. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- test/e2e/webhook_test.go | 181 +++++++++++++++++++++++++++++++-------- 1 file changed, 145 insertions(+), 36 deletions(-) diff --git a/test/e2e/webhook_test.go b/test/e2e/webhook_test.go index 4eff2e22..7b7e6d17 100644 --- a/test/e2e/webhook_test.go +++ b/test/e2e/webhook_test.go @@ -21,6 +21,7 @@ import ( "strings" "testing" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/pkg/envconf" @@ -29,6 +30,26 @@ import ( ecv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1" ) +// webhookValidVersion is a hardcoded, well-formed semantic version used by every +// webhook test that is NOT specifically exercising version validation. +// +// It deliberately does NOT use the suite-wide etcdVersion (sourced from +// `go list -m {{.Version}} go.etcd.io/etcd/api/v3`), because that value is +// v-prefixed (e.g. "v3.6.12"). semver.NewVersion("v3.6.12") errors, so the new +// validateVersionFormat webhook rejects it. Feeding etcdVersion into these cases +// would (a) make the happy-path "admits valid cluster" assertion fail outright, +// and (b) cause the size/tls/storage reject cases to pass for the WRONG reason -- +// the aggregated rejection would also carry a version error, so a substring match +// could no longer isolate the invariant actually under test. A fixed valid semver +// keeps each assertion proving exactly the one thing it claims to prove. +const webhookValidVersion = "3.6.1" + +// skipWebhooksLabel is the break-glass opt-out label injected as an objectSelector +// by config/default/webhook_selector_patch.yaml. An EtcdCluster carrying this label +// bypasses BOTH admission webhooks (the objectSelector matches every object that +// does NOT have the label set to "true"). +const skipWebhooksLabel = "etcd.operator.etcd.io/skip-webhooks" + // newWebhookTestCluster builds an EtcdCluster with the given spec parameters. // It is local to this file so the suite does not depend on helpers added by // other in-flight e2e PRs. @@ -46,21 +67,65 @@ func newWebhookTestCluster(name string, size int, version string) *ecv1alpha1.Et } } +// requireWebhookRejection asserts that err is a rejection produced by the +// validating webhook (a structured 422 Invalid), that it names the expected +// spec field path, carries the expected actionable detail, and -- for isolation -- +// does NOT also carry any of the forbidden substrings (errors that belong to a +// different invariant). The isolation check is what catches the "passes for the +// wrong reason" failure mode where an aggregated NewInvalid happens to contain the +// asserted substring while really failing on an unrelated field (e.g. a bad +// version riding along on a size test). +func requireWebhookRejection(t *testing.T, err error, wantField, wantDetail string, forbidden ...string) { + t.Helper() + if err == nil { + t.Fatalf("expected the validating webhook to reject the request (field %q), but Create/Update succeeded", wantField) + } + if !apierrors.IsInvalid(err) { + t.Fatalf("expected an apierrors.IsInvalid (422) rejection from the validating webhook, got %T: %v", err, err) + } + msg := err.Error() + if !strings.Contains(msg, wantField) { + t.Fatalf("expected rejection to name field %q, got: %v", wantField, err) + } + if !strings.Contains(msg, wantDetail) { + t.Fatalf("expected rejection detail %q, got: %v", wantDetail, err) + } + for _, f := range forbidden { + if strings.Contains(msg, f) { + t.Fatalf("rejection leaked an unrelated invariant %q (test is not isolating %q); got: %v", f, wantField, err) + } + } +} + // TestAdmissionWebhooks verifies that the validating and defaulting admission -// webhooks are registered and enforce the documented invariants at apply time. -// Each invalid apply must be rejected synchronously with an actionable message. +// webhooks are registered behind a real cert-manager-issued serving cert and +// enforce the documented invariants at apply time through the live API server. +// Each invalid apply must be rejected synchronously, as a structured Invalid +// (422), with an actionable, field-scoped message; each valid apply must be +// admitted (and defaulted). It additionally proves the two behaviors that only an +// e2e against a live API server can prove: the kustomize CA-injection chain (every +// rejection here is a real API-server -> webhook-pod call over TLS) and the +// break-glass objectSelector opt-out. +// +// Admission is synchronous, so the test body contains no sleeps or polling: a +// successful Create/Update return IS the admission decision. func TestAdmissionWebhooks(t *testing.T) { feature := features.New("admission-webhooks") + // --- Negative cases: each isolates exactly one invariant. ----------------- + feature.Assess("rejects even cluster size with quorum guidance", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() - err := c.Resources().Create(ctx, newWebhookTestCluster("wh-even", 2, etcdVersion)) - if err == nil { - t.Fatal("expected even-sized EtcdCluster to be rejected by the validating webhook") - } - if !strings.Contains(err.Error(), "size must be an odd number") { - t.Fatalf("expected quorum guidance in rejection, got: %v", err) + err := c.Resources().Create(ctx, newWebhookTestCluster("wh-even", 2, webhookValidVersion)) + requireWebhookRejection(t, err, + "spec.size", + "size must be an odd number", + // isolation + remediation: must not be failing on the version, and + // must carry the concrete quorum remediation the envtest twin asserts. + "is not a valid semantic version") + if !strings.Contains(err.Error(), "Use 1 or 3 instead") { + t.Fatalf("expected concrete quorum remediation %q, got: %v", "Use 1 or 3 instead", err) } return ctx }) @@ -69,52 +134,53 @@ func TestAdmissionWebhooks(t *testing.T) { func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() err := c.Resources().Create(ctx, newWebhookTestCluster("wh-badver", 3, "not-a-version")) - if err == nil { - t.Fatal("expected non-semver version to be rejected") - } - if !strings.Contains(err.Error(), "is not a valid semantic version") { - t.Fatalf("expected semver guidance in rejection, got: %v", err) - } + requireWebhookRejection(t, err, + "spec.version", + "is not a valid semantic version", + // size is valid (3) here, so a size error would mean the wrong path fired. + "size must be an odd number") return ctx }) feature.Assess("rejects unknown TLS provider", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() - cluster := newWebhookTestCluster("wh-badtls", 3, etcdVersion) + cluster := newWebhookTestCluster("wh-badtls", 3, webhookValidVersion) cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{Provider: "vault"} err := c.Resources().Create(ctx, cluster) - if err == nil { - t.Fatal("expected unknown TLS provider to be rejected") - } - if !strings.Contains(err.Error(), "auto") || !strings.Contains(err.Error(), "cert-manager") { - t.Fatalf("expected supported-provider list in rejection, got: %v", err) - } + requireWebhookRejection(t, err, + "spec.tls.provider", + // exact NotSupported rendering -- catches a garbled supported-values list + // that a loose Contains("auto") && Contains("cert-manager") would miss. + `supported values: "auto", "cert-manager"`, + "size must be an odd number", "is not a valid semantic version") return ctx }) feature.Assess("rejects undersized storageSpec volumeSizeRequest", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() - cluster := newWebhookTestCluster("wh-badstorage", 3, etcdVersion) + cluster := newWebhookTestCluster("wh-badstorage", 3, webhookValidVersion) cluster.Spec.StorageSpec = &ecv1alpha1.StorageSpec{ VolumeSizeRequest: resource.MustParse("512Ki"), // below the 1Mi floor } err := c.Resources().Create(ctx, cluster) - if err == nil { - t.Fatal("expected undersized volumeSizeRequest to be rejected") - } - if !strings.Contains(err.Error(), "volumeSizeRequest must be at least 1Mi") { - t.Fatalf("expected storage-size guidance in rejection, got: %v", err) - } + requireWebhookRejection(t, err, + "spec.storageSpec.volumeSizeRequest", + "volumeSizeRequest must be at least 1Mi", + "size must be an odd number", "is not a valid semantic version") return ctx }) + // --- Positive case: admitted + defaulted. --------------------------------- + feature.Assess("admits valid cluster and defaults tls.provider to auto", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() - cluster := newWebhookTestCluster("wh-valid", 3, etcdVersion) - cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{} // empty -> defaulted + // size 1 (still odd/valid) minimizes the reconcile load this otherwise + // pure-admission test imposes on the kind node. + cluster := newWebhookTestCluster("wh-valid", 1, webhookValidVersion) + cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{} // empty provider -> defaulted if err := c.Resources().Create(ctx, cluster); err != nil { t.Fatalf("expected valid cluster to be admitted, got: %v", err) } @@ -130,27 +196,70 @@ func TestAdmissionWebhooks(t *testing.T) { return ctx }) - feature.Assess("rejects skip-minor upgrade on update", + // --- Update path: rejects skip-minor, admits a single-minor upgrade. ------ + + feature.Assess("guards version upgrades on update (rejects skip-minor, admits single-minor)", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { c := cfg.Client() - cluster := newWebhookTestCluster("wh-upgrade", 3, "3.5.1") + cluster := newWebhookTestCluster("wh-upgrade", 1, "3.5.1") if err := c.Resources().Create(ctx, cluster); err != nil { t.Fatalf("failed to create initial cluster: %v", err) } defer func() { _ = c.Resources().Delete(ctx, cluster) }() + // Skip-minor (3.5 -> 3.7) must be rejected. var got ecv1alpha1.EtcdCluster if err := c.Resources().Get(ctx, cluster.Name, namespace, &got); err != nil { t.Fatalf("failed to read back cluster: %v", err) } got.Spec.Version = "3.7.1" // skips 3.6 - if err := c.Resources().Update(ctx, &got); err == nil { - t.Fatal("expected skip-minor upgrade to be rejected on update") - } else if !strings.Contains(err.Error(), "skips a minor version") { - t.Fatalf("expected skip-minor guidance in rejection, got: %v", err) + err := c.Resources().Update(ctx, &got) + requireWebhookRejection(t, err, "spec.version", "skips a minor version") + + // Single-minor (3.5 -> 3.6) must be admitted. This proves the validator + // is not simply over-blocking ALL updates -- the negative case alone + // cannot distinguish a correct guard from a reject-everything bug. + var cur ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &cur); err != nil { + t.Fatalf("failed to re-read cluster before single-minor update: %v", err) + } + cur.Spec.Version = "3.6.1" + if err := c.Resources().Update(ctx, &cur); err != nil { + t.Fatalf("expected single-minor upgrade 3.5.1 -> 3.6.1 to be admitted, got: %v", err) } return ctx }) + // --- Break-glass: the objectSelector opt-out. ----------------------------- + // + // This is the assertion that justifies paying the kind+build+deploy cost: it + // cannot be exercised by envtest or the static render test. A wrong operator + // (In vs NotIn) or a wrong value in webhook_selector_patch.yaml would silently + // disable admission fleet-wide; here we prove against a LIVE API server that + // (a) the label bypasses the otherwise-fatal even-size rejection, and + // (b) the very same invalid spec WITHOUT the label is still rejected -- so the + // bypass is genuinely scoped to the labeled object, not globally off. + + feature.Assess("break-glass skip-webhooks label bypasses admission for that object only", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + + // Labeled: an even size (2) that the webhook would normally reject must be + // ADMITTED because the objectSelector excludes it. + labeled := newWebhookTestCluster("wh-skip-labeled", 2, webhookValidVersion) + labeled.Labels = map[string]string{skipWebhooksLabel: "true"} + if err := c.Resources().Create(ctx, labeled); err != nil { + t.Fatalf("expected break-glass-labeled even-sized cluster to bypass the webhook and be admitted, got: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, labeled) }() + + // Unlabeled control: the SAME invalid spec must still be rejected, proving + // the bypass is per-object and admission is otherwise live. + unlabeled := newWebhookTestCluster("wh-skip-unlabeled", 2, webhookValidVersion) + err := c.Resources().Create(ctx, unlabeled) + requireWebhookRejection(t, err, "spec.size", "size must be an odd number") + return ctx + }) + _ = testEnv.Test(t, feature.Feature()) } From cefcfa37dee7f8e8eeb32c3096618798890d6ffd Mon Sep 17 00:00:00 2001 From: Xavier Lange Date: Fri, 19 Jun 2026 01:19:39 -0400 Subject: [PATCH 6/6] fix(webhook,controller): accept v-prefixed etcd versions and render v-tagged images The admission webhook validated spec.version with coreos/go-semver, which rejects a leading "v". But the default registry (gcr.io/etcd-development/etcd) publishes ONLY v-prefixed tags, and the controller built the image as ":". So with the webhook on: - "v3.6.1" (the form config/samples actually ships -> v3.5.21, matching origin/main) was REJECTED by the validator, and - a bare "3.6.1" that the validator did accept rendered "gcr.io/etcd-development/etcd:3.6.1", a tag the registry does not publish -> NotFound / ImagePullBackOff. The default image was therefore unpullable whenever the webhook was enabled. Fix, end to end: - Webhook NORMALIZES spec.version: a leading "v" is stripped before go-semver parses it, so BOTH "v3.6.1" and "3.6.1" validate, including the upgrade-path comparison (validateUpgradePath/checkUpgradePath) and the no-op short-circuit. This keeps the operator's own v-prefixed samples appliable rather than self-rejecting. - Controller NORMALIZES the rendered tag to the registry convention (internal/controller/utils.go: etcdImageTag) by prepending "v" when the user supplied a bare semver, so a bare spec.version still pulls. An already-v-prefixed value and an explicit non-semver custom tag are passed through untouched so non-default registries are not disturbed. - Controller's reconcile-time upgrade check now compares and parses on the normalized form, so the v-prefixed live image tag and a bare spec.version for the same release are not mistaken for a version change (and parse cleanly through validateEtcdUpgradePath). Tests: webhook accepts "v3.6.1" and "3.6.1" and a v-prefixed single-minor upgrade path (skip-minor still rejected); etcdImageTag and an end-to-end StatefulSet render assert the tag is v-prefixed for the default registry. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Xavier Lange --- api/v1alpha1/etcdcluster_webhook.go | 31 ++++++++-- api/v1alpha1/etcdcluster_webhook_test.go | 33 +++++++++++ internal/controller/etcdcluster_controller.go | 6 +- internal/controller/utils.go | 41 +++++++++++++- internal/controller/utils_test.go | 56 +++++++++++++++++++ 5 files changed, 156 insertions(+), 11 deletions(-) diff --git a/api/v1alpha1/etcdcluster_webhook.go b/api/v1alpha1/etcdcluster_webhook.go index 1be9dad0..86e830a0 100644 --- a/api/v1alpha1/etcdcluster_webhook.go +++ b/api/v1alpha1/etcdcluster_webhook.go @@ -299,13 +299,28 @@ func validateSize(size int, path *field.Path) field.ErrorList { return nil } +// normalizeEtcdVersion strips an optional leading "v" from an etcd version string +// so it can be parsed by the CoreOS go-semver library, which rejects the "v" +// prefix. The canonical etcd release images at the default registry +// (gcr.io/etcd-development/etcd) are published ONLY with v-prefixed tags (e.g. +// "v3.5.21" — see config/samples), so users legitimately write spec.version with +// or without the "v". Both forms must validate identically here, and the +// controller (internal/controller/utils.go: etcdImageTag) re-applies the "v" the +// registry expects when rendering the image, so a bare spec.version still pulls. +func normalizeEtcdVersion(version string) string { + return strings.TrimPrefix(strings.TrimSpace(version), "v") +} + // validateVersionFormat ensures spec.version is non-empty and parses as semver. +// A leading "v" is accepted (and stripped before parsing) because the default +// etcd-development registry only publishes v-prefixed tags; rejecting "v3.6.1" +// would make the operator's own working samples unappliable. func validateVersionFormat(version string, path *field.Path) field.ErrorList { if strings.TrimSpace(version) == "" { return field.ErrorList{field.Required(path, "version is required; set spec.version to a semver etcd image tag such as \"3.6.1\".")} } - if _, err := semver.NewVersion(version); err != nil { + if _, err := semver.NewVersion(normalizeEtcdVersion(version)); err != nil { return field.ErrorList{field.Invalid(path, version, fmt.Sprintf("version %q is not a valid semantic version (expected MAJOR.MINOR.PATCH, e.g. \"3.6.1\"): %v.", version, err))} @@ -446,13 +461,15 @@ func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorLis // authoritative; if a prior upgrade stalled before convergence, a skip-minor edit // could pass this declared-transition check yet still be caught by the controller. func validateUpgradePath(current, target string, path *field.Path) field.ErrorList { - if current == target { + // Compare on the normalized form so "v3.6.1" -> "3.6.1" (same release, just a + // prefix difference) is correctly treated as a no-op rather than a transition. + if normalizeEtcdVersion(current) == normalizeEtcdVersion(target) { return nil } - if _, err := semver.NewVersion(current); err != nil { + if _, err := semver.NewVersion(normalizeEtcdVersion(current)); err != nil { return nil // old object had a bad version; nothing actionable to add here. } - if _, err := semver.NewVersion(target); err != nil { + if _, err := semver.NewVersion(normalizeEtcdVersion(target)); err != nil { return nil // format error already reported by validateVersionFormat. } @@ -468,11 +485,13 @@ func validateUpgradePath(current, target string, path *field.Path) field.ErrorLi // but is local to the api package (controller internals are not importable here) // and returns only the actionable error. supportedVersions must be ascending. func checkUpgradePath(supportedVersions []semver.Version, current, target string) error { - currentVer, err := semver.NewVersion(current) + // go-semver rejects a leading "v"; normalize so a v-prefixed spec.version (the + // default registry's tag convention) participates in the ordering check. + currentVer, err := semver.NewVersion(normalizeEtcdVersion(current)) if err != nil { return fmt.Errorf("failed to parse current version %s: %w", current, err) } - targetVer, err := semver.NewVersion(target) + targetVer, err := semver.NewVersion(normalizeEtcdVersion(target)) if err != nil { return fmt.Errorf("failed to parse target version %s: %w", target, err) } diff --git a/api/v1alpha1/etcdcluster_webhook_test.go b/api/v1alpha1/etcdcluster_webhook_test.go index 1e26b6f1..7ad775cb 100644 --- a/api/v1alpha1/etcdcluster_webhook_test.go +++ b/api/v1alpha1/etcdcluster_webhook_test.go @@ -66,6 +66,19 @@ func TestValidateCreate_Messages(t *testing.T) { cluster: newCluster(3, "3.6.1"), wantOK: true, }, + { + // The default etcd-development registry only publishes v-prefixed tags + // (config/samples ships version: v3.5.21), so the webhook must accept a + // leading "v" or it would reject the operator's own working samples. + name: "v-prefixed semver version is accepted", + cluster: newCluster(3, "v3.6.1"), + wantOK: true, + }, + { + name: "bare semver version is accepted", + cluster: newCluster(3, "3.5.21"), + wantOK: true, + }, { name: "even size is rejected with quorum guidance", cluster: newCluster(4, "3.6.1"), @@ -221,6 +234,26 @@ func TestValidateUpdate_UpgradePathAndImmutability(t *testing.T) { updated: newCluster(3, "3.6.1"), wantOK: true, }, + { + name: "v-prefixed single minor upgrade is allowed", + old: newCluster(3, "v3.5.1"), + updated: newCluster(3, "v3.6.1"), + wantOK: true, + }, + { + // v-prefixed and bare forms of the same release must be treated as a + // no-op, not a transition. + name: "v-prefixed vs bare same release is a no-op", + old: newCluster(3, "v3.5.21"), + updated: newCluster(3, "3.5.21"), + wantOK: true, + }, + { + name: "v-prefixed skip-minor upgrade is still rejected", + old: newCluster(3, "v3.5.1"), + updated: newCluster(3, "v3.7.1"), + wantMessage: "spec.version: Invalid value: \"v3.7.1\": upgrading from version 3.5.1 to version 3.7.1 is not allowed (skips a minor version). etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"v3.5.1\" -> target \"v3.7.1\").", + }, { name: "skip-minor upgrade is rejected", old: newCluster(3, "3.5.1"), diff --git a/internal/controller/etcdcluster_controller.go b/internal/controller/etcdcluster_controller.go index 867886a3..edf7dbe3 100644 --- a/internal/controller/etcdcluster_controller.go +++ b/internal/controller/etcdcluster_controller.go @@ -182,8 +182,10 @@ func (r *EtcdClusterReconciler) fetchAndValidateState(ctx context.Context, req c currentVersion := stsImage[idx+1:] targetVersion := ec.Spec.Version - // Only handle cases when there is a version change. - if currentVersion != targetVersion { + // Only handle cases when there is a version change. Compare on the + // normalized (v-stripped) form so the v-prefixed live image tag and a bare + // spec.version describing the same release are not seen as a change. + if normalizeEtcdVersion(currentVersion) != normalizeEtcdVersion(targetVersion) { // TODO: consider adding an option in the CRD called allowCustomImageUpgrade to make // the behavior here optional: // https://github.com/etcd-io/etcd-operator/pull/278/changes#r2764717418 diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 3092c379..c37c9e3e 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -39,6 +39,38 @@ const ( volumeName = "etcd-data" ) +// normalizeEtcdVersion strips an optional leading "v" from an etcd version +// string. The CoreOS go-semver parser (used for upgrade-path checks here and in +// the admission webhook) rejects a leading "v", but the canonical etcd release +// images at the default registry (gcr.io/etcd-development/etcd) are published +// ONLY with v-prefixed tags (e.g. "v3.5.21"). Users therefore legitimately set +// spec.version with or without the "v"; normalizing to the bare semver here lets +// both forms compare and parse identically. +func normalizeEtcdVersion(version string) string { + return strings.TrimPrefix(strings.TrimSpace(version), "v") +} + +// etcdImageTag renders the image tag for spec.version against the configured +// registry. The default etcd-development registry only publishes v-prefixed +// tags, so a bare "3.5.21" would resolve to a NotFound "...:3.5.21" image +// (ImagePullBackOff). We therefore prepend the "v" the registry convention +// expects whenever the user supplied a bare semver. A version the user already +// wrote with a leading "v" is passed through unchanged, and a non-semver/custom +// tag (e.g. a pinned digest-less internal tag) is left exactly as written so +// non-default registries with their own tagging scheme are not disturbed. +func etcdImageTag(version string) string { + v := strings.TrimSpace(version) + if strings.HasPrefix(v, "v") { + return v + } + // Only re-apply the registry's "v" convention to values that are real + // semver; anything else is an explicit custom tag we must not rewrite. + if _, err := semver.NewVersion(v); err != nil { + return v + } + return "v" + v +} + type etcdClusterState string const ( @@ -146,7 +178,7 @@ func createOrPatchStatefulSet(ctx context.Context, logger logr.Logger, ec *ecv1a Name: "etcd", Command: []string{"/usr/local/bin/etcd"}, Args: createArgs(ec.Name, ec.Spec.EtcdOptions), - Image: fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, ec.Spec.Version), + Image: fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, etcdImageTag(ec.Spec.Version)), Env: []corev1.EnvVar{ { Name: "POD_NAME", @@ -811,11 +843,14 @@ func validateEtcdUpgradePath(etcdVersions []semver.Version, current, target stri currentIdx, targetIdx = -1, -1 ) - currentVer, err = semver.NewVersion(current) + // The live StatefulSet image tag is v-prefixed (the registry convention) while + // spec.version may be bare; normalize both so go-semver can parse them and so a + // "v3.5.21" image vs a "3.5.21" spec is not mistaken for a version change. + currentVer, err = semver.NewVersion(normalizeEtcdVersion(current)) if err != nil { return false, fmt.Errorf("failed to parse current version %s: %w", current, err) } - targetVer, err = semver.NewVersion(target) + targetVer, err = semver.NewVersion(normalizeEtcdVersion(target)) if err != nil { return false, fmt.Errorf("failed to parse target version %s: %w", target, err) } diff --git a/internal/controller/utils_test.go b/internal/controller/utils_test.go index 67592316..2a128b04 100644 --- a/internal/controller/utils_test.go +++ b/internal/controller/utils_test.go @@ -701,6 +701,62 @@ func TestCreateOrPatchStatefulSetWithPodLabels(t *testing.T) { } } +// TestEtcdImageTag guards the registry-convention normalization: the default +// etcd-development registry only publishes v-prefixed tags, so a bare semver +// spec.version must be rendered with a leading "v" (otherwise the image is +// NotFound -> ImagePullBackOff), while an already-v-prefixed value and an +// explicit non-semver custom tag must be passed through unchanged. +func TestEtcdImageTag(t *testing.T) { + tests := []struct { + name string + version string + want string + }{ + {name: "bare semver gets v prefix for the v-tagged registry", version: "3.6.1", want: "v3.6.1"}, + {name: "v-prefixed semver passes through unchanged", version: "v3.6.1", want: "v3.6.1"}, + {name: "default sample version (v-prefixed) is preserved", version: "v3.5.21", want: "v3.5.21"}, + {name: "surrounding whitespace is trimmed", version: " 3.5.21 ", want: "v3.5.21"}, + {name: "non-semver custom tag is left untouched", version: "latest", want: "latest"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, etcdImageTag(tt.version)) + }) + } +} + +// TestRenderedImageRefIsVPrefixed asserts end-to-end that a bare spec.version +// renders a v-prefixed image ref against the default registry, so the StatefulSet +// pulls a tag the registry actually publishes. +func TestRenderedImageRefIsVPrefixed(t *testing.T) { + ctx := t.Context() + logger := log.FromContext(ctx) + + scheme := runtime.NewScheme() + _ = corev1.AddToScheme(scheme) + _ = ecv1alpha1.AddToScheme(scheme) + _ = appsv1.AddToScheme(scheme) + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + ec := &ecv1alpha1.EtcdCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-etcd", Namespace: "default"}, + Spec: ecv1alpha1.EtcdClusterSpec{ + Size: 3, + Version: "3.5.21", // bare semver, as a user might write it + ImageRegistry: "gcr.io/etcd-development/etcd", + }, + } + + require.NoError(t, createOrPatchStatefulSet(ctx, logger, ec, fakeClient, 3, scheme)) + + sts := &appsv1.StatefulSet{} + require.NoError(t, fakeClient.Get(ctx, client.ObjectKey{Name: "test-etcd", Namespace: "default"}, sts)) + require.Len(t, sts.Spec.Template.Spec.Containers, 1) + assert.Equal(t, "gcr.io/etcd-development/etcd:v3.5.21", + sts.Spec.Template.Spec.Containers[0].Image, + "bare spec.version must render a v-prefixed tag against the v-only etcd-development registry") +} + func TestCreatingArgs(t *testing.T) { tests := []struct { testName string