diff --git a/api/v1alpha1/etcdcluster_webhook.go b/api/v1alpha1/etcdcluster_webhook.go new file mode 100644 index 00000000..86e830a0 --- /dev/null +++ b/api/v1alpha1/etcdcluster_webhook.go @@ -0,0 +1,545 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "context" + "fmt" + "sort" + "strings" + + "github.com/coreos/go-semver/semver" + corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/validation/field" + ctrl "sigs.k8s.io/controller-runtime" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + + etcdversions "go.etcd.io/etcd/api/v3/version" +) + +// etcdclusterlog is the logger used by the EtcdCluster webhooks. Every admission +// decision is logged with enough structured context (name, namespace, the +// offending field, and the user-facing remediation) that an operator reading the +// webhook-server logs can reconstruct exactly why a request was accepted, +// defaulted, or rejected. +var etcdclusterlog = logf.Log.WithName("etcdcluster-webhook") + +// Known TLS providers. Kept in one place so the validator's accepted-set and the +// defaulter agree, and so the error message can enumerate the valid choices. +const ( + tlsProviderAuto = "auto" + tlsProviderCertManager = "cert-manager" +) + +// knownTLSProviders is the canonical, lower-cased set of providers the operator +// understands, in a deterministic order for stable error messages. +var knownTLSProviders = []string{tlsProviderAuto, tlsProviderCertManager} + +// SetupWebhookWithManager registers the validating and defaulting webhooks for +// EtcdCluster with the manager's webhook server. This is the single entry point +// called from cmd/main.go; previously the webhook server was started but nothing +// was ever registered with it (see issue #380). +func (r *EtcdCluster) SetupWebhookWithManager(mgr ctrl.Manager) error { + etcdclusterlog.Info("registering EtcdCluster admission webhooks (validating + defaulting)") + return ctrl.NewWebhookManagedBy(mgr, r). + WithCustomValidator(&EtcdClusterCustomValidator{}). + WithCustomDefaulter(&EtcdClusterCustomDefaulter{}). + Complete() +} + +// Webhook policy rationale (applies to BOTH webhooks declared in this file). +// +// - failurePolicy=fail: these webhooks are correctness-critical. Defaulting +// (provider=auto) and validation (odd size, semver, supported upgrade path, +// storageSpec immutability) protect a stateful, quorum-based datastore where a +// bad spec admitted while the webhook is unreachable can produce a cluster that +// cannot form quorum or silent data loss on a botched PVC change. We therefore +// fail closed: if the API server cannot reach the webhook, the EtcdCluster +// create/update is rejected rather than admitted unchecked. +// - Failing closed does NOT brick the cluster. The rule scope is EXACTLY +// {operator.etcd.io/v1alpha1 etcdclusters}, so a webhook outage only blocks +// mutations to EtcdCluster CRs (a narrow, operator-owned resource); core +// workloads, nodes, and every other API object stay fully editable. As an +// explicit break-glass escape hatch, config/default injects an objectSelector +// so an admin can label a single CR with +// `etcd.operator.etcd.io/skip-webhooks: "true"` to bypass admission while the +// webhook is down (see config/default/webhook_selector_patch.yaml). +// - timeoutSeconds=10: validation is pure, in-memory, and dependency-free (no +// external calls), so 10s is generous headroom that still bounds API-server +// stalls if the webhook pod is wedged. With failurePolicy=fail a timeout is a +// rejection, which is the safe direction. +// - matchPolicy=Equivalent: match the request even if it arrives via a different +// but equivalent API group/version, so a future served/stored version cannot +// silently slip past the validator. Only v1alpha1 exists today, but Equivalent +// is the safe default and costs nothing now. + +// +kubebuilder:webhook:path=/mutate-operator-etcd-io-v1alpha1-etcdcluster,mutating=true,failurePolicy=fail,matchPolicy=Equivalent,timeoutSeconds=10,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=metcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 + +// EtcdClusterCustomDefaulter applies defaults to EtcdCluster objects on admission. +// +kubebuilder:object:generate=false +type EtcdClusterCustomDefaulter struct{} + +var _ admission.Defaulter[runtime.Object] = &EtcdClusterCustomDefaulter{} + +// Default implements webhook.CustomDefaulter. +func (d *EtcdClusterCustomDefaulter) Default(_ context.Context, obj runtime.Object) error { + ec, ok := obj.(*EtcdCluster) + if !ok { + return fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace) + applyEtcdClusterDefaults(ec, log) + return nil +} + +// applyEtcdClusterDefaults mutates ec in place to fill in sensible defaults. It is +// pure (aside from the supplied logger) so it can be unit tested directly. +func applyEtcdClusterDefaults(ec *EtcdCluster, log logr) { + // When a TLS block is present but no provider is named, the operator falls + // back to the "auto" provider (matches the comment on TLSCertificate.Provider). + // Materializing it here means downstream code and validation see an explicit, + // canonical value instead of "". + if ec.Spec.TLS != nil && strings.TrimSpace(ec.Spec.TLS.Provider) == "" { + ec.Spec.TLS.Provider = tlsProviderAuto + if log != nil { + log.Info("defaulting spec.tls.provider to \"auto\" (no provider specified)") + } + } +} + +// See the "Webhook policy rationale" comment above the mutating webhook marker; +// failurePolicy=fail, matchPolicy=Equivalent, and timeoutSeconds=10 are chosen on +// the same grounds for the validating webhook. + +// +kubebuilder:webhook:path=/validate-operator-etcd-io-v1alpha1-etcdcluster,mutating=false,failurePolicy=fail,matchPolicy=Equivalent,timeoutSeconds=10,sideEffects=None,groups=operator.etcd.io,resources=etcdclusters,verbs=create;update,versions=v1alpha1,name=vetcdcluster-v1alpha1.kb.io,admissionReviewVersions=v1 + +// EtcdClusterCustomValidator validates EtcdCluster objects on admission. +// +kubebuilder:object:generate=false +type EtcdClusterCustomValidator struct{} + +var _ admission.Validator[runtime.Object] = &EtcdClusterCustomValidator{} + +// etcdClusterGroupKind is used when building aggregated field errors into an +// apierrors.NewInvalid status so the API server returns a structured 422. +var etcdClusterGroupKind = schema.GroupKind{Group: GroupVersion.Group, Kind: "EtcdCluster"} + +// ValidateCreate implements webhook.CustomValidator. +func (v *EtcdClusterCustomValidator) ValidateCreate( + _ context.Context, obj runtime.Object, +) (admission.Warnings, error) { + ec, ok := obj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace, "operation", "create") + log.Info("validating EtcdCluster on create") + + errs := validateEtcdClusterSpec(&ec.Spec) + return finish(log, ec, errs) +} + +// ValidateUpdate implements webhook.CustomValidator. +func (v *EtcdClusterCustomValidator) ValidateUpdate( + _ context.Context, oldObj, newObj runtime.Object, +) (admission.Warnings, error) { + ec, ok := newObj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object for the new object but got %T", newObj) + } + oldEc, ok := oldObj.(*EtcdCluster) + if !ok { + return nil, fmt.Errorf("expected an EtcdCluster object for the old object but got %T", oldObj) + } + log := etcdclusterlog.WithValues("name", ec.Name, "namespace", ec.Namespace, "operation", "update") + log.Info("validating EtcdCluster on update") + + errs := validateEtcdClusterUpdate(&oldEc.Spec, &ec.Spec) + return finish(log, ec, errs) +} + +// ValidateDelete implements webhook.CustomValidator. Deletion is always allowed. +func (v *EtcdClusterCustomValidator) ValidateDelete( + _ context.Context, obj runtime.Object, +) (admission.Warnings, error) { + if _, ok := obj.(*EtcdCluster); !ok { + return nil, fmt.Errorf("expected an EtcdCluster object but got %T", obj) + } + return nil, nil +} + +// finish converts an accumulated field.ErrorList into either nil (accept) or a +// structured apierrors.NewInvalid (reject), logging the outcome either way. +func finish(log logr, ec *EtcdCluster, errs field.ErrorList) (admission.Warnings, error) { + if len(errs) == 0 { + log.Info("EtcdCluster admitted") + return nil, nil + } + // Log each rejection reason individually so the actionable detail is greppable. + for _, e := range errs { + log.Info("EtcdCluster rejected", "field", e.Field, "reason", e.Detail) + } + return nil, apierrors.NewInvalid(etcdClusterGroupKind, ec.Name, errs) +} + +// --------------------------------------------------------------------------- +// Pure validation helpers. These take only the spec(s) and return field errors +// with crisp, actionable Detail strings. They are unit-tested directly, including +// exact message text. +// --------------------------------------------------------------------------- + +// validateEtcdClusterSpec runs all create-time spec invariants. On update only the +// subset of these that changed is re-checked; see validateEtcdClusterUpdate. +func validateEtcdClusterSpec(spec *EtcdClusterSpec) field.ErrorList { + errs := make(field.ErrorList, 0, 3) + specPath := field.NewPath("spec") + + errs = append(errs, validateSize(spec.Size, specPath.Child("size"))...) + errs = append(errs, validateVersionFormat(spec.Version, specPath.Child("version"))...) + errs = append(errs, validateTLS(spec.TLS, specPath.Child("tls"))...) + errs = append(errs, validateStorageSpec(spec.StorageSpec, specPath.Child("storageSpec"))...) + + return errs +} + +// minVolumeSizeRequest mirrors the controller's reconcile-time floor +// (internal/controller/utils.go) so the webhook rejects an undersized PVC +// synchronously at apply time instead of letting the StatefulSet build fail later +// with no actionable feedback. +var minVolumeSizeRequest = resource.MustParse("1Mi") + +// validateStorageSpec enforces the same storage invariants the controller checks +// while building the StatefulSet, but at admission time so the user gets an +// immediate, actionable rejection rather than a silently failing reconcile: +// +// - volumeSizeRequest is required and must be >= 1Mi. +// - volumeSizeLimit, if set, must be >= volumeSizeRequest (a PVC whose limit is +// below its request is rejected by the API server when the StatefulSet is +// created). +// - accessModes must be one the operator handles (ReadWriteOnce, ReadWriteMany, +// or empty -> ReadWriteOnce); ReadOnlyMany and others are unsupported. +// - pvcName is required when accessModes is ReadWriteMany (the operator mounts a +// pre-provisioned shared PVC by name in that mode). +func validateStorageSpec(s *StorageSpec, path *field.Path) field.ErrorList { + if s == nil { + return nil + } + var errs field.ErrorList + + switch { + case s.VolumeSizeRequest.IsZero(): + errs = append(errs, field.Required(path.Child("volumeSizeRequest"), + "volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".")) + case s.VolumeSizeRequest.Cmp(minVolumeSizeRequest) < 0: + errs = append(errs, field.Invalid(path.Child("volumeSizeRequest"), s.VolumeSizeRequest.String(), + fmt.Sprintf("volumeSizeRequest must be at least 1Mi; got %q. Request a larger persistent volume (e.g. \"1Gi\").", + s.VolumeSizeRequest.String()))) + } + + // Only meaningful to compare the limit when both are usable values; if the + // request itself is invalid the message above is the actionable one. + if !s.VolumeSizeLimit.IsZero() && !s.VolumeSizeRequest.IsZero() && + s.VolumeSizeLimit.Cmp(s.VolumeSizeRequest) < 0 { + errs = append(errs, field.Invalid(path.Child("volumeSizeLimit"), s.VolumeSizeLimit.String(), + fmt.Sprintf("volumeSizeLimit (%q) must be greater than or equal to volumeSizeRequest (%q); "+ + "raise volumeSizeLimit or lower volumeSizeRequest.", + s.VolumeSizeLimit.String(), s.VolumeSizeRequest.String()))) + } + + switch s.AccessModes { + case "", corev1.ReadWriteOnce: + // default / single-writer: nothing else required. + case corev1.ReadWriteMany: + if strings.TrimSpace(s.PVCName) == "" { + errs = append(errs, field.Required(path.Child("pvcName"), + "pvcName is required when accessModes is \"ReadWriteMany\"; set it to the name of the "+ + "pre-provisioned shared PersistentVolumeClaim the etcd pods should mount.")) + } + default: + errs = append(errs, field.NotSupported(path.Child("accessModes"), + s.AccessModes, []string{string(corev1.ReadWriteOnce), string(corev1.ReadWriteMany)})) + } + + return errs +} + +// validateSize enforces size >= 1 and an odd member count. etcd forms a quorum of +// (n/2)+1; an even cluster has the same fault tolerance as the next-lower odd size +// while being strictly more likely to lose quorum, so even sizes are rejected. +func validateSize(size int, path *field.Path) field.ErrorList { + if size < 1 { + return field.ErrorList{field.Invalid(path, size, + fmt.Sprintf("size must be at least 1; got %d. Set spec.size to a positive odd number (e.g. 1, 3, or 5).", size))} + } + if size%2 == 0 { + return field.ErrorList{field.Invalid(path, size, + fmt.Sprintf("size must be an odd number so the cluster can form a majority quorum; got %d. "+ + "An even-sized etcd cluster tolerates no more failures than the next-smaller odd size while being more "+ + "likely to lose quorum. Use %d or %d instead.", size, size-1, size+1))} + } + return nil +} + +// normalizeEtcdVersion strips an optional leading "v" from an etcd version string +// so it can be parsed by the CoreOS go-semver library, which rejects the "v" +// prefix. The canonical etcd release images at the default registry +// (gcr.io/etcd-development/etcd) are published ONLY with v-prefixed tags (e.g. +// "v3.5.21" — see config/samples), so users legitimately write spec.version with +// or without the "v". Both forms must validate identically here, and the +// controller (internal/controller/utils.go: etcdImageTag) re-applies the "v" the +// registry expects when rendering the image, so a bare spec.version still pulls. +func normalizeEtcdVersion(version string) string { + return strings.TrimPrefix(strings.TrimSpace(version), "v") +} + +// validateVersionFormat ensures spec.version is non-empty and parses as semver. +// A leading "v" is accepted (and stripped before parsing) because the default +// etcd-development registry only publishes v-prefixed tags; rejecting "v3.6.1" +// would make the operator's own working samples unappliable. +func validateVersionFormat(version string, path *field.Path) field.ErrorList { + if strings.TrimSpace(version) == "" { + return field.ErrorList{field.Required(path, + "version is required; set spec.version to a semver etcd image tag such as \"3.6.1\".")} + } + if _, err := semver.NewVersion(normalizeEtcdVersion(version)); err != nil { + return field.ErrorList{field.Invalid(path, version, + fmt.Sprintf("version %q is not a valid semantic version (expected MAJOR.MINOR.PATCH, e.g. \"3.6.1\"): %v.", + version, err))} + } + return nil +} + +// validateTLS checks that the TLS surface is internally coherent: the provider is +// one the operator understands, and the provider-specific config block required by +// that provider is present and complete. +func validateTLS(tls *TLSCertificate, path *field.Path) field.ErrorList { + if tls == nil { + return nil + } + var errs field.ErrorList + + // An empty provider is defaulted to "auto" by the defaulting webhook; treat it + // as "auto" here too so validation is correct even if the defaulter is bypassed + // (e.g. in envtest where only the validating webhook is registered). + provider := strings.TrimSpace(tls.Provider) + if provider == "" { + provider = tlsProviderAuto + } + + cfgPath := path.Child("providerCfg") + switch provider { + case tlsProviderAuto: + // auto provider self-generates certs; cert-manager config must not be set. + if tls.ProviderCfg.CertManagerCfg != nil { + errs = append(errs, field.Invalid(cfgPath.Child("certManagerCfg"), + "", + "providerCfg.certManagerCfg must not be set when provider is \"auto\"; "+ + "either remove providerCfg.certManagerCfg or set provider to \"cert-manager\".")) + } + if ac := tls.ProviderCfg.AutoCfg; ac != nil { + errs = append(errs, validateCommonName(ac.CommonName, + cfgPath.Child("autoCfg").Child("commonName"))...) + } + case tlsProviderCertManager: + cm := tls.ProviderCfg.CertManagerCfg + if cm == nil { + errs = append(errs, field.Required(cfgPath.Child("certManagerCfg"), + "providerCfg.certManagerCfg is required when provider is \"cert-manager\"; "+ + "supply issuerKind and issuerName.")) + break + } + if strings.TrimSpace(cm.IssuerName) == "" { + errs = append(errs, field.Required(cfgPath.Child("certManagerCfg").Child("issuerName"), + "issuerName is required for the cert-manager provider; set it to the name of an Issuer or ClusterIssuer.")) + } + if k := strings.TrimSpace(cm.IssuerKind); k != "" && k != "Issuer" && k != "ClusterIssuer" { + errs = append(errs, field.NotSupported( + cfgPath.Child("certManagerCfg").Child("issuerKind"), + cm.IssuerKind, []string{"Issuer", "ClusterIssuer"})) + } + errs = append(errs, validateCommonName(cm.CommonName, + cfgPath.Child("certManagerCfg").Child("commonName"))...) + default: + errs = append(errs, field.NotSupported(path.Child("provider"), tls.Provider, knownTLSProviders)) + } + + return errs +} + +// maxCommonNameLen is the X.509 CommonName ceiling documented on CommonConfig: a +// CN longer than 64 characters produces an invalid CSR, so the cert never issues. +// Catching it at admission turns a silent never-ready certificate into an +// immediate, actionable rejection. +const maxCommonNameLen = 64 + +// validateCommonName enforces the documented 64-character CommonName limit. An +// empty CommonName is valid (the provider derives a default). +func validateCommonName(cn string, path *field.Path) field.ErrorList { + if len(cn) <= maxCommonNameLen { + return nil + } + return field.ErrorList{field.Invalid(path, cn, + fmt.Sprintf("commonName must be %d characters or fewer to produce a valid X.509 CSR; got %d. "+ + "Shorten commonName (the certificate provider derives a default when it is empty).", + maxCommonNameLen, len(cn)))} +} + +// validateEtcdClusterUpdate runs all update-time invariants between the old and +// new spec. Unlike ValidateCreate it does NOT re-run the full create-time spec +// validation: the CRD historically only enforced "size >= 1", so pre-existing +// clusters with an even size or an unknown-but-well-formed version are legal on +// the API server. With failurePolicy: Fail and no objectSelector, re-validating +// those invariants on every update would wedge unrelated edits (metadata, +// annotations, TLS) to legacy clusters. We therefore only enforce the size and +// version-format invariants when the offending field actually changed — keeping +// the guardrail for new values while letting legacy clusters be remediated. TLS +// coherence is cheap and self-contained, so it is always re-validated. +func validateEtcdClusterUpdate(oldSpec, newSpec *EtcdClusterSpec) field.ErrorList { + var errs field.ErrorList + specPath := field.NewPath("spec") + + if newSpec.Size != oldSpec.Size { + errs = append(errs, validateSize(newSpec.Size, specPath.Child("size"))...) + } + if newSpec.Version != oldSpec.Version { + errs = append(errs, validateVersionFormat(newSpec.Version, specPath.Child("version"))...) + } + errs = append(errs, validateTLS(newSpec.TLS, specPath.Child("tls"))...) + + // storageSpec is immutable once set: changing the PVC template after the + // StatefulSet exists cannot be reconciled in place and risks data loss. + switch { + case oldSpec.StorageSpec != nil && newSpec.StorageSpec == nil: + errs = append(errs, field.Invalid(specPath.Child("storageSpec"), nil, + "storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster.")) + case oldSpec.StorageSpec != nil && newSpec.StorageSpec != nil && + !apiequality.Semantic.DeepEqual(oldSpec.StorageSpec, newSpec.StorageSpec): + errs = append(errs, field.Invalid(specPath.Child("storageSpec"), newSpec.StorageSpec, + "storageSpec is immutable and cannot be changed once set; revert spec.storageSpec to its original value or recreate the cluster.")) + case oldSpec.StorageSpec == nil && newSpec.StorageSpec != nil: + // Adding storage to a cluster that previously had none is permitted (it is + // not yet "set"), but the new block must itself be valid — otherwise the + // content invariants would only be enforced at create time and a malformed + // storageSpec could be introduced via update. + errs = append(errs, validateStorageSpec(newSpec.StorageSpec, specPath.Child("storageSpec"))...) + } + + // Upgrade-path check. Only meaningful when both versions parse and actually + // differ; format errors are already surfaced by validateVersionFormat. + errs = append(errs, validateUpgradePath(oldSpec.Version, newSpec.Version, specPath.Child("version"))...) + + return errs +} + +// validateUpgradePath rejects unsupported version transitions (skip-minor +// upgrades and downgrades), reusing the same ordered version table the controller +// uses at reconcile time. A no-op (equal versions) or an unparseable version is +// not reported here (the latter is handled by validateVersionFormat). +// +// Note: this validates the *declared* spec transition (old spec.version -> +// new spec.version), not the actual running version. The controller compares the +// live StatefulSet image against the spec at reconcile time and remains +// authoritative; if a prior upgrade stalled before convergence, a skip-minor edit +// could pass this declared-transition check yet still be caught by the controller. +func validateUpgradePath(current, target string, path *field.Path) field.ErrorList { + // Compare on the normalized form so "v3.6.1" -> "3.6.1" (same release, just a + // prefix difference) is correctly treated as a no-op rather than a transition. + if normalizeEtcdVersion(current) == normalizeEtcdVersion(target) { + return nil + } + if _, err := semver.NewVersion(normalizeEtcdVersion(current)); err != nil { + return nil // old object had a bad version; nothing actionable to add here. + } + if _, err := semver.NewVersion(normalizeEtcdVersion(target)); err != nil { + return nil // format error already reported by validateVersionFormat. + } + + if err := checkUpgradePath(etcdversions.AllVersions, current, target); err != nil { + return field.ErrorList{field.Invalid(path, target, + fmt.Sprintf("%v. etcd only supports sequential single-minor upgrades and forbids downgrades; "+ + "upgrade one minor version at a time (current %q -> target %q).", err, current, target))} + } + return nil +} + +// checkUpgradePath mirrors the controller's validateEtcdUpgradePath ordering logic +// but is local to the api package (controller internals are not importable here) +// and returns only the actionable error. supportedVersions must be ascending. +func checkUpgradePath(supportedVersions []semver.Version, current, target string) error { + // go-semver rejects a leading "v"; normalize so a v-prefixed spec.version (the + // default registry's tag convention) participates in the ordering check. + currentVer, err := semver.NewVersion(normalizeEtcdVersion(current)) + if err != nil { + return fmt.Errorf("failed to parse current version %s: %w", current, err) + } + targetVer, err := semver.NewVersion(normalizeEtcdVersion(target)) + if err != nil { + return fmt.Errorf("failed to parse target version %s: %w", target, err) + } + + currentIdx, targetIdx := -1, -1 + for idx, v := range supportedVersions { + if v.Major == currentVer.Major && v.Minor == currentVer.Minor { + currentIdx = idx + } + if v.Major == targetVer.Major && v.Minor == targetVer.Minor { + targetIdx = idx + } + } + + switch { + case currentIdx == -1: + return fmt.Errorf("unknown current version %s (supported minor lines: %s)", + currentVer, supportedMinorLines(supportedVersions)) + case targetIdx == -1: + return fmt.Errorf("unknown target version %s (supported minor lines: %s)", + targetVer, supportedMinorLines(supportedVersions)) + case currentIdx > targetIdx || (currentIdx == targetIdx && currentVer.Patch > targetVer.Patch): + return fmt.Errorf("downgrading from version %s to version %s is not allowed", currentVer, targetVer) + case targetIdx > currentIdx+1: + return fmt.Errorf("upgrading from version %s to version %s is not allowed (skips a minor version)", + currentVer, targetVer) + } + return nil +} + +// supportedMinorLines renders the supported MAJOR.MINOR lines for error messages. +func supportedMinorLines(versions []semver.Version) string { + seen := make(map[string]struct{}, len(versions)) + var lines []string + for _, v := range versions { + key := fmt.Sprintf("%d.%d", v.Major, v.Minor) + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + lines = append(lines, key) + } + sort.Strings(lines) + return strings.Join(lines, ", ") +} + +// logr is the minimal logging surface applyEtcdClusterDefaults needs; it lets the +// unit tests pass nil while production code passes a real logr.Logger. +type logr interface { + Info(msg string, keysAndValues ...any) +} diff --git a/api/v1alpha1/etcdcluster_webhook_test.go b/api/v1alpha1/etcdcluster_webhook_test.go new file mode 100644 index 00000000..7ad775cb --- /dev/null +++ b/api/v1alpha1/etcdcluster_webhook_test.go @@ -0,0 +1,624 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + "context" + "errors" + "strings" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" +) + +// causeMessages flattens the aggregated field errors carried by an +// apierrors.NewInvalid status into ": " strings so tests can +// assert on the exact user-facing rejection text. +func causeMessages(t *testing.T, err error) []string { + t.Helper() + if err == nil { + return nil + } + var status apierrors.APIStatus + if !errors.As(err, &status) { + t.Fatalf("expected an apierrors status error, got %T: %v", err, err) + } + var msgs []string + if status.Status().Details != nil { + for _, c := range status.Status().Details.Causes { + msgs = append(msgs, c.Field+": "+c.Message) + } + } + return msgs +} + +func newCluster(size int, version string) *EtcdCluster { + return &EtcdCluster{Spec: EtcdClusterSpec{Size: size, Version: version}} +} + +func TestValidateCreate_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + cluster *EtcdCluster + wantOK bool + wantField string + wantMessage string // exact, fully-formed message text + }{ + { + name: "valid odd size with semver version", + cluster: newCluster(3, "3.6.1"), + wantOK: true, + }, + { + // The default etcd-development registry only publishes v-prefixed tags + // (config/samples ships version: v3.5.21), so the webhook must accept a + // leading "v" or it would reject the operator's own working samples. + name: "v-prefixed semver version is accepted", + cluster: newCluster(3, "v3.6.1"), + wantOK: true, + }, + { + name: "bare semver version is accepted", + cluster: newCluster(3, "3.5.21"), + wantOK: true, + }, + { + name: "even size is rejected with quorum guidance", + cluster: newCluster(4, "3.6.1"), + wantField: "spec.size", + wantMessage: "Invalid value: 4: size must be an odd number so the cluster can form a majority quorum; got 4. An even-sized etcd cluster tolerates no more failures than the next-smaller odd size while being more likely to lose quorum. Use 3 or 5 instead.", + }, + { + name: "zero size is rejected", + cluster: newCluster(0, "3.6.1"), + wantField: "spec.size", + wantMessage: "Invalid value: 0: size must be at least 1; got 0. Set spec.size to a positive odd number (e.g. 1, 3, or 5).", + }, + { + name: "empty version is rejected as required", + cluster: newCluster(1, ""), + wantField: "spec.version", + wantMessage: "Required value: version is required; set spec.version to a semver etcd image tag such as \"3.6.1\".", + }, + { + name: "non-semver version is rejected", + cluster: newCluster(1, "latest"), + wantField: "spec.version", + wantMessage: "Invalid value: \"latest\": version \"latest\" is not a valid semantic version (expected MAJOR.MINOR.PATCH, e.g. \"3.6.1\"): latest is not in dotted-tri format.", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateCreate(context.Background(), tc.cluster) + if tc.wantOK { + if err != nil { + t.Fatalf("expected admission to succeed, got error: %v", err) + } + return + } + msgs := causeMessages(t, err) + joined := strings.Join(msgs, " | ") + found := false + for _, m := range msgs { + if m == tc.wantField+": "+tc.wantMessage { + found = true + break + } + } + if !found { + t.Fatalf("did not find expected cause.\n want: %s: %s\n got: %s", + tc.wantField, tc.wantMessage, joined) + } + }) + } +} + +func TestValidateTLS_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + tls *TLSCertificate + wantOK bool + wantMessage string + }{ + { + name: "auto provider with no config is valid", + tls: &TLSCertificate{Provider: "auto"}, + wantOK: true, + }, + { + name: "empty provider is treated as auto and is valid", + tls: &TLSCertificate{}, + wantOK: true, + }, + { + name: "unknown provider lists the supported choices", + tls: &TLSCertificate{Provider: "vault"}, + wantMessage: "spec.tls.provider: Unsupported value: \"vault\": supported values: \"auto\", \"cert-manager\"", + }, + { + name: "cert-manager provider without config block", + tls: &TLSCertificate{Provider: "cert-manager"}, + wantMessage: "spec.tls.providerCfg.certManagerCfg: Required value: providerCfg.certManagerCfg is required when provider is \"cert-manager\"; supply issuerKind and issuerName.", + }, + { + name: "cert-manager provider missing issuerName", + tls: &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerKind: "Issuer"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg.issuerName: Required value: issuerName is required for the cert-manager provider; set it to the name of an Issuer or ClusterIssuer.", + }, + { + name: "cert-manager provider with bad issuerKind", + tls: &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerKind: "Bogus", IssuerName: "ca"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg.issuerKind: Unsupported value: \"Bogus\": supported values: \"Issuer\", \"ClusterIssuer\"", + }, + { + name: "auto provider must not carry cert-manager config", + tls: &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{IssuerName: "ca"}}, + }, + wantMessage: "spec.tls.providerCfg.certManagerCfg: Invalid value: \"\": providerCfg.certManagerCfg must not be set when provider is \"auto\"; either remove providerCfg.certManagerCfg or set provider to \"cert-manager\".", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c := newCluster(1, "3.6.1") + c.Spec.TLS = tc.tls + _, err := v.ValidateCreate(context.Background(), c) + if tc.wantOK { + if err != nil { + t.Fatalf("expected admission to succeed, got: %v", err) + } + return + } + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if m == tc.wantMessage { + found = true + } + } + if !found { + t.Fatalf("did not find expected TLS cause.\n want: %s\n got: %s", + tc.wantMessage, strings.Join(msgs, " | ")) + } + }) + } +} + +func TestValidateUpdate_UpgradePathAndImmutability(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + tests := []struct { + name string + old *EtcdCluster + updated *EtcdCluster + wantOK bool + wantMessage string + }{ + { + name: "same version no-op is allowed", + old: newCluster(3, "3.6.1"), + updated: newCluster(3, "3.6.2"), + wantOK: true, + }, + { + name: "single minor upgrade is allowed", + old: newCluster(3, "3.5.1"), + updated: newCluster(3, "3.6.1"), + wantOK: true, + }, + { + name: "v-prefixed single minor upgrade is allowed", + old: newCluster(3, "v3.5.1"), + updated: newCluster(3, "v3.6.1"), + wantOK: true, + }, + { + // v-prefixed and bare forms of the same release must be treated as a + // no-op, not a transition. + name: "v-prefixed vs bare same release is a no-op", + old: newCluster(3, "v3.5.21"), + updated: newCluster(3, "3.5.21"), + wantOK: true, + }, + { + name: "v-prefixed skip-minor upgrade is still rejected", + old: newCluster(3, "v3.5.1"), + updated: newCluster(3, "v3.7.1"), + wantMessage: "spec.version: Invalid value: \"v3.7.1\": upgrading from version 3.5.1 to version 3.7.1 is not allowed (skips a minor version). etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"v3.5.1\" -> target \"v3.7.1\").", + }, + { + name: "skip-minor upgrade is rejected", + old: newCluster(3, "3.5.1"), + updated: newCluster(3, "3.7.1"), + wantMessage: "spec.version: Invalid value: \"3.7.1\": upgrading from version 3.5.1 to version 3.7.1 is not allowed (skips a minor version). etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"3.5.1\" -> target \"3.7.1\").", + }, + { + name: "downgrade is rejected", + old: newCluster(3, "3.6.1"), + updated: newCluster(3, "3.5.1"), + wantMessage: "spec.version: Invalid value: \"3.5.1\": downgrading from version 3.6.1 to version 3.5.1 is not allowed. etcd only supports sequential single-minor upgrades and forbids downgrades; upgrade one minor version at a time (current \"3.6.1\" -> target \"3.5.1\").", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateUpdate(context.Background(), tc.old, tc.updated) + if tc.wantOK { + if err != nil { + t.Fatalf("expected update to succeed, got: %v", err) + } + return + } + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if m == tc.wantMessage { + found = true + } + } + if !found { + t.Fatalf("did not find expected upgrade cause.\n want: %s\n got: %s", + tc.wantMessage, strings.Join(msgs, " | ")) + } + }) + } +} + +// TestValidateUpdate_LegacyClustersRemediable guards C3: pre-existing clusters +// whose values predate the create-time invariants (the CRD historically only +// enforced size >= 1) must not be frozen on update. Unchanged-but-non-conforming +// values are tolerated; a metadata-only edit must be allowed; but the invariant is +// still enforced the moment the offending field is actually changed. +func TestValidateUpdate_LegacyClustersRemediable(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + t.Run("even-sized legacy cluster: unrelated edit is allowed", func(t *testing.T) { + // size 4 would fail ValidateCreate, but it already exists; an unrelated + // (TLS) edit must not be rejected for the pre-existing even size. + old := newCluster(4, "3.6.1") + updated := newCluster(4, "3.6.1") + updated.Spec.TLS = &TLSCertificate{Provider: "auto"} + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected unrelated edit on legacy even-sized cluster to be allowed, got: %v", err) + } + }) + + t.Run("changing size to another even value is still rejected", func(t *testing.T) { + old := newCluster(4, "3.6.1") + updated := newCluster(6, "3.6.1") + _, err := v.ValidateUpdate(ctx, old, updated) + msgs := causeMessages(t, err) + found := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.size: Invalid value:") && + strings.Contains(m, "size must be an odd number") { + found = true + } + } + if !found { + t.Fatalf("expected even-size rejection when size is changed, got: %s", strings.Join(msgs, " | ")) + } + }) + + t.Run("remediating size to a valid odd value is allowed", func(t *testing.T) { + old := newCluster(4, "3.6.1") + updated := newCluster(3, "3.6.1") + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected remediation 4 -> 3 to be allowed, got: %v", err) + } + }) +} + +func TestValidateUpdate_StorageSpecImmutable(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + // withStorage populates a non-zero VolumeSizeRequest on purpose: a zero-valued + // Quantity has an empty string cache and nil *inf.Dec, so struct == happens to + // agree with the numeric value and would mask the comparison bug C1 fixes. + withStorage := func() *EtcdCluster { + c := newCluster(3, "3.6.1") + c.Spec.StorageSpec = &StorageSpec{ + StorageClassName: "standard", + VolumeSizeRequest: resource.MustParse("1Gi"), + } + return c + } + + t.Run("removing storageSpec is rejected", func(t *testing.T) { + _, err := v.ValidateUpdate(context.Background(), withStorage(), newCluster(3, "3.6.1")) + want := "spec.storageSpec: Invalid value: null: storageSpec is immutable and cannot be removed once set; restore the original storageSpec or recreate the cluster." + assertCause(t, err, want) + }) + + t.Run("changing storageSpec is rejected", func(t *testing.T) { + updated := withStorage() + updated.Spec.StorageSpec.StorageClassName = "fast" + _, err := v.ValidateUpdate(context.Background(), withStorage(), updated) + msgs := causeMessages(t, err) + ok := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.storageSpec: Invalid value:") && + strings.Contains(m, "storageSpec is immutable and cannot be changed once set") { + ok = true + } + } + if !ok { + t.Fatalf("expected storageSpec immutability error, got: %s", strings.Join(msgs, " | ")) + } + }) + + t.Run("unchanged storageSpec is allowed", func(t *testing.T) { + if _, err := v.ValidateUpdate(context.Background(), withStorage(), withStorage()); err != nil { + t.Fatalf("expected unchanged storageSpec to be allowed, got: %v", err) + } + }) + + // Regression guard for C1: a re-spelled but numerically identical quantity + // (1Gi vs 1024Mi) must NOT trip immutability. Struct == compares the cached + // string and *inf.Dec pointer, so it sees these as different; Semantic.DeepEqual + // compares the value and sees them as equal. + t.Run("numerically-equal re-spelled quantity is allowed", func(t *testing.T) { + old := withStorage() + old.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1Gi") + updated := withStorage() + updated.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1024Mi") + if _, err := v.ValidateUpdate(context.Background(), old, updated); err != nil { + t.Fatalf("expected 1Gi vs 1024Mi to be allowed (equal value), got: %v", err) + } + }) + + // A real size change (1Gi -> 2Gi) must still be rejected: immutability holds. + t.Run("changing the quantity value is rejected", func(t *testing.T) { + old := withStorage() + old.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("1Gi") + updated := withStorage() + updated.Spec.StorageSpec.VolumeSizeRequest = resource.MustParse("2Gi") + _, err := v.ValidateUpdate(context.Background(), old, updated) + msgs := causeMessages(t, err) + ok := false + for _, m := range msgs { + if strings.HasPrefix(m, "spec.storageSpec: Invalid value:") && + strings.Contains(m, "storageSpec is immutable and cannot be changed once set") { + ok = true + } + } + if !ok { + t.Fatalf("expected storageSpec immutability error for 1Gi->2Gi, got: %s", strings.Join(msgs, " | ")) + } + }) +} + +func assertCause(t *testing.T, err error, want string) { + t.Helper() + msgs := causeMessages(t, err) + for _, m := range msgs { + if m == want { + return + } + } + t.Fatalf("did not find expected cause.\n want: %s\n got: %s", want, strings.Join(msgs, " | ")) +} + +func TestDefault_TLSProvider(t *testing.T) { + d := &EtcdClusterCustomDefaulter{} + + t.Run("empty provider defaults to auto", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{} + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS.Provider != "auto" { + t.Fatalf("expected provider to default to \"auto\", got %q", c.Spec.TLS.Provider) + } + }) + + t.Run("explicit provider is preserved", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{Provider: "cert-manager"} + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS.Provider != "cert-manager" { + t.Fatalf("expected provider preserved, got %q", c.Spec.TLS.Provider) + } + }) + + t.Run("nil TLS is left untouched", func(t *testing.T) { + c := newCluster(3, "3.6.1") + if err := d.Default(context.Background(), c); err != nil { + t.Fatalf("Default returned error: %v", err) + } + if c.Spec.TLS != nil { + t.Fatalf("expected TLS to stay nil") + } + }) +} + +func TestValidateStorageSpec_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + + // withStorage returns a valid 3-node cluster we can attach a storageSpec to. + withStorage := func(s *StorageSpec) *EtcdCluster { + c := newCluster(3, "3.6.1") + c.Spec.StorageSpec = s + return c + } + + tests := []struct { + name string + storage *StorageSpec + wantOK bool + wantMessage string // exact ": " + }{ + { + name: "valid RWO storage is accepted", + storage: &StorageSpec{VolumeSizeRequest: resource.MustParse("1Gi")}, + wantOK: true, + }, + { + name: "missing volumeSizeRequest is required", + storage: &StorageSpec{StorageClassName: "standard"}, + wantMessage: "spec.storageSpec.volumeSizeRequest: Required value: volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".", + }, + { + name: "volumeSizeRequest below 1Mi is rejected", + storage: &StorageSpec{VolumeSizeRequest: resource.MustParse("512Ki")}, + wantMessage: "spec.storageSpec.volumeSizeRequest: Invalid value: \"512Ki\": volumeSizeRequest must be at least 1Mi; got \"512Ki\". Request a larger persistent volume (e.g. \"1Gi\").", + }, + { + name: "volumeSizeLimit below request is rejected", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("2Gi"), + VolumeSizeLimit: resource.MustParse("1Gi"), + }, + wantMessage: "spec.storageSpec.volumeSizeLimit: Invalid value: \"1Gi\": volumeSizeLimit (\"1Gi\") must be greater than or equal to volumeSizeRequest (\"2Gi\"); raise volumeSizeLimit or lower volumeSizeRequest.", + }, + { + name: "volumeSizeLimit equal to request is accepted", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + VolumeSizeLimit: resource.MustParse("1Gi"), + }, + wantOK: true, + }, + { + name: "unsupported accessModes lists the supported choices", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadOnlyMany", + }, + wantMessage: "spec.storageSpec.accessModes: Unsupported value: \"ReadOnlyMany\": supported values: \"ReadWriteOnce\", \"ReadWriteMany\"", + }, + { + name: "ReadWriteMany without pvcName is required", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadWriteMany", + }, + wantMessage: "spec.storageSpec.pvcName: Required value: pvcName is required when accessModes is \"ReadWriteMany\"; set it to the name of the pre-provisioned shared PersistentVolumeClaim the etcd pods should mount.", + }, + { + name: "ReadWriteMany with pvcName is accepted", + storage: &StorageSpec{ + VolumeSizeRequest: resource.MustParse("1Gi"), + AccessModes: "ReadWriteMany", + PVCName: "shared-etcd-pvc", + }, + wantOK: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + _, err := v.ValidateCreate(context.Background(), withStorage(tc.storage)) + if tc.wantOK { + if err != nil { + t.Fatalf("expected storageSpec to be accepted, got: %v", err) + } + return + } + assertCause(t, err, tc.wantMessage) + }) + } +} + +// TestValidateStorageSpec_OnUpdate guards the update-path handling: a storageSpec +// added to a cluster that previously had none must still be validated, while the +// immutability rule continues to protect a storageSpec that was already set. +func TestValidateStorageSpec_OnUpdate(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + t.Run("adding an invalid storageSpec on update is rejected", func(t *testing.T) { + old := newCluster(3, "3.6.1") // no storage + updated := newCluster(3, "3.6.1") + updated.Spec.StorageSpec = &StorageSpec{StorageClassName: "standard"} // missing size + _, err := v.ValidateUpdate(ctx, old, updated) + assertCause(t, err, + "spec.storageSpec.volumeSizeRequest: Required value: volumeSizeRequest is required when storageSpec is set; specify a persistent volume size such as \"1Gi\".") + }) + + t.Run("adding a valid storageSpec on update is accepted", func(t *testing.T) { + old := newCluster(3, "3.6.1") + updated := newCluster(3, "3.6.1") + updated.Spec.StorageSpec = &StorageSpec{VolumeSizeRequest: resource.MustParse("1Gi")} + if _, err := v.ValidateUpdate(ctx, old, updated); err != nil { + t.Fatalf("expected adding a valid storageSpec to be allowed, got: %v", err) + } + }) +} + +func TestValidateCommonName_Messages(t *testing.T) { + v := &EtcdClusterCustomValidator{} + ctx := context.Background() + + // 65 characters: one over the documented 64-char X.509 CN ceiling. + tooLong := strings.Repeat("a", 65) + + t.Run("auto provider commonName over 64 chars is rejected", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{AutoCfg: &ProviderAutoConfig{CommonConfig: CommonConfig{CommonName: tooLong}}}, + } + _, err := v.ValidateCreate(ctx, c) + assertCause(t, err, + "spec.tls.providerCfg.autoCfg.commonName: Invalid value: \""+tooLong+"\": commonName must be 64 characters or fewer to produce a valid X.509 CSR; got 65. Shorten commonName (the certificate provider derives a default when it is empty).") + }) + + t.Run("cert-manager provider commonName over 64 chars is rejected", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "cert-manager", + ProviderCfg: ProviderConfig{CertManagerCfg: &ProviderCertManagerConfig{ + IssuerName: "ca", + IssuerKind: "Issuer", + CommonConfig: CommonConfig{CommonName: tooLong}, + }}, + } + _, err := v.ValidateCreate(ctx, c) + assertCause(t, err, + "spec.tls.providerCfg.certManagerCfg.commonName: Invalid value: \""+tooLong+"\": commonName must be 64 characters or fewer to produce a valid X.509 CSR; got 65. Shorten commonName (the certificate provider derives a default when it is empty).") + }) + + t.Run("commonName exactly 64 chars is accepted", func(t *testing.T) { + c := newCluster(3, "3.6.1") + c.Spec.TLS = &TLSCertificate{ + Provider: "auto", + ProviderCfg: ProviderConfig{AutoCfg: &ProviderAutoConfig{CommonConfig: CommonConfig{CommonName: strings.Repeat("a", 64)}}}, + } + if _, err := v.ValidateCreate(ctx, c); err != nil { + t.Fatalf("expected a 64-char commonName to be accepted, got: %v", err) + } + }) +} diff --git a/api/v1alpha1/webhook_envtest_test.go b/api/v1alpha1/webhook_envtest_test.go new file mode 100644 index 00000000..5d9502d4 --- /dev/null +++ b/api/v1alpha1/webhook_envtest_test.go @@ -0,0 +1,286 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1_test + +import ( + "context" + "crypto/tls" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/webhook" + + operatorv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1" +) + +// This file exercises the validating + defaulting webhooks against a real +// (envtest) API server: the generated webhook configurations are installed, a +// manager serves the webhook endpoints over the envtest-provisioned serving +// cert, and invalid EtcdCluster objects are applied to assert they are rejected +// with the exact, actionable messages produced by the webhook. + +var ( + whCfg *rest.Config + whEnv *envtest.Environment + whClient client.Client + whTestNS = "default" + whStarted bool +) + +func TestMain(m *testing.M) { + if os.Getenv("KUBEBUILDER_ASSETS") == "" { + // Without envtest binaries we cannot stand up an API server; the pure-unit + // tests in this package still run via the normal `go test` path. Skipping + // here keeps `go vet` / `go build` green on machines without assets. + fmt.Fprintln(os.Stderr, "KUBEBUILDER_ASSETS not set; skipping webhook envtest setup") + os.Exit(m.Run()) + } + + logf.SetLogger(zap.New(zap.UseDevMode(true))) + + envTestK8sVersion := os.Getenv("ENVTEST_K8S_VERSION") + if envTestK8sVersion == "" { + envTestK8sVersion = "1.31.0" + } + + whEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + BinaryAssetsDirectory: filepath.Join( + "..", "..", "bin", "k8s", + fmt.Sprintf("%s-%s-%s", envTestK8sVersion, runtime.GOOS, runtime.GOARCH), + ), + WebhookInstallOptions: envtest.WebhookInstallOptions{ + Paths: []string{filepath.Join("..", "..", "config", "webhook", "manifests.yaml")}, + }, + } + + var err error + whCfg, err = whEnv.Start() + if err != nil { + fmt.Fprintf(os.Stderr, "failed to start webhook envtest: %v\n", err) + os.Exit(1) + } + whStarted = true + + if err := operatorv1alpha1.AddToScheme(scheme.Scheme); err != nil { + fmt.Fprintf(os.Stderr, "failed to add scheme: %v\n", err) + os.Exit(1) + } + + whClient, err = client.New(whCfg, client.Options{Scheme: scheme.Scheme}) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create client: %v\n", err) + os.Exit(1) + } + + // Start a manager serving the webhooks on the envtest-provisioned cert. + wo := whEnv.WebhookInstallOptions + mgr, err := manager.New(whCfg, manager.Options{ + Scheme: scheme.Scheme, + Metrics: server.Options{BindAddress: "0"}, + WebhookServer: webhook.NewServer(webhook.Options{ + Host: wo.LocalServingHost, + Port: wo.LocalServingPort, + CertDir: wo.LocalServingCertDir, + TLSOpts: []func(*tls.Config){func(c *tls.Config) { c.MinVersion = tls.VersionTLS12 }}, + }), + }) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to create manager: %v\n", err) + os.Exit(1) + } + if err := (&operatorv1alpha1.EtcdCluster{}).SetupWebhookWithManager(mgr); err != nil { + fmt.Fprintf(os.Stderr, "failed to register webhook: %v\n", err) + os.Exit(1) + } + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + if err := mgr.Start(ctx); err != nil { + fmt.Fprintf(os.Stderr, "manager exited: %v\n", err) + } + }() + + // Wait for the webhook server to accept connections before running tests. + if err := waitForWebhookServer(wo.LocalServingHost, wo.LocalServingPort); err != nil { + fmt.Fprintf(os.Stderr, "webhook server never became ready: %v\n", err) + cancel() + _ = whEnv.Stop() + os.Exit(1) + } + + code := m.Run() + + cancel() + if err := whEnv.Stop(); err != nil { + fmt.Fprintf(os.Stderr, "failed to stop envtest: %v\n", err) + } + os.Exit(code) +} + +func waitForWebhookServer(host string, port int) error { + addr := fmt.Sprintf("%s:%d", host, port) + dialer := &tls.Dialer{Config: &tls.Config{InsecureSkipVerify: true}} //nolint:gosec // test-only + deadline := time.Now().Add(30 * time.Second) + for time.Now().Before(deadline) { + conn, err := dialer.Dial("tcp", addr) + if err == nil { + _ = conn.Close() + return nil + } + time.Sleep(250 * time.Millisecond) + } + return fmt.Errorf("timed out dialing %s", addr) +} + +func skipIfNoEnvtest(t *testing.T) { + t.Helper() + if !whStarted { + t.Skip("webhook envtest not started (KUBEBUILDER_ASSETS unset)") + } +} + +func mkCluster(name string, size int, version string) *operatorv1alpha1.EtcdCluster { + return &operatorv1alpha1.EtcdCluster{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: whTestNS}, + Spec: operatorv1alpha1.EtcdClusterSpec{Size: size, Version: version}, + } +} + +func TestEnvtest_RejectsEvenSize(t *testing.T) { + skipIfNoEnvtest(t) + err := whClient.Create(context.Background(), mkCluster("even-size", 2, "3.6.1")) + if err == nil { + t.Fatal("expected even size to be rejected by the validating webhook") + } + if !strings.Contains(err.Error(), "size must be an odd number") { + t.Fatalf("unexpected rejection message: %v", err) + } + if !strings.Contains(err.Error(), "Use 1 or 3 instead") { + t.Fatalf("expected actionable remediation in message, got: %v", err) + } +} + +func TestEnvtest_RejectsBadVersion(t *testing.T) { + skipIfNoEnvtest(t) + err := whClient.Create(context.Background(), mkCluster("bad-version", 3, "not-semver")) + if err == nil { + t.Fatal("expected non-semver version to be rejected") + } + if !strings.Contains(err.Error(), "is not a valid semantic version") { + t.Fatalf("unexpected rejection message: %v", err) + } +} + +func TestEnvtest_RejectsUndersizedStorage(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("bad-storage", 3, "3.6.1") + c.Spec.StorageSpec = &operatorv1alpha1.StorageSpec{ + VolumeSizeRequest: resource.MustParse("512Ki"), // below the 1Mi floor + } + err := whClient.Create(context.Background(), c) + if err == nil { + t.Fatal("expected undersized volumeSizeRequest to be rejected by the validating webhook") + } + if !strings.Contains(err.Error(), "volumeSizeRequest must be at least 1Mi") { + t.Fatalf("unexpected storage rejection message: %v", err) + } +} + +func TestEnvtest_RejectsUnknownTLSProvider(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("bad-tls", 3, "3.6.1") + c.Spec.TLS = &operatorv1alpha1.TLSCertificate{Provider: "vault"} + err := whClient.Create(context.Background(), c) + if err == nil { + t.Fatal("expected unknown TLS provider to be rejected") + } + if !strings.Contains(err.Error(), "supported values: \"auto\", \"cert-manager\"") { + t.Fatalf("unexpected rejection message: %v", err) + } +} + +func TestEnvtest_AcceptsValidAndDefaultsTLSProvider(t *testing.T) { + skipIfNoEnvtest(t) + c := mkCluster("valid-defaults", 3, "3.6.1") + c.Spec.TLS = &operatorv1alpha1.TLSCertificate{} // empty provider -> defaulted to "auto" + if err := whClient.Create(context.Background(), c); err != nil { + t.Fatalf("expected valid cluster to be admitted, got: %v", err) + } + t.Cleanup(func() { _ = whClient.Delete(context.Background(), c) }) + + var got operatorv1alpha1.EtcdCluster + if err := whClient.Get(context.Background(), + client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + if got.Spec.TLS == nil || got.Spec.TLS.Provider != "auto" { + t.Fatalf("expected defaulting webhook to set tls.provider=auto, got: %+v", got.Spec.TLS) + } +} + +func TestEnvtest_RejectsSkipMinorUpgrade(t *testing.T) { + skipIfNoEnvtest(t) + ctx := context.Background() + c := mkCluster("upgrade-guard", 3, "3.5.1") + if err := whClient.Create(ctx, c); err != nil { + t.Fatalf("failed to create initial cluster: %v", err) + } + t.Cleanup(func() { _ = whClient.Delete(ctx, c) }) + + var got operatorv1alpha1.EtcdCluster + if err := whClient.Get(ctx, client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + got.Spec.Version = "3.7.1" // skips 3.6 + err := whClient.Update(ctx, &got) + if err == nil { + t.Fatal("expected skip-minor upgrade to be rejected on update") + } + if !strings.Contains(err.Error(), "skips a minor version") { + t.Fatalf("unexpected upgrade rejection message: %v", err) + } + + // A single-minor upgrade must be accepted. + if err := whClient.Get(ctx, client.ObjectKey{Name: c.Name, Namespace: c.Namespace}, &got); err != nil { + t.Fatalf("failed to re-read cluster: %v", err) + } + got.Spec.Version = "3.6.1" + if err := whClient.Update(ctx, &got); err != nil { + t.Fatalf("expected single-minor upgrade to be accepted, got: %v", err) + } +} + +// keep ctrl import used even if the manager construction changes shape. +var _ = ctrl.Log diff --git a/cmd/main.go b/cmd/main.go index a04983ec..af1ee171 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -158,6 +158,17 @@ func main() { setupLog.Error(err, "unable to create controller", "controller", "EtcdCluster") os.Exit(1) } + + // Register the EtcdCluster validating + defaulting admission webhooks. The + // webhook server is already wired above; this is the registration that was + // previously missing (see issue #380). Skip when ENABLE_WEBHOOKS=false, which + // is convenient for local `make run` without serving certs. + if os.Getenv("ENABLE_WEBHOOKS") != "false" { + if err = (&operatorv1alpha1.EtcdCluster{}).SetupWebhookWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create webhook", "webhook", "EtcdCluster") + os.Exit(1) + } + } // +kubebuilder:scaffold:builder if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { diff --git a/config/certmanager/certificate.yaml b/config/certmanager/certificate.yaml new file mode 100644 index 00000000..b2d10ccf --- /dev/null +++ b/config/certmanager/certificate.yaml @@ -0,0 +1,32 @@ +# The following manifests contain a self-signed issuer CR and a certificate CR. +# More document can be found at https://docs.cert-manager.io +# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/name: etcd-operator + app.kubernetes.io/managed-by: kustomize + name: selfsigned-issuer + namespace: system +spec: + selfSigned: {} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/name: etcd-operator + app.kubernetes.io/managed-by: kustomize + name: serving-cert # this name should match the one appeared in kustomizeconfig.yaml + namespace: system +spec: + # SERVICE_NAME and SERVICE_NAMESPACE will be substituted by kustomize + # replacements in the config/default/kustomization.yaml file. + dnsNames: + - SERVICE_NAME.SERVICE_NAMESPACE.svc + - SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: webhook-server-cert diff --git a/config/certmanager/kustomization.yaml b/config/certmanager/kustomization.yaml new file mode 100644 index 00000000..bebea5a5 --- /dev/null +++ b/config/certmanager/kustomization.yaml @@ -0,0 +1,5 @@ +resources: +- certificate.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/certmanager/kustomizeconfig.yaml b/config/certmanager/kustomizeconfig.yaml new file mode 100644 index 00000000..cf6f89e8 --- /dev/null +++ b/config/certmanager/kustomizeconfig.yaml @@ -0,0 +1,8 @@ +# This configuration is for teaching kustomize how to update name ref substitution +nameReference: +- kind: Issuer + group: cert-manager.io + fieldSpecs: + - kind: Certificate + group: cert-manager.io + path: spec/issuerRef/name diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml index 792ef45c..63b8c310 100644 --- a/config/default/kustomization.yaml +++ b/config/default/kustomization.yaml @@ -20,9 +20,9 @@ resources: - ../manager # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- ../webhook +- ../webhook # [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -#- ../certmanager +- ../certmanager # [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. #- ../prometheus # [METRICS] Expose the controller manager metrics service. @@ -43,135 +43,120 @@ patches: # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml -#- path: manager_webhook_patch.yaml +- path: manager_webhook_patch.yaml + target: + kind: Deployment + +# [WEBHOOK] Break-glass escape hatch: add an objectSelector to both fail-closed +# webhook configurations so an admin can opt a single EtcdCluster out of admission +# (label etcd.operator.etcd.io/skip-webhooks=true) while the webhook is down. +- path: webhook_selector_patch.yaml + target: + group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration +- path: webhook_selector_patch.yaml + target: + group: admissionregistration.k8s.io + kind: MutatingWebhookConfiguration -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. -# Uncomment the following replacements to add the cert-manager CA injection annotations -#replacements: -# - source: # Uncomment the following block if you have any webhook -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.name # Name of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 0 -# create: true -# - source: -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.namespace # Namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a ValidatingWebhook (--programmatic-validation) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a DefaultingWebhook (--defaulting ) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# -# - source: # Uncomment the following block if you have a ConversionWebhook (--conversion) -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # Namespace of the certificate CR -# targets: -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # This name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true +# [CERTMANAGER] To enable cert-manager, the following replacements wire the +# cert-manager CA into the webhook configurations and the serving Certificate's +# dnsNames so the API server trusts the webhook server's TLS cert. +replacements: +# Inject the webhook Service name/namespace into the serving Certificate dnsNames. +- source: + kind: Service + version: v1 + name: webhook-service + fieldPath: .metadata.name + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 0 + create: true +- source: + kind: Service + version: v1 + name: webhook-service + fieldPath: .metadata.namespace + targets: + - select: + kind: Certificate + group: cert-manager.io + version: v1 + fieldPaths: + - .spec.dnsNames.0 + - .spec.dnsNames.1 + options: + delimiter: '.' + index: 1 + create: true +# Inject the serving Certificate into the ValidatingWebhookConfiguration CA annotation. +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.namespace + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.name + targets: + - select: + kind: ValidatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true +# Inject the serving Certificate into the MutatingWebhookConfiguration CA annotation. +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.namespace + targets: + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 0 + create: true +- source: + kind: Certificate + group: cert-manager.io + version: v1 + name: serving-cert + fieldPath: .metadata.name + targets: + - select: + kind: MutatingWebhookConfiguration + fieldPaths: + - .metadata.annotations.[cert-manager.io/inject-ca-from] + options: + delimiter: '/' + index: 1 + create: true diff --git a/config/default/manager_webhook_patch.yaml b/config/default/manager_webhook_patch.yaml new file mode 100644 index 00000000..39a7ffe0 --- /dev/null +++ b/config/default/manager_webhook_patch.yaml @@ -0,0 +1,25 @@ +# This patch mounts the webhook server's serving certificate (provisioned by +# cert-manager into the webhook-server-cert secret) into the manager container +# and exposes the webhook port. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: webhook-certs + readOnly: true + volumes: + - name: webhook-certs + secret: + secretName: webhook-server-cert diff --git a/config/default/webhook_selector_patch.yaml b/config/default/webhook_selector_patch.yaml new file mode 100644 index 00000000..db279451 --- /dev/null +++ b/config/default/webhook_selector_patch.yaml @@ -0,0 +1,26 @@ +# Break-glass escape hatch for the fail-closed (failurePolicy: Fail) webhooks. +# +# Both the validating and the mutating webhook fail closed so a bad EtcdCluster +# spec cannot be admitted while the webhook server is unreachable (see the +# "Webhook policy rationale" comment in api/v1alpha1/etcdcluster_webhook.go). The +# rule scope is already narrow (only operator.etcd.io/v1alpha1 etcdclusters), so an +# operator outage never blocks core cluster objects -- but it would still block +# remediating an EtcdCluster CR while the operator is down. +# +# This objectSelector adds a per-object opt-out: an admin can label the single CR +# that needs editing with +# +# etcd.operator.etcd.io/skip-webhooks: "true" +# +# and that object (and only that object) bypasses both webhooks. The default -- +# no label -- still matches, so every EtcdCluster is validated/defaulted unless an +# operator has explicitly turned the webhooks off for break-glass. matchExpressions +# uses NotIn so the absence of the label is treated as "selected". +- op: add + path: /webhooks/0/objectSelector + value: + matchExpressions: + - key: etcd.operator.etcd.io/skip-webhooks + operator: NotIn + values: + - "true" diff --git a/config/webhook/kustomization.yaml b/config/webhook/kustomization.yaml new file mode 100644 index 00000000..9cf26134 --- /dev/null +++ b/config/webhook/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +- manifests.yaml +- service.yaml + +configurations: +- kustomizeconfig.yaml diff --git a/config/webhook/kustomizeconfig.yaml b/config/webhook/kustomizeconfig.yaml new file mode 100644 index 00000000..206316e5 --- /dev/null +++ b/config/webhook/kustomizeconfig.yaml @@ -0,0 +1,22 @@ +# the following config is for teaching kustomize where to look at when substituting nameReference. +# It requires kustomize v2.1.0 or newer to work properly. +nameReference: +- kind: Service + version: v1 + fieldSpecs: + - kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + - kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/name + +namespace: +- kind: MutatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true +- kind: ValidatingWebhookConfiguration + group: admissionregistration.k8s.io + path: webhooks/clientConfig/service/namespace + create: true diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml new file mode 100644 index 00000000..b30a93d6 --- /dev/null +++ b/config/webhook/manifests.yaml @@ -0,0 +1,56 @@ +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /mutate-operator-etcd-io-v1alpha1-etcdcluster + failurePolicy: Fail + matchPolicy: Equivalent + name: metcdcluster-v1alpha1.kb.io + rules: + - apiGroups: + - operator.etcd.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - etcdclusters + sideEffects: None + timeoutSeconds: 10 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: validating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /validate-operator-etcd-io-v1alpha1-etcdcluster + failurePolicy: Fail + matchPolicy: Equivalent + name: vetcdcluster-v1alpha1.kb.io + rules: + - apiGroups: + - operator.etcd.io + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - etcdclusters + sideEffects: None + timeoutSeconds: 10 diff --git a/config/webhook/service.yaml b/config/webhook/service.yaml new file mode 100644 index 00000000..56767793 --- /dev/null +++ b/config/webhook/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: webhook-service + namespace: system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: controller-manager diff --git a/internal/controller/etcdcluster_controller.go b/internal/controller/etcdcluster_controller.go index 867886a3..edf7dbe3 100644 --- a/internal/controller/etcdcluster_controller.go +++ b/internal/controller/etcdcluster_controller.go @@ -182,8 +182,10 @@ func (r *EtcdClusterReconciler) fetchAndValidateState(ctx context.Context, req c currentVersion := stsImage[idx+1:] targetVersion := ec.Spec.Version - // Only handle cases when there is a version change. - if currentVersion != targetVersion { + // Only handle cases when there is a version change. Compare on the + // normalized (v-stripped) form so the v-prefixed live image tag and a bare + // spec.version describing the same release are not seen as a change. + if normalizeEtcdVersion(currentVersion) != normalizeEtcdVersion(targetVersion) { // TODO: consider adding an option in the CRD called allowCustomImageUpgrade to make // the behavior here optional: // https://github.com/etcd-io/etcd-operator/pull/278/changes#r2764717418 diff --git a/internal/controller/utils.go b/internal/controller/utils.go index 3092c379..c37c9e3e 100644 --- a/internal/controller/utils.go +++ b/internal/controller/utils.go @@ -39,6 +39,38 @@ const ( volumeName = "etcd-data" ) +// normalizeEtcdVersion strips an optional leading "v" from an etcd version +// string. The CoreOS go-semver parser (used for upgrade-path checks here and in +// the admission webhook) rejects a leading "v", but the canonical etcd release +// images at the default registry (gcr.io/etcd-development/etcd) are published +// ONLY with v-prefixed tags (e.g. "v3.5.21"). Users therefore legitimately set +// spec.version with or without the "v"; normalizing to the bare semver here lets +// both forms compare and parse identically. +func normalizeEtcdVersion(version string) string { + return strings.TrimPrefix(strings.TrimSpace(version), "v") +} + +// etcdImageTag renders the image tag for spec.version against the configured +// registry. The default etcd-development registry only publishes v-prefixed +// tags, so a bare "3.5.21" would resolve to a NotFound "...:3.5.21" image +// (ImagePullBackOff). We therefore prepend the "v" the registry convention +// expects whenever the user supplied a bare semver. A version the user already +// wrote with a leading "v" is passed through unchanged, and a non-semver/custom +// tag (e.g. a pinned digest-less internal tag) is left exactly as written so +// non-default registries with their own tagging scheme are not disturbed. +func etcdImageTag(version string) string { + v := strings.TrimSpace(version) + if strings.HasPrefix(v, "v") { + return v + } + // Only re-apply the registry's "v" convention to values that are real + // semver; anything else is an explicit custom tag we must not rewrite. + if _, err := semver.NewVersion(v); err != nil { + return v + } + return "v" + v +} + type etcdClusterState string const ( @@ -146,7 +178,7 @@ func createOrPatchStatefulSet(ctx context.Context, logger logr.Logger, ec *ecv1a Name: "etcd", Command: []string{"/usr/local/bin/etcd"}, Args: createArgs(ec.Name, ec.Spec.EtcdOptions), - Image: fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, ec.Spec.Version), + Image: fmt.Sprintf("%s:%s", ec.Spec.ImageRegistry, etcdImageTag(ec.Spec.Version)), Env: []corev1.EnvVar{ { Name: "POD_NAME", @@ -811,11 +843,14 @@ func validateEtcdUpgradePath(etcdVersions []semver.Version, current, target stri currentIdx, targetIdx = -1, -1 ) - currentVer, err = semver.NewVersion(current) + // The live StatefulSet image tag is v-prefixed (the registry convention) while + // spec.version may be bare; normalize both so go-semver can parse them and so a + // "v3.5.21" image vs a "3.5.21" spec is not mistaken for a version change. + currentVer, err = semver.NewVersion(normalizeEtcdVersion(current)) if err != nil { return false, fmt.Errorf("failed to parse current version %s: %w", current, err) } - targetVer, err = semver.NewVersion(target) + targetVer, err = semver.NewVersion(normalizeEtcdVersion(target)) if err != nil { return false, fmt.Errorf("failed to parse target version %s: %w", target, err) } diff --git a/internal/controller/utils_test.go b/internal/controller/utils_test.go index 67592316..2a128b04 100644 --- a/internal/controller/utils_test.go +++ b/internal/controller/utils_test.go @@ -701,6 +701,62 @@ func TestCreateOrPatchStatefulSetWithPodLabels(t *testing.T) { } } +// TestEtcdImageTag guards the registry-convention normalization: the default +// etcd-development registry only publishes v-prefixed tags, so a bare semver +// spec.version must be rendered with a leading "v" (otherwise the image is +// NotFound -> ImagePullBackOff), while an already-v-prefixed value and an +// explicit non-semver custom tag must be passed through unchanged. +func TestEtcdImageTag(t *testing.T) { + tests := []struct { + name string + version string + want string + }{ + {name: "bare semver gets v prefix for the v-tagged registry", version: "3.6.1", want: "v3.6.1"}, + {name: "v-prefixed semver passes through unchanged", version: "v3.6.1", want: "v3.6.1"}, + {name: "default sample version (v-prefixed) is preserved", version: "v3.5.21", want: "v3.5.21"}, + {name: "surrounding whitespace is trimmed", version: " 3.5.21 ", want: "v3.5.21"}, + {name: "non-semver custom tag is left untouched", version: "latest", want: "latest"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, etcdImageTag(tt.version)) + }) + } +} + +// TestRenderedImageRefIsVPrefixed asserts end-to-end that a bare spec.version +// renders a v-prefixed image ref against the default registry, so the StatefulSet +// pulls a tag the registry actually publishes. +func TestRenderedImageRefIsVPrefixed(t *testing.T) { + ctx := t.Context() + logger := log.FromContext(ctx) + + scheme := runtime.NewScheme() + _ = corev1.AddToScheme(scheme) + _ = ecv1alpha1.AddToScheme(scheme) + _ = appsv1.AddToScheme(scheme) + + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + ec := &ecv1alpha1.EtcdCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-etcd", Namespace: "default"}, + Spec: ecv1alpha1.EtcdClusterSpec{ + Size: 3, + Version: "3.5.21", // bare semver, as a user might write it + ImageRegistry: "gcr.io/etcd-development/etcd", + }, + } + + require.NoError(t, createOrPatchStatefulSet(ctx, logger, ec, fakeClient, 3, scheme)) + + sts := &appsv1.StatefulSet{} + require.NoError(t, fakeClient.Get(ctx, client.ObjectKey{Name: "test-etcd", Namespace: "default"}, sts)) + require.Len(t, sts.Spec.Template.Spec.Containers, 1) + assert.Equal(t, "gcr.io/etcd-development/etcd:v3.5.21", + sts.Spec.Template.Spec.Containers[0].Image, + "bare spec.version must render a v-prefixed tag against the v-only etcd-development registry") +} + func TestCreatingArgs(t *testing.T) { tests := []struct { testName string diff --git a/test/config/webhook_render_test.go b/test/config/webhook_render_test.go new file mode 100644 index 00000000..472a468d --- /dev/null +++ b/test/config/webhook_render_test.go @@ -0,0 +1,278 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package config_test renders the deployment kustomization and asserts the +// production-hardening properties of the admission webhook wiring that are easy +// to break silently in YAML: the fail-closed policy and timeouts, the +// break-glass objectSelector, and the cert-manager caBundle injection chain +// (inject-ca-from annotation -> serving Certificate -> webhook Service DNS -> +// mounted secret). These are verified against the actual `kustomize build` +// output rather than the hand-maintained source files so a future edit to any +// link in the chain is caught. +package config_test + +import ( + "bytes" + "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + certmanagerv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" + admissionregv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/yaml" +) + +const skipLabelKey = "etcd.operator.etcd.io/skip-webhooks" + +// repoRoot returns the repository root (two levels up from test/config). +func repoRoot(t *testing.T) string { + t.Helper() + wd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + return filepath.Clean(filepath.Join(wd, "..", "..")) +} + +// kustomizeBin resolves a kustomize binary: the repo-local bin/kustomize that +// `make kustomize` installs, otherwise one on PATH. Returns "" when neither is +// available so the test can skip rather than fail on a bare checkout. +func kustomizeBin(t *testing.T) string { + t.Helper() + local := filepath.Join(repoRoot(t), "bin", "kustomize") + if fi, err := os.Stat(local); err == nil && !fi.IsDir() { + return local + } + if p, err := exec.LookPath("kustomize"); err == nil { + return p + } + return "" +} + +// renderDefault runs `kustomize build config/default` and splits the multi-doc +// output, returning the decoded webhook configurations and serving Certificate. +func renderDefault(t *testing.T) ( + []admissionregv1.ValidatingWebhookConfiguration, + []admissionregv1.MutatingWebhookConfiguration, + []certmanagerv1.Certificate, +) { + t.Helper() + bin := kustomizeBin(t) + if bin == "" { + t.Skip("kustomize not found (run `make kustomize`); skipping config render assertions") + } + root := repoRoot(t) + cmd := exec.Command(bin, "build", filepath.Join(root, "config", "default")) + out, err := cmd.Output() + if err != nil { + if ee, ok := err.(*exec.ExitError); ok { + t.Fatalf("kustomize build failed: %v\nstderr:\n%s", err, ee.Stderr) + } + t.Fatalf("kustomize build failed: %v", err) + } + + var ( + vwcs []admissionregv1.ValidatingWebhookConfiguration + mwcs []admissionregv1.MutatingWebhookConfiguration + certs []certmanagerv1.Certificate + ) + dec := yaml.NewYAMLOrJSONDecoder(bytes.NewReader(out), 4096) + for { + // Decode each document to canonical JSON, peek at its kind, then unmarshal + // into the matching typed object. yaml.Unmarshal on a generic map already + // produces JSON-compatible values, so encoding/json round-trips cleanly. + var raw json.RawMessage + if err := dec.Decode(&raw); err != nil { + break // io.EOF (or a malformed trailing doc) ends the stream. + } + if len(bytes.TrimSpace(raw)) == 0 { + continue + } + var meta metav1.TypeMeta + if err := json.Unmarshal(raw, &meta); err != nil { + continue + } + switch meta.Kind { + case "ValidatingWebhookConfiguration": + var o admissionregv1.ValidatingWebhookConfiguration + mustJSON(t, raw, &o) + vwcs = append(vwcs, o) + case "MutatingWebhookConfiguration": + var o admissionregv1.MutatingWebhookConfiguration + mustJSON(t, raw, &o) + mwcs = append(mwcs, o) + case "Certificate": + var o certmanagerv1.Certificate + mustJSON(t, raw, &o) + certs = append(certs, o) + } + } + return vwcs, mwcs, certs +} + +// mustJSON unmarshals a JSON document into out, failing the test on error. +func mustJSON(t *testing.T, raw json.RawMessage, out any) { + t.Helper() + if err := json.Unmarshal(raw, out); err != nil { + t.Fatalf("decoding %T: %v", out, err) + } +} + +// assertSkipSelector verifies the break-glass objectSelector: absence of the +// skip label must still select the object (NotIn "true"). +func assertSkipSelector(t *testing.T, sel *metav1.LabelSelector, who string) { + t.Helper() + if sel == nil { + t.Fatalf("%s: expected an objectSelector (break-glass escape hatch) but it was nil", who) + } + for _, expr := range sel.MatchExpressions { + if expr.Key != skipLabelKey { + continue + } + if expr.Operator != metav1.LabelSelectorOpNotIn { + t.Fatalf("%s: objectSelector on %q must use NotIn so an unlabeled CR is still validated; got %q", + who, skipLabelKey, expr.Operator) + } + if len(expr.Values) != 1 || expr.Values[0] != "true" { + t.Fatalf("%s: objectSelector on %q must exclude value \"true\"; got %v", + who, skipLabelKey, expr.Values) + } + return + } + t.Fatalf("%s: objectSelector is missing the break-glass key %q", who, skipLabelKey) +} + +func TestWebhookConfigsAreProductionHardened(t *testing.T) { + vwcs, mwcs, _ := renderDefault(t) + + if len(vwcs) != 1 { + t.Fatalf("expected exactly one ValidatingWebhookConfiguration, got %d", len(vwcs)) + } + if len(mwcs) != 1 { + t.Fatalf("expected exactly one MutatingWebhookConfiguration, got %d", len(mwcs)) + } + + check := func(who string, wh admissionregv1.ValidatingWebhook) { + if wh.FailurePolicy == nil || *wh.FailurePolicy != admissionregv1.Fail { + t.Errorf("%s: failurePolicy must be Fail (fail closed for correctness-critical validation); got %v", + who, wh.FailurePolicy) + } + if wh.MatchPolicy == nil || *wh.MatchPolicy != admissionregv1.Equivalent { + t.Errorf("%s: matchPolicy must be Equivalent; got %v", who, wh.MatchPolicy) + } + if wh.TimeoutSeconds == nil || *wh.TimeoutSeconds != 10 { + t.Errorf("%s: timeoutSeconds must be 10; got %v", who, wh.TimeoutSeconds) + } + if wh.SideEffects == nil || *wh.SideEffects != admissionregv1.SideEffectClassNone { + t.Errorf("%s: sideEffects must be None; got %v", who, wh.SideEffects) + } + assertSkipSelector(t, wh.ObjectSelector, who) + } + + for _, wh := range vwcs[0].Webhooks { + check("validating/"+wh.Name, wh) + } + for _, wh := range mwcs[0].Webhooks { + // MutatingWebhook shares the relevant fields; adapt via a tiny shim. + check("mutating/"+wh.Name, admissionregv1.ValidatingWebhook{ + Name: wh.Name, + FailurePolicy: wh.FailurePolicy, + MatchPolicy: wh.MatchPolicy, + TimeoutSeconds: wh.TimeoutSeconds, + SideEffects: wh.SideEffects, + ObjectSelector: wh.ObjectSelector, + }) + } +} + +func TestCertManagerCABundleInjectionWiring(t *testing.T) { + vwcs, mwcs, certs := renderDefault(t) + + if len(certs) != 1 { + t.Fatalf("expected exactly one serving Certificate, got %d", len(certs)) + } + cert := certs[0] + + // The inject-ca-from annotation on each webhook config must point at the + // rendered serving Certificate as namespace/name. + wantRef := cert.Namespace + "/" + cert.Name + const injectAnnotation = "cert-manager.io/inject-ca-from" + + assertInject := func(who string, ann map[string]string) { + got, ok := ann[injectAnnotation] + if !ok { + t.Fatalf("%s: missing %s annotation; cert-manager will not inject the caBundle", who, injectAnnotation) + } + if got != wantRef { + t.Fatalf("%s: %s = %q but the serving Certificate is %q; the CA will be injected from the wrong object", + who, injectAnnotation, got, wantRef) + } + } + assertInject("ValidatingWebhookConfiguration", vwcs[0].Annotations) + assertInject("MutatingWebhookConfiguration", mwcs[0].Annotations) + + // The Certificate must be issued by the rendered self-signed Issuer and write + // the secret the manager mounts (webhook-server-cert). + if cert.Spec.SecretName != "webhook-server-cert" { + t.Errorf("serving Certificate secretName must be webhook-server-cert (mounted by manager_webhook_patch); got %q", + cert.Spec.SecretName) + } + if cert.Spec.IssuerRef.Name == "" || cert.Spec.IssuerRef.Kind != "Issuer" { + t.Errorf("serving Certificate issuerRef must reference the self-signed Issuer; got %+v", cert.Spec.IssuerRef) + } + + // dnsNames must resolve to the actual webhook Service FQDN so the served cert + // validates against the clientConfig.service the API server dials. The + // replacements substitute SERVICE_NAME/SERVICE_NAMESPACE, so no placeholder + // must survive and the svc form must be present. + var sawSvc, sawClusterLocal bool + for _, dn := range cert.Spec.DNSNames { + if dn == "SERVICE_NAME.SERVICE_NAMESPACE.svc" || dn == "SERVICE_NAME.SERVICE_NAMESPACE.svc.cluster.local" { + t.Fatalf("serving Certificate dnsNames still contains an unsubstituted placeholder %q", dn) + } + if strings.HasSuffix(dn, ".svc") { + sawSvc = true + } + if strings.HasSuffix(dn, ".svc.cluster.local") { + sawClusterLocal = true + } + } + if !sawSvc || !sawClusterLocal { + t.Fatalf("serving Certificate dnsNames must include both ..svc and .svc.cluster.local; got %v", + cert.Spec.DNSNames) + } + + // Cross-check the dnsNames name/namespace against the webhook clientConfig the + // API server will dial, so a future rename of the Service can't silently break + // TLS SAN matching. + svcName := vwcs[0].Webhooks[0].ClientConfig.Service.Name + svcNS := vwcs[0].Webhooks[0].ClientConfig.Service.Namespace + wantFQDN := svcName + "." + svcNS + ".svc" + found := false + for _, dn := range cert.Spec.DNSNames { + if dn == wantFQDN { + found = true + } + } + if !found { + t.Fatalf("serving Certificate dnsNames must include the webhook Service FQDN %q so the served cert's "+ + "SAN matches the dialed service; got %v", wantFQDN, cert.Spec.DNSNames) + } +} diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 87d18d30..0aef6099 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -98,6 +98,28 @@ func TestMain(m *testing.M) { return ctx, nil }, + // install cert-manager BEFORE deploy. + // + // The admission-webhook feature wires config/default to reference + // ../certmanager: a cert-manager Certificate/Issuer issues the + // webhook-server-cert secret, and manager_webhook_patch.yaml mounts that + // secret into the manager pod. If cert-manager is absent at deploy time the + // Certificate/Issuer manifests have no CRDs to apply against and the secret + // never materializes, so the manager pod hangs in ContainerCreating and the + // DeploymentAvailable wait below times out before a single webhook assertion + // can run. cert-manager must therefore be installed (and its own webhook + // Ready) here, not lazily inside an individual feature test that runs after + // TestMain. InstallCertManager blocks until cert-manager-webhook is Available. + func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { + log.Println("Installing cert-manager...") + if err := test_utils.InstallCertManager(); err != nil { + log.Printf("Unable to install cert-manager: %s", err) + return ctx, err + } + + return ctx, nil + }, + // set up environment func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { // create namespace diff --git a/test/e2e/webhook_test.go b/test/e2e/webhook_test.go new file mode 100644 index 00000000..7b7e6d17 --- /dev/null +++ b/test/e2e/webhook_test.go @@ -0,0 +1,265 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "strings" + "testing" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/e2e-framework/pkg/envconf" + "sigs.k8s.io/e2e-framework/pkg/features" + + ecv1alpha1 "go.etcd.io/etcd-operator/api/v1alpha1" +) + +// webhookValidVersion is a hardcoded, well-formed semantic version used by every +// webhook test that is NOT specifically exercising version validation. +// +// It deliberately does NOT use the suite-wide etcdVersion (sourced from +// `go list -m {{.Version}} go.etcd.io/etcd/api/v3`), because that value is +// v-prefixed (e.g. "v3.6.12"). semver.NewVersion("v3.6.12") errors, so the new +// validateVersionFormat webhook rejects it. Feeding etcdVersion into these cases +// would (a) make the happy-path "admits valid cluster" assertion fail outright, +// and (b) cause the size/tls/storage reject cases to pass for the WRONG reason -- +// the aggregated rejection would also carry a version error, so a substring match +// could no longer isolate the invariant actually under test. A fixed valid semver +// keeps each assertion proving exactly the one thing it claims to prove. +const webhookValidVersion = "3.6.1" + +// skipWebhooksLabel is the break-glass opt-out label injected as an objectSelector +// by config/default/webhook_selector_patch.yaml. An EtcdCluster carrying this label +// bypasses BOTH admission webhooks (the objectSelector matches every object that +// does NOT have the label set to "true"). +const skipWebhooksLabel = "etcd.operator.etcd.io/skip-webhooks" + +// newWebhookTestCluster builds an EtcdCluster with the given spec parameters. +// It is local to this file so the suite does not depend on helpers added by +// other in-flight e2e PRs. +func newWebhookTestCluster(name string, size int, version string) *ecv1alpha1.EtcdCluster { + return &ecv1alpha1.EtcdCluster{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "operator.etcd.io/v1alpha1", + Kind: "EtcdCluster", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Spec: ecv1alpha1.EtcdClusterSpec{Size: size, Version: version}, + } +} + +// requireWebhookRejection asserts that err is a rejection produced by the +// validating webhook (a structured 422 Invalid), that it names the expected +// spec field path, carries the expected actionable detail, and -- for isolation -- +// does NOT also carry any of the forbidden substrings (errors that belong to a +// different invariant). The isolation check is what catches the "passes for the +// wrong reason" failure mode where an aggregated NewInvalid happens to contain the +// asserted substring while really failing on an unrelated field (e.g. a bad +// version riding along on a size test). +func requireWebhookRejection(t *testing.T, err error, wantField, wantDetail string, forbidden ...string) { + t.Helper() + if err == nil { + t.Fatalf("expected the validating webhook to reject the request (field %q), but Create/Update succeeded", wantField) + } + if !apierrors.IsInvalid(err) { + t.Fatalf("expected an apierrors.IsInvalid (422) rejection from the validating webhook, got %T: %v", err, err) + } + msg := err.Error() + if !strings.Contains(msg, wantField) { + t.Fatalf("expected rejection to name field %q, got: %v", wantField, err) + } + if !strings.Contains(msg, wantDetail) { + t.Fatalf("expected rejection detail %q, got: %v", wantDetail, err) + } + for _, f := range forbidden { + if strings.Contains(msg, f) { + t.Fatalf("rejection leaked an unrelated invariant %q (test is not isolating %q); got: %v", f, wantField, err) + } + } +} + +// TestAdmissionWebhooks verifies that the validating and defaulting admission +// webhooks are registered behind a real cert-manager-issued serving cert and +// enforce the documented invariants at apply time through the live API server. +// Each invalid apply must be rejected synchronously, as a structured Invalid +// (422), with an actionable, field-scoped message; each valid apply must be +// admitted (and defaulted). It additionally proves the two behaviors that only an +// e2e against a live API server can prove: the kustomize CA-injection chain (every +// rejection here is a real API-server -> webhook-pod call over TLS) and the +// break-glass objectSelector opt-out. +// +// Admission is synchronous, so the test body contains no sleeps or polling: a +// successful Create/Update return IS the admission decision. +func TestAdmissionWebhooks(t *testing.T) { + feature := features.New("admission-webhooks") + + // --- Negative cases: each isolates exactly one invariant. ----------------- + + feature.Assess("rejects even cluster size with quorum guidance", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + err := c.Resources().Create(ctx, newWebhookTestCluster("wh-even", 2, webhookValidVersion)) + requireWebhookRejection(t, err, + "spec.size", + "size must be an odd number", + // isolation + remediation: must not be failing on the version, and + // must carry the concrete quorum remediation the envtest twin asserts. + "is not a valid semantic version") + if !strings.Contains(err.Error(), "Use 1 or 3 instead") { + t.Fatalf("expected concrete quorum remediation %q, got: %v", "Use 1 or 3 instead", err) + } + return ctx + }) + + feature.Assess("rejects non-semver version", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + err := c.Resources().Create(ctx, newWebhookTestCluster("wh-badver", 3, "not-a-version")) + requireWebhookRejection(t, err, + "spec.version", + "is not a valid semantic version", + // size is valid (3) here, so a size error would mean the wrong path fired. + "size must be an odd number") + return ctx + }) + + feature.Assess("rejects unknown TLS provider", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-badtls", 3, webhookValidVersion) + cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{Provider: "vault"} + err := c.Resources().Create(ctx, cluster) + requireWebhookRejection(t, err, + "spec.tls.provider", + // exact NotSupported rendering -- catches a garbled supported-values list + // that a loose Contains("auto") && Contains("cert-manager") would miss. + `supported values: "auto", "cert-manager"`, + "size must be an odd number", "is not a valid semantic version") + return ctx + }) + + feature.Assess("rejects undersized storageSpec volumeSizeRequest", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-badstorage", 3, webhookValidVersion) + cluster.Spec.StorageSpec = &ecv1alpha1.StorageSpec{ + VolumeSizeRequest: resource.MustParse("512Ki"), // below the 1Mi floor + } + err := c.Resources().Create(ctx, cluster) + requireWebhookRejection(t, err, + "spec.storageSpec.volumeSizeRequest", + "volumeSizeRequest must be at least 1Mi", + "size must be an odd number", "is not a valid semantic version") + return ctx + }) + + // --- Positive case: admitted + defaulted. --------------------------------- + + feature.Assess("admits valid cluster and defaults tls.provider to auto", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + // size 1 (still odd/valid) minimizes the reconcile load this otherwise + // pure-admission test imposes on the kind node. + cluster := newWebhookTestCluster("wh-valid", 1, webhookValidVersion) + cluster.Spec.TLS = &ecv1alpha1.TLSCertificate{} // empty provider -> defaulted + if err := c.Resources().Create(ctx, cluster); err != nil { + t.Fatalf("expected valid cluster to be admitted, got: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, cluster) }() + + var got ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + if got.Spec.TLS == nil || got.Spec.TLS.Provider != "auto" { + t.Fatalf("expected defaulting webhook to set tls.provider=auto, got: %+v", got.Spec.TLS) + } + return ctx + }) + + // --- Update path: rejects skip-minor, admits a single-minor upgrade. ------ + + feature.Assess("guards version upgrades on update (rejects skip-minor, admits single-minor)", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + cluster := newWebhookTestCluster("wh-upgrade", 1, "3.5.1") + if err := c.Resources().Create(ctx, cluster); err != nil { + t.Fatalf("failed to create initial cluster: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, cluster) }() + + // Skip-minor (3.5 -> 3.7) must be rejected. + var got ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &got); err != nil { + t.Fatalf("failed to read back cluster: %v", err) + } + got.Spec.Version = "3.7.1" // skips 3.6 + err := c.Resources().Update(ctx, &got) + requireWebhookRejection(t, err, "spec.version", "skips a minor version") + + // Single-minor (3.5 -> 3.6) must be admitted. This proves the validator + // is not simply over-blocking ALL updates -- the negative case alone + // cannot distinguish a correct guard from a reject-everything bug. + var cur ecv1alpha1.EtcdCluster + if err := c.Resources().Get(ctx, cluster.Name, namespace, &cur); err != nil { + t.Fatalf("failed to re-read cluster before single-minor update: %v", err) + } + cur.Spec.Version = "3.6.1" + if err := c.Resources().Update(ctx, &cur); err != nil { + t.Fatalf("expected single-minor upgrade 3.5.1 -> 3.6.1 to be admitted, got: %v", err) + } + return ctx + }) + + // --- Break-glass: the objectSelector opt-out. ----------------------------- + // + // This is the assertion that justifies paying the kind+build+deploy cost: it + // cannot be exercised by envtest or the static render test. A wrong operator + // (In vs NotIn) or a wrong value in webhook_selector_patch.yaml would silently + // disable admission fleet-wide; here we prove against a LIVE API server that + // (a) the label bypasses the otherwise-fatal even-size rejection, and + // (b) the very same invalid spec WITHOUT the label is still rejected -- so the + // bypass is genuinely scoped to the labeled object, not globally off. + + feature.Assess("break-glass skip-webhooks label bypasses admission for that object only", + func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { + c := cfg.Client() + + // Labeled: an even size (2) that the webhook would normally reject must be + // ADMITTED because the objectSelector excludes it. + labeled := newWebhookTestCluster("wh-skip-labeled", 2, webhookValidVersion) + labeled.Labels = map[string]string{skipWebhooksLabel: "true"} + if err := c.Resources().Create(ctx, labeled); err != nil { + t.Fatalf("expected break-glass-labeled even-sized cluster to bypass the webhook and be admitted, got: %v", err) + } + defer func() { _ = c.Resources().Delete(ctx, labeled) }() + + // Unlabeled control: the SAME invalid spec must still be rejected, proving + // the bypass is per-object and admission is otherwise live. + unlabeled := newWebhookTestCluster("wh-skip-unlabeled", 2, webhookValidVersion) + err := c.Resources().Create(ctx, unlabeled) + requireWebhookRejection(t, err, "spec.size", "size must be an odd number") + return ctx + }) + + _ = testEnv.Test(t, feature.Feature()) +}