From 5e9977270f40ad073e3e6fd5b9e51b7d49977310 Mon Sep 17 00:00:00 2001
From: Cici Huang <cicih@google.com>
Date: Thu, 5 Jun 2025 01:40:21 -0700
Subject: [PATCH] Add KEP for Implicit Tolerations.

---
 keps/prod-readiness/sig-scheduling/5282.yaml  |   3 +
 .../5282-implicit-tolerations/README.md       | 727 ++++++++++++++++++
 .../5282-implicit-tolerations/kep.yaml        |  43 ++
 3 files changed, 773 insertions(+)
 create mode 100644 keps/prod-readiness/sig-scheduling/5282.yaml
 create mode 100644 keps/sig-scheduling/5282-implicit-tolerations/README.md
 create mode 100644 keps/sig-scheduling/5282-implicit-tolerations/kep.yaml

diff --git a/keps/prod-readiness/sig-scheduling/5282.yaml b/keps/prod-readiness/sig-scheduling/5282.yaml
new file mode 100644
index 00000000000..51c7d7ad17e
--- /dev/null
+++ b/keps/prod-readiness/sig-scheduling/5282.yaml
@@ -0,0 +1,3 @@
+kep-number: 5282
+alpha:
+  approver: "@johnbelamaric"
diff --git a/keps/sig-scheduling/5282-implicit-tolerations/README.md b/keps/sig-scheduling/5282-implicit-tolerations/README.md
new file mode 100644
index 00000000000..17ecd5b0d19
--- /dev/null
+++ b/keps/sig-scheduling/5282-implicit-tolerations/README.md
@@ -0,0 +1,727 @@
+# KEP-5282: Implicit tolerations
+
+<!-- toc -->
+- [Release Signoff Checklist](#release-signoff-checklist)
+- [Summary](#summary)
+- [Motivation](#motivation)
+  - [Goals](#goals)
+  - [Non-Goals](#non-goals)
+- [Proposal](#proposal)
+  - [User Stories (Optional)](#user-stories-optional)
+    - [Story 1 - GPU-Based ML Training](#story-1---gpu-based-ml-training)
+    - [Story 2 - Multi-Optional Resource Requests](#story-2---multi-optional-resource-requests)
+  - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional)
+  - [Risks and Mitigations](#risks-and-mitigations)
+    - [Increased Scheduler Complexity](#increased-scheduler-complexity)
+    - [Inconsistent Scheduling Behavior Across Clusters](#inconsistent-scheduling-behavior-across-clusters)
+    - [API Server Load from Pod Patching](#api-server-load-from-pod-patching)
+- [Design Details](#design-details)
+  - [Plugin Lifecycle](#plugin-lifecycle)
+  - [Scheduler configuration API](#scheduler-configuration-api)
+  - [Pod mutation](#pod-mutation)
+  - [Failure Handling](#failure-handling)
+  - [Test Plan](#test-plan)
+      - [Prerequisite testing updates](#prerequisite-testing-updates)
+      - [Unit tests](#unit-tests)
+      - [Integration tests](#integration-tests)
+      - [e2e tests](#e2e-tests)
+  - [Graduation Criteria](#graduation-criteria)
+    - [Alpha](#alpha)
+  - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy)
+  - [Version Skew Strategy](#version-skew-strategy)
+- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire)
+  - [Feature Enablement and Rollback](#feature-enablement-and-rollback)
+  - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning)
+  - [Monitoring Requirements](#monitoring-requirements)
+  - [Dependencies](#dependencies)
+  - [Scalability](#scalability)
+  - [Troubleshooting](#troubleshooting)
+- [Implementation History](#implementation-history)
+- [Drawbacks](#drawbacks)
+- [Alternatives](#alternatives)
+- [Infrastructure Needed (Optional)](#infrastructure-needed-optional)
+<!-- /toc -->
+
+## Release Signoff Checklist
+
+<!--
+**ACTION REQUIRED:** In order to merge code into a release, there must be an
+issue in [kubernetes/enhancements] referencing this KEP and targeting a release
+milestone **before the [Enhancement Freeze](https://git.k8s.io/sig-release/releases)
+of the targeted release**.
+
+For enhancements that make changes to code or processes/procedures in core
+Kubernetes—i.e., [kubernetes/kubernetes], we require the following Release
+Signoff checklist to be completed.
+
+Check these off as they are completed for the Release Team to track. These
+checklist items _must_ be updated for the enhancement to be released.
+-->
+
+Items marked with (R) are required *prior to targeting to a milestone / release*.
+
+- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR)
+- [ ] (R) KEP approvers have approved the KEP status as `implementable`
+- [ ] (R) Design details are appropriately documented
+- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors)
+  - [ ] e2e Tests for all Beta API Operations (endpoints)
+  - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
+  - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free
+- [ ] (R) Graduation criteria is in place
+  - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) 
+- [ ] (R) Production readiness review completed
+- [ ] (R) Production readiness review approved
+- [ ] "Implementation History" section is up-to-date for milestone
+- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io]
+- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes
+
+<!--
+**Note:** This checklist is iterative and should be reviewed and updated every time this enhancement is being considered for a milestone.
+-->
+
+[kubernetes.io]: https://kubernetes.io/
+[kubernetes/enhancements]: https://git.k8s.io/enhancements
+[kubernetes/kubernetes]: https://git.k8s.io/kubernetes
+[kubernetes/website]: https://git.k8s.io/website
+
+## Summary
+
+This proposal introduces a scheduler-based mechanism to automatically apply tolerations to pods based on their Dynamic Resource Allocation (DRA) requirements. 
+The tolerations are derived from the resource classes and claims bound to a pod and applied within the scheduler during the PreEnqueue extension point.
+
+This avoids reliance on user-specified tolerations or external admission webhooks, reducing friction for users running workloads that consume extended resources like GPUs.
+
+## Motivation
+
+Cluster administrators often taint nodes with special resources (e.g., GPUs, FPGAs, DPUs) to prevent generic workloads from landing on them. 
+For pods to schedule on those nodes, users must add appropriate tolerations, which is often forgotten or error-prone.
+
+DRA introduced a more dynamic and decoupled way to specify resource requirements using ResourceClaims and DeviceClasses. 
+However, these claims are not fully resolved at API admission time, making it impossible for admission-time webhooks or admission plugins to inject correct tolerations.
+
+This proposal moves toleration logic into the scheduler to make it aware of the actual resolved resource requests, 
+enabling dynamic and reliable toleration injection.
+
+### Goals
+
+- Automatically add tolerations to pods based on their resolved DRA ResourceClaims and DeviceClasses.
+
+- Reduce user errors due to forgotten tolerations.
+
+- Avoid scheduling failures for pods that legitimately need tainted resources.
+
+### Non-Goals
+
+- Supporting non-DRA resource inference.
+
+- Inferring tolerations for arbitrary annotations or labels.
+
+- Changing the DRA APIs or Pod API.
+
+- Changing the tolerations mechanism in any way.
+
+## Proposal
+
+Introduce a new PreEnqueue scheduler plugin (or extend the existing TaintToleration plugin) that:
+
+- Depends on the DRA PreEnqueue plugin to ensure all ResourceClaims and DeviceClasses are resolved.
+
+- Inspects the resolved resources and applies appropriate tolerations in-memory.
+
+- Optionally patches the Pod via the API server to persist these tolerations (configurable).
+
+This ensures that pods are only admitted to the scheduling queue when all resource info is ready, and allows toleration logic to be based on full context.
+
+
+### User Stories (Optional)
+
+<!--
+Detail the things that people will be able to do if this KEP is implemented.
+Include as much detail as possible so that people can understand the "how" of
+the system. The goal here is to make this feel real for users without getting
+bogged down.
+-->
+
+#### Story 1 - GPU-Based ML Training
+
+A user submits a pod that uses a DRA claim for a GPU-backed DeviceClass.
+The nodes with GPUs are tainted.
+The user forgets to specify the matching toleration.
+The cluster admin has configured an implicit toleration rule for that DeviceClass in the scheduler configuration.
+The scheduler sees the resolved DeviceClass and injects the toleration.
+The pod schedules successfully.
+
+
+#### Story 2 - Multi-Optional Resource Requests
+A pod declares multiple optional claims. Depending on binding, 
+the scheduler injects tolerations for the correct resource class without user intervention.
+
+### Notes/Constraints/Caveats (Optional)
+
+<!--
+What are the caveats to the proposal?
+What are some important details that didn't come across above?
+Go in to as much detail as necessary here.
+This might be a good place to talk about core concepts and how they relate.
+-->
+
+### Risks and Mitigations
+
+#### Increased Scheduler Complexity
+
+Introducing dynamic toleration injection logic into the scheduler increases the cognitive and maintenance complexity of the scheduling logic.
+
+Mitigation: Isolate the logic within a dedicated PreEnqueue plugin to limit impact on other scheduling components. 
+Maintain strict test coverage and fuzzing for rule evaluation.
+
+#### Inconsistent Scheduling Behavior Across Clusters
+
+Clusters with different plugin configurations may behave differently for identical pods, potentially surprising users.
+
+Mitigation: Clearly document expected behavior and make the plugin opt-in. 
+Encourage platform providers to standardize configurations where applicable.
+
+#### API Server Load from Pod Patching
+
+The scheduler might issue one PATCH request per scheduled pod, increasing API server traffic and write load.
+
+Mitigation: Make PATCHing optional and off by default. Coordinate with SIG Scalability to assess and benchmark impact before enabling by default.
+
+## Design Details
+
+### Plugin Lifecycle
+
+Introduce a new PreEnqueue scheduler plugin (or extend the existing TaintToleration plugin) into the scheduler plugin lifecycle:
+
+- Implement a PreEnqueue plugin.
+
+- Ensure it runs after DRA readiness is guaranteed.
+
+- Inspect bound ResourceClaims and DeviceClasses.
+
+- Inject tolerations based on scheduler config rules.
+
+### Scheduler configuration API
+
+The plugin will be configurable through the scheduler configuration (`KubeSchedulerConfiguration`), using `pluginConfig` entries.
+The API will support defining toleration rules based on device class selectors.
+
+```go
+type ImplicitTolerationRule struct { // one configured rule: a selector plus the toleration to inject
+  Selector RuleSelector // identifies the resource requests this rule applies to
+  Toleration corev1.Toleration // toleration to inject; presumably applied when Selector matches (confirm in implementation)
+}
+
+type RuleSelector struct { // describes how a rule selects resources; Type picks which field below is consulted
+  Type string // selector kind; the values referenced below are 'ExtendedResource', 'Device' and 'CEL'
+
+  // for Type == 'ExtendedResource'
+  ResourceNames []string
+
+  // for Type == 'Device'
+  DevicePrototypes []resourcev1.Device
+
+  // for Type == 'CEL'
+  Expression *string
+}
+
+type ImplicitTolerationArgs struct { // plugin arguments, supplied through the scheduler's pluginConfig
+  TolerationRules []ImplicitTolerationRule // rules used to decide which tolerations to inject
+  PatchPods       bool                     // when true, injected tolerations are also persisted by patching the Pod via the API server
+}
+```
+
+### Pod mutation
+
+To keep users and other components informed, the tolerations could also be persisted on the Pod object.
+An asynchronous call to the API server would be needed to make the patch.
+The asynchronous API call discussion happens in: https://github.com/kubernetes/enhancements/issues/5229
+
+### Failure Handling
+
+If ResourceClaims are not resolved, DRA plugin returns Unschedulable.
+This plugin never runs until DRA PreEnqueue permits.
+
+### Test Plan
+
+<!--
+**Note:** *Not required until targeted at a release.*
+The goal is to ensure that we don't accept enhancements with inadequate testing.
+
+All code is expected to have adequate tests (eventually with coverage
+expectations). Please adhere to the [Kubernetes testing guidelines][testing-guidelines]
+when drafting this test plan.
+
+[testing-guidelines]: https://git.k8s.io/community/contributors/devel/sig-testing/testing.md
+-->
+
+[X] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+<!--
+Based on reviewers feedback describe what additional tests need to be added prior
+implementing this enhancement to ensure the enhancements have also solid foundations.
+-->
+
+##### Unit tests
+
+- `<package>`: `<date>` - `<test coverage>`
+
+##### Integration tests
+
+- [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/integration/...): [integration master](https://testgrid.k8s.io/sig-release-master-blocking#integration-master?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature)
+
+##### e2e tests
+
+- [test name](https://github.com/kubernetes/kubernetes/blob/2334b8469e1983c525c0c6382125710093a25883/test/e2e/...): [SIG ...](https://testgrid.k8s.io/sig-...?include-filter-by-regex=MyCoolFeature), [triage search](https://storage.googleapis.com/k8s-triage/index.html?test=MyCoolFeature)
+
+### Graduation Criteria
+#### Alpha
+
+- Feature implemented behind a feature flag
+- Initial e2e tests completed and enabled
+
+<!--
+**Note:** *Not required until targeted at a release.*
+
+Define graduation milestones.
+
+These may be defined in terms of API maturity, [feature gate] graduations, or as
+something else. The KEP should keep this high-level with a focus on what
+signals will be looked at to determine graduation.
+
+Consider the following in developing the graduation criteria for this enhancement:
+- [Maturity levels (`alpha`, `beta`, `stable`)][maturity-levels]
+- [Feature gate][feature gate] lifecycle
+- [Deprecation policy][deprecation-policy]
+
+Clearly define what graduation means by either linking to the [API doc
+definition](https://kubernetes.io/docs/concepts/overview/kubernetes-api/#api-versioning)
+or by redefining what graduation means.
+
+In general we try to use the same stages (alpha, beta, GA), regardless of how the
+functionality is accessed.
+
+[feature gate]: https://git.k8s.io/community/contributors/devel/sig-architecture/feature-gates.md
+[maturity-levels]: https://git.k8s.io/community/contributors/devel/sig-architecture/api_changes.md#alpha-beta-and-stable-versions
+[deprecation-policy]: https://kubernetes.io/docs/reference/using-api/deprecation-policy/
+
+Below are some examples to consider, in addition to the aforementioned [maturity levels][maturity-levels].
+
+#### Alpha
+
+- Feature implemented behind a feature flag
+- Initial e2e tests completed and enabled
+
+#### Beta
+
+- Gather feedback from developers and surveys
+- Complete features A, B, C
+- Additional tests are in Testgrid and linked in KEP
+- More rigorous forms of testing—e.g., downgrade tests and scalability tests
+- All functionality completed
+- All security enforcement completed
+- All monitoring requirements completed
+- All testing requirements completed
+- All known pre-release issues and gaps resolved 
+
+**Note:** Beta criteria must include all functional, security, monitoring, and testing requirements along with resolving all issues and gaps identified
+
+#### GA
+
+- N examples of real-world usage
+- N installs
+- Allowing time for feedback
+- All issues and gaps identified as feedback during beta are resolved
+
+**Note:** GA criteria must not include any functional, security, monitoring, or testing requirements.  Those must be beta requirements.
+
+**Note:** Generally we also wait at least two releases between beta and
+GA/stable, because there's no opportunity for user feedback, or even bug reports,
+in back-to-back releases.
+
+**For non-optional features moving to GA, the graduation criteria must include
+[conformance tests].**
+
+[conformance tests]: https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md
+
+#### Deprecation
+
+- Announce deprecation and support policy of the existing flag
+- Two versions passed since introducing the functionality that deprecates the flag (to address version skew)
+- Address feedback on usage/changed behavior, provided on GitHub issues
+- Deprecate the flag
+-->
+
+### Upgrade / Downgrade Strategy
+
+Implicit Toleration gets disabled when downgrading to a release without support for it or
+when disabling the feature. The effect is as if the toleration weren't set.
+
+### Version Skew Strategy
+
+During version skew where the apiserver supports the feature and the scheduler
+doesn't, an older scheduler will simply not be configured with the plugin or the plugin will be disabled by default. 
+No impact to API server behavior.
+
+During version skew where the apiserver does not support the feature and the scheduler
+does, when the patching of Pods is enabled, the API server may reject unknown toleration keys or object structure.
+
+Mitigation: The scheduler should treat failed PATCH operations as non-fatal and proceed with in-memory toleration injection.
+
+## Production Readiness Review Questionnaire
+
+### Feature Enablement and Rollback
+
+<!--
+This section must be completed when targeting alpha to a release.
+-->
+
+###### How can this feature be enabled / disabled in a live cluster?
+
+<!--
+Pick one of these and delete the rest.
+
+Documentation is available on [feature gate lifecycle] and expectations, as
+well as the [existing list] of feature gates.
+
+[feature gate lifecycle]: https://git.k8s.io/community/contributors/devel/sig-architecture/feature-gates.md
+[existing list]: https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/
+-->
+
+- [x] Feature gate (also fill in values in `kep.yaml`)
+  - Feature gate name: ImplicitTolerations
+  - Components depending on the feature gate: kube-scheduler, kube-apiserver
+
+###### Does enabling the feature change any default behavior?
+
+<!--
+Any change of default behavior may be surprising to users or break existing
+automations, so be extremely careful here.
+-->
+No. 
+
+###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)?
+
+<!--
+Describe the consequences on existing workloads (e.g., if this is a runtime
+feature, can it break the existing applications?).
+
+Feature gates are typically disabled by setting the flag to `false` and
+restarting the component. No other changes should be necessary to disable the
+feature.
+
+NOTE: Also set `disable-supported` to `true` or `false` in `kep.yaml`.
+-->
+Yes. It can be disabled through the feature gate added.
+
+###### What happens if we reenable the feature if it was previously rolled back?
+
+The implicit tolerations will take effect again for scheduling once the feature is re-enabled.
+
+###### Are there any tests for feature enablement/disablement?
+
+This will be covered through unit tests and e2e tests for the apiserver and scheduler.
+
+
+### Rollout, Upgrade and Rollback Planning
+
+<!--
+This section must be completed when targeting beta to a release.
+-->
+
+###### How can a rollout or rollback fail? Can it impact already running workloads?
+
+<!--
+Try to be as paranoid as possible - e.g., what if some components will restart
+mid-rollout?
+
+Be sure to consider highly-available clusters, where, for example,
+feature flags will be enabled on some API servers and not others during the
+rollout. Similarly, consider large clusters and how enablement/disablement
+will rollout across nodes.
+-->
+
+###### What specific metrics should inform a rollback?
+
+<!--
+What signals should users be paying attention to when the feature is young
+that might indicate a serious problem?
+-->
+
+###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested?
+
+<!--
+Describe manual testing that was done and the outcomes.
+Longer term, we may want to require automated upgrade/rollback tests, but we
+are missing a bunch of machinery and tooling and can't do that now.
+-->
+
+###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.?
+
+<!--
+Even if applying deprecation policies, they may still surprise some users.
+-->
+
+### Monitoring Requirements
+
+<!--
+This section must be completed when targeting beta to a release.
+
+For GA, this section is required: approvers should be able to confirm the
+previous answers based on experience in the field.
+-->
+
+###### How can an operator determine if the feature is in use by workloads?
+
+<!--
+Ideally, this should be a metric. Operations against the Kubernetes API (e.g.,
+checking if there are objects with field X set) may be a last resort. Avoid
+logs or events for this purpose.
+-->
+
+###### How can someone using this feature know that it is working for their instance?
+
+<!--
+For instance, if this is a pod-related feature, it should be possible to determine if the feature is functioning properly
+for each individual pod.
+Pick one more of these and delete the rest.
+Please describe all items visible to end users below with sufficient detail so that they can verify correct enablement
+and operation of this feature.
+Recall that end users cannot usually observe component logs or access metrics.
+-->
+
+- [ ] Events
+  - Event Reason: 
+- [ ] API .status
+  - Condition name: 
+  - Other field: 
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### What are the reasonable SLOs (Service Level Objectives) for the enhancement?
+
+<!--
+This is your opportunity to define what "normal" quality of service looks like
+for a feature.
+
+It's impossible to provide comprehensive guidance, but at the very
+high level (needs more precise definitions) those may be things like:
+  - per-day percentage of API calls finishing with 5XX errors <= 1%
+  - 99% percentile over day of absolute value from (job creation time minus expected
+    job creation time) for cron job <= 10%
+  - 99.9% of /health requests per day finish with 200 code
+
+These goals will help you determine what you need to measure (SLIs) in the next
+question.
+-->
+
+###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?
+
+<!--
+Pick one more of these and delete the rest.
+-->
+
+- [ ] Metrics
+  - Metric name:
+  - [Optional] Aggregation method:
+  - Components exposing the metric:
+- [ ] Other (treat as last resort)
+  - Details:
+
+###### Are there any missing metrics that would be useful to have to improve observability of this feature?
+
+<!--
+Describe the metrics themselves and the reasons why they weren't added (e.g., cost,
+implementation difficulties, etc.).
+-->
+
+### Dependencies
+
+<!--
+This section must be completed when targeting beta to a release.
+-->
+
+###### Does this feature depend on any specific services running in the cluster?
+
+<!--
+Think about both cluster-level services (e.g. metrics-server) as well
+as node-level agents (e.g. specific version of CRI). Focus on external or
+optional services that are needed. For example, if this feature depends on
+a cloud provider API, or upon an external software-defined storage or network
+control plane.
+
+For each of these, fill in the following—thinking about running existing user workloads
+and creating new ones, as well as about cluster-level services (e.g. DNS):
+  - [Dependency name]
+    - Usage description:
+      - Impact of its outage on the feature:
+      - Impact of its degraded performance or high-error rates on the feature:
+-->
+
+### Scalability
+
+<!--
+For alpha, this section is encouraged: reviewers should consider these questions
+and attempt to answer them.
+
+For beta, this section is required: reviewers must answer these questions.
+
+For GA, this section is required: approvers should be able to confirm the
+previous answers based on experience in the field.
+-->
+
+###### Will enabling / using this feature result in any new API calls?
+
+<!--
+Describe them, providing:
+  - API call type (e.g. PATCH pods)
+  - estimated throughput
+  - originating component(s) (e.g. Kubelet, Feature-X-controller)
+Focusing mostly on:
+  - components listing and/or watching resources they didn't before
+  - API calls that may be triggered by changes of some Kubernetes resources
+    (e.g. update of object X triggers new updates of object Y)
+  - periodic API calls to reconcile state (e.g. periodic fetching state,
+    heartbeats, leader election, etc.)
+-->
+
+###### Will enabling / using this feature result in introducing new API types?
+
+<!--
+Describe them, providing:
+  - API type
+  - Supported number of objects per cluster
+  - Supported number of objects per namespace (for namespace-scoped objects)
+-->
+
+###### Will enabling / using this feature result in any new calls to the cloud provider?
+
+<!--
+Describe them, providing:
+  - Which API(s):
+  - Estimated increase:
+-->
+
+###### Will enabling / using this feature result in increasing size or count of the existing API objects?
+
+<!--
+Describe them, providing:
+  - API type(s):
+  - Estimated increase in size: (e.g., new annotation of size 32B)
+  - Estimated amount of new objects: (e.g., new Object X for every existing Pod)
+-->
+
+###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs?
+
+<!--
+Look at the [existing SLIs/SLOs].
+
+Think about adding additional work or introducing new steps in between
+(e.g. need to do X to start a container), etc. Please describe the details.
+
+[existing SLIs/SLOs]: https://git.k8s.io/community/sig-scalability/slos/slos.md#kubernetes-slisslos
+-->
+
+###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components?
+
+<!--
+Things to keep in mind include: additional in-memory state, additional
+non-trivial computations, excessive access to disks (including increased log
+volume), significant amount of data sent and/or received over network, etc.
+This through this both in small and large cases, again with respect to the
+[supported limits].
+
+[supported limits]: https://git.k8s.io/community//sig-scalability/configs-and-limits/thresholds.md
+-->
+
+###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)?
+
+<!--
+Focus not just on happy cases, but primarily on more pathological cases
+(e.g. probes taking a minute instead of milliseconds, failed pods consuming resources, etc.).
+If any of the resources can be exhausted, how this is mitigated with the existing limits
+(e.g. pods per node) or new limits added by this KEP?
+
+Are there any tests that were run/should be run to understand performance characteristics better
+and validate the declared limits?
+-->
+
+### Troubleshooting
+
+<!--
+This section must be completed when targeting beta to a release.
+
+For GA, this section is required: approvers should be able to confirm the
+previous answers based on experience in the field.
+
+The Troubleshooting section currently serves the `Playbook` role. We may consider
+splitting it into a dedicated `Playbook` document (potentially with some monitoring
+details). For now, we leave it here.
+-->
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+###### What are other known failure modes?
+
+<!--
+For each of them, fill in the following information by copying the below template:
+  - [Failure mode brief description]
+    - Detection: How can it be detected via metrics? Stated another way:
+      how can an operator troubleshoot without logging into a master or worker node?
+    - Mitigations: What can be done to stop the bleeding, especially for already
+      running user workloads?
+    - Diagnostics: What are the useful log messages and their required logging
+      levels that could help debug the issue?
+      Not required until feature graduated to beta.
+    - Testing: Are there any tests for failure mode? If not, describe why.
+-->
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+
+## Implementation History
+
+<!--
+Major milestones in the lifecycle of a KEP should be tracked in this section.
+Major milestones might include:
+- the `Summary` and `Motivation` sections being merged, signaling SIG acceptance
+- the `Proposal` section being merged, signaling agreement on a proposed design
+- the date implementation started
+- the first Kubernetes release where an initial version of the KEP was available
+- the version of Kubernetes where the KEP graduated to general availability
+- when the KEP was retired or superseded
+-->
+
+## Drawbacks
+
+<!--
+Why should this KEP _not_ be implemented?
+-->
+
+## Alternatives
+
+1. Admission Webhook
+
+Not viable due to incomplete resource resolution at admission time.
+
+2. Controller-Based Solution
+
+Separate controller updates pod with tolerations post-claim resolution.
+Pros: 
+  - Clean separation
+  - avoids scheduler API writes
+Cons: 
+  - Adds latency
+  - introduces new component
+
+
+## Infrastructure Needed (Optional)
+
+<!--
+Use this section if you need things from the project/SIG. Examples include a
+new subproject, repos requested, or GitHub details. Listing these here allows a
+SIG to get the process for these resources started right away.
+-->
diff --git a/keps/sig-scheduling/5282-implicit-tolerations/kep.yaml b/keps/sig-scheduling/5282-implicit-tolerations/kep.yaml
new file mode 100644
index 00000000000..4a22d1248ac
--- /dev/null
+++ b/keps/sig-scheduling/5282-implicit-tolerations/kep.yaml
@@ -0,0 +1,43 @@
+title: Implicit tolerations
+kep-number: 5282
+authors:
+  - "@cici37"
+owning-sig: sig-scheduling
+participating-sigs:
+  - sig-scheduling
+status: implementable
+creation-date: 2025-06-04
+reviewers:
+  - "@sanposhiho"
+  - "@dom4ha"
+  - "@johnbelamaric"
+approvers:
+  - "@sanposhiho"
+  - "@dom4ha"
+  - "@johnbelamaric"
+
+# The target maturity stage in the current dev cycle for this KEP.
+# If the purpose of this KEP is to deprecate a user-visible feature
+# and a Deprecated feature gates are added, they should be deprecated|disabled|removed.
+stage: alpha
+
+# The most recent milestone for which work toward delivery of this KEP has been
+# done. This can be the current (upcoming) milestone, if it is being actively
+# worked on.
+latest-milestone: "v1.34"
+
+# The milestone at which this feature was, or is targeted to be, at each stage.
+milestone:
+  alpha: "v1.34"
+
+# The following PRR answers are required at alpha release
+# List the feature gate name and the components for which it must be enabled
+feature-gates:
+  - name: ImplicitTolerations
+    components:
+      - kube-apiserver
+      - kube-scheduler
+disable-supported: true
+
+# The following PRR answers are required at beta release
+metrics: