tests/v1/core/test_scheduler.py — 1 change: 0 additions & 1 deletion
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
 ) -> None:
     """Validate chunked prefill settings in the scheduler config for
     encoder-decoder models."""
-    assert scheduler_config.chunked_prefill_enabled is expect_enabled
     assert scheduler_config.enable_chunked_prefill is expect_enabled
     if is_encoder_decoder:
         # Encoder-decoder models should automatically disable chunked multimodal
tests/v1/e2e/test_spec_decode.py — 10 changes: 4 additions & 6 deletions
@@ -272,7 +272,7 @@ def test_speculators_model_integration(
 
 
 @pytest.mark.parametrize(
-    ["model_setup", "mm_enabled", "chunked_prefill_enabled"],
+    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
     [
         (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
         pytest.param(
@@ -358,7 +358,7 @@ def test_eagle_correctness(
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
-    chunked_prefill_enabled: bool,
+    enable_chunked_prefill: bool,
     attn_backend: str,
 ):
     if attn_backend == "TREE_ATTN":
@@ -396,9 +396,7 @@ def test_eagle_correctness(
 
     method, model_name, spec_model_name, tp_size = model_setup
     max_model_len = 2048
-    max_num_batched_tokens = max_model_len
-    if chunked_prefill_enabled:
-        max_num_batched_tokens = 128
+    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
     ref_llm = LLM(
         model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
@@ -420,7 +418,7 @@ def test_eagle_correctness(
         },
         max_model_len=max_model_len,
         max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=chunked_prefill_enabled,
+        enable_chunked_prefill=enable_chunked_prefill,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
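The parametrize rename and the signature rename above have to land together, because pytest binds each name in argnames to the test parameter with the same name. A minimal standalone sketch of that pattern (hypothetical test and placeholder model names, not from this suite):

import pytest


# pytest matches each entry in argnames to a function parameter of the same
# name, so "enable_chunked_prefill" must appear in both places.
@pytest.mark.parametrize(
    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
    [
        (("eagle3", "dummy/model", "dummy/drafter", 1), False, False),
        (("eagle3", "dummy/model", "dummy/drafter", 1), False, True),
    ],
)
def test_token_budget_selection(
    model_setup: tuple[str, str, str, int],
    mm_enabled: bool,
    enable_chunked_prefill: bool,
):
    max_model_len = 2048
    # With chunked prefill on, a small budget (128) forces prefills to be
    # chunked; otherwise the budget matches the full model length.
    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
    assert max_num_batched_tokens == (128 if enable_chunked_prefill else 2048)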
tests/v1/engine/test_engine_core.py — 2 changes: 1 addition & 1 deletion
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
     )
 
     # Check 5: Verify chunked prefill is disabled
-    assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+    assert not vllm_config.scheduler_config.enable_chunked_prefill, (
         "Encoder instance should disable chunked prefill (no KV cache)"
     )
 
vllm/config/scheduler.py — 12 changes: 2 additions & 10 deletions
@@ -232,19 +232,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 self.long_prefill_token_threshold,
             )
 
-    @property
-    def chunked_prefill_enabled(self) -> bool:
-        return self.enable_chunked_prefill
-
-    @chunked_prefill_enabled.setter
-    def chunked_prefill_enabled(self, value: bool):
-        self.enable_chunked_prefill = value
-
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if (
             self.max_num_batched_tokens < self.max_model_len
-            and not self.chunked_prefill_enabled
+            and not self.enable_chunked_prefill
         ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
@@ -271,7 +263,7 @@ def _verify_args(self) -> Self:
             )
 
         if self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
+            if not self.enable_chunked_prefill:
                 raise ValueError(
                     "Chunked prefill must be enabled to set "
                     "max_num_partial_prefills > 1."
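The lines removed above were a plain read/write alias delegating to enable_chunked_prefill. An out-of-tree integration that still spells the old name could keep a shim of the same shape on its own wrapper; a minimal sketch under that assumption (hypothetical MySchedulerConfig, and the DeprecationWarning is an added illustration, not vLLM behavior):

from dataclasses import dataclass
import warnings


@dataclass
class MySchedulerConfig:  # hypothetical stand-in, not vllm.config.SchedulerConfig
    enable_chunked_prefill: bool = False

    # Same read/write delegation pattern as the removed vLLM alias.
    @property
    def chunked_prefill_enabled(self) -> bool:
        warnings.warn(
            "chunked_prefill_enabled is deprecated; use enable_chunked_prefill",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.enable_chunked_prefill

    @chunked_prefill_enabled.setter
    def chunked_prefill_enabled(self, value: bool) -> None:
        self.enable_chunked_prefill = value


cfg = MySchedulerConfig()
cfg.chunked_prefill_enabled = True  # still lands on the canonical field
assert cfg.enable_chunked_prefill is True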
vllm/config/vllm.py — 6 changes: 3 additions & 3 deletions
@@ -411,7 +411,7 @@ def __post_init__(self):
 
         if (
             self.model_config is not None
-            and self.scheduler_config.chunked_prefill_enabled
+            and self.scheduler_config.enable_chunked_prefill
             and self.model_config.dtype == torch.float32
             and current_platform.get_device_capability() == (7, 5)
         ):
@@ -584,7 +584,7 @@ def __post_init__(self):
         ):
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
-            self.scheduler_config.chunked_prefill_enabled = False
+            self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.long_prefill_token_threshold = 0
 
         if self.cache_config is not None:
@@ -1026,7 +1026,7 @@ def __str__(self):
             f"seed={self.model_config.seed}, "
             f"served_model_name={self.model_config.served_model_name}, "
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
-            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
+            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, " # noqa
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}"
         )
vllm/platforms/cpu.py — 2 changes: 1 addition & 1 deletion
@@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         scheduler_config = vllm_config.scheduler_config
         if (
-            scheduler_config.chunked_prefill_enabled
+            scheduler_config.enable_chunked_prefill
             or cache_config.enable_prefix_caching
         ) and cache_config.cache_dtype != "auto":
             raise RuntimeError(
vllm/v1/core/sched/scheduler.py — 2 changes: 1 addition & 1 deletion
@@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput:
                 # chunked prefill has to be enabled explicitly to allow
                 # pooling requests to be chunked
                 if (
-                    not self.scheduler_config.chunked_prefill_enabled
+                    not self.scheduler_config.enable_chunked_prefill
                     and num_new_tokens > token_budget
                 ):
                     self.waiting.pop_request()
vllm/v1/engine/core.py — 2 changes: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ def __init__(
             # Encoder models without KV cache don't support
            # chunked prefill. But do SSM models?
             logger.info("Disabling chunked prefill for model without KVCache")
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.enable_chunked_prefill = False
 
         scheduler_block_size = (
             vllm_config.cache_config.block_size
vllm/v1/worker/gpu_model_runner.py — 4 changes: 2 additions & 2 deletions
@@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]:
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if self.scheduler_config.chunked_prefill_enabled:
+        if self.scheduler_config.enable_chunked_prefill:
             if "token_embed" in supported_tasks:
                 supported_tasks.remove("token_embed")
             if "token_classify" in supported_tasks:
@@ -3825,7 +3825,7 @@ def _dummy_pooler_run(
         supported_pooling_tasks = self.get_supported_pooling_tasks()
 
         if not supported_pooling_tasks:
-            if self.scheduler_config.chunked_prefill_enabled:
+            if self.scheduler_config.enable_chunked_prefill:
                 raise RuntimeError(
                     f"Model {self.model_config.model} does not support "
                     "any pooling tasks with chunked prefill enabled. "
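The first model-runner hunk drops token-level pooling tasks whenever chunked prefill is enabled; a minimal standalone sketch of that filtering rule (hypothetical helper, not the vLLM method):

def filter_pooling_tasks(
    supported_tasks: list[str], enable_chunked_prefill: bool
) -> list[str]:
    """Drop token-level pooling tasks when chunked prefill is enabled."""
    if not enable_chunked_prefill:
        return list(supported_tasks)
    return [t for t in supported_tasks if t not in ("token_embed", "token_classify")]


# With chunked prefill enabled, only sequence-level tasks remain.
assert filter_pooling_tasks(["embed", "token_embed", "token_classify"], True) == ["embed"]
assert filter_pooling_tasks(["embed", "token_embed"], False) == ["embed", "token_embed"]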