1 change: 0 additions & 1 deletion tests/v1/core/test_scheduler.py
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
 ) -> None:
     """Validate chunked prefill settings in the scheduler config for
     encoder-decoder models."""
-    assert scheduler_config.chunked_prefill_enabled is expect_enabled
     assert scheduler_config.enable_chunked_prefill is expect_enabled
     if is_encoder_decoder:
         # Encoder-decoder models should automatically disable chunked multimodal
10 changes: 4 additions & 6 deletions tests/v1/e2e/test_spec_decode.py
@@ -272,7 +272,7 @@ def test_speculators_model_integration(


 @pytest.mark.parametrize(
-    ["model_setup", "mm_enabled", "chunked_prefill_enabled"],
+    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
     [
         (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
         pytest.param(
@@ -358,7 +358,7 @@ def test_eagle_correctness(
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
-    chunked_prefill_enabled: bool,
+    enable_chunked_prefill: bool,
     attn_backend: str,
 ):
     if attn_backend == "TREE_ATTN":
@@ -396,9 +396,7 @@ def test_eagle_correctness(

     method, model_name, spec_model_name, tp_size = model_setup
     max_model_len = 2048
-    max_num_batched_tokens = max_model_len
-    if chunked_prefill_enabled:
-        max_num_batched_tokens = 128
+    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len

     ref_llm = LLM(
         model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
@@ -420,7 +418,7 @@ def test_eagle_correctness(
         },
         max_model_len=max_model_len,
         max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=chunked_prefill_enabled,
+        enable_chunked_prefill=enable_chunked_prefill,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
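Note: pytest binds each name in the @pytest.mark.parametrize list to the test function argument of the same name, which is why the rename above touches both the parameter list and the signature. A minimal sketch of the pattern (hypothetical test, not part of this PR):

import pytest


@pytest.mark.parametrize(
    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
    [
        (("eagle3", "model-a", "draft-a", 1), False, False),
        (("eagle3", "model-b", "draft-b", 1), False, True),
    ],
)
def test_example(model_setup, mm_enabled, enable_chunked_prefill):
    # Each argument name must match an entry in the parametrize list above.
    max_num_batched_tokens = 128 if enable_chunked_prefill else 2048
    assert max_num_batched_tokens in (128, 2048)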
2 changes: 1 addition & 1 deletion tests/v1/engine/test_engine_core.py
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
     )

     # Check 5: Verify chunked prefill is disabled
-    assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+    assert not vllm_config.scheduler_config.enable_chunked_prefill, (
         "Encoder instance should disable chunked prefill (no KV cache)"
     )

11 changes: 8 additions & 3 deletions vllm/config/scheduler.py
@@ -8,7 +8,7 @@

 from pydantic import Field, field_validator, model_validator
 from pydantic.dataclasses import dataclass
-from typing_extensions import Self
+from typing_extensions import Self, deprecated

 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -233,6 +233,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
         )

     @property
+    @deprecated(
+        "`SchedulerConfig.chunked_prefill_enabled` has been renamed to "
+        "`SchedulerConfig.enable_chunked_prefill`. "
+        "The old name will be removed in v0.12."
+    )
     def chunked_prefill_enabled(self) -> bool:
         return self.enable_chunked_prefill

@@ -244,7 +249,7 @@ def chunked_prefill_enabled(self, value: bool):
     def _verify_args(self) -> Self:
         if (
             self.max_num_batched_tokens < self.max_model_len
-            and not self.chunked_prefill_enabled
+            and not self.enable_chunked_prefill
         ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
@@ -271,7 +276,7 @@ def _verify_args(self) -> Self:
             )

         if self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
+            if not self.enable_chunked_prefill:
                 raise ValueError(
                     "Chunked prefill must be enabled to set "
                     "max_num_partial_prefills > 1."
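The renamed flag stays readable under its old name through typing_extensions.deprecated (PEP 702). Below is a minimal, self-contained sketch of the pattern (illustrative class and message, not the vLLM SchedulerConfig itself; assumes typing_extensions >= 4.5): reading the old attribute delegates to the new field and emits a DeprecationWarning at runtime.

import warnings

from typing_extensions import deprecated


class ExampleConfig:
    """Illustrative stand-in for a config object with a renamed flag."""

    def __init__(self, enable_chunked_prefill: bool = True) -> None:
        self.enable_chunked_prefill = enable_chunked_prefill

    @property
    @deprecated("`chunked_prefill_enabled` has been renamed to `enable_chunked_prefill`.")
    def chunked_prefill_enabled(self) -> bool:
        # Old spelling delegates to the new field, so existing callers keep working.
        return self.enable_chunked_prefill


cfg = ExampleConfig()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert cfg.chunked_prefill_enabled is True  # old name still readable
assert any(issubclass(w.category, DeprecationWarning) for w in caught)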
6 changes: 3 additions & 3 deletions vllm/config/vllm.py
@@ -411,7 +411,7 @@ def __post_init__(self):

         if (
             self.model_config is not None
-            and self.scheduler_config.chunked_prefill_enabled
+            and self.scheduler_config.enable_chunked_prefill
             and self.model_config.dtype == torch.float32
             and current_platform.get_device_capability() == (7, 5)
         ):
@@ -584,7 +584,7 @@ def __post_init__(self):
         ):
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
-            self.scheduler_config.chunked_prefill_enabled = False
+            self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.long_prefill_token_threshold = 0

         if self.cache_config is not None:
@@ -1026,7 +1026,7 @@ def __str__(self):
             f"seed={self.model_config.seed}, "
             f"served_model_name={self.model_config.served_model_name}, "
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
-            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
+            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, "  # noqa
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}"
         )
2 changes: 1 addition & 1 deletion vllm/platforms/cpu.py
@@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:

         scheduler_config = vllm_config.scheduler_config
         if (
-            scheduler_config.chunked_prefill_enabled
+            scheduler_config.enable_chunked_prefill
             or cache_config.enable_prefix_caching
         ) and cache_config.cache_dtype != "auto":
             raise RuntimeError(
2 changes: 1 addition & 1 deletion vllm/v1/core/sched/scheduler.py
@@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput:
                 # chunked prefill has to be enabled explicitly to allow
                 # pooling requests to be chunked
                 if (
-                    not self.scheduler_config.chunked_prefill_enabled
+                    not self.scheduler_config.enable_chunked_prefill
                     and num_new_tokens > token_budget
                 ):
                     self.waiting.pop_request()
2 changes: 1 addition & 1 deletion vllm/v1/engine/core.py
@@ -124,7 +124,7 @@ def __init__(
             # Encoder models without KV cache don't support
             # chunked prefill. But do SSM models?
             logger.info("Disabling chunked prefill for model without KVCache")
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.enable_chunked_prefill = False

         scheduler_block_size = (
             vllm_config.cache_config.block_size
4 changes: 2 additions & 2 deletions vllm/v1/worker/gpu_model_runner.py
@@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]:

         supported_tasks = list(model.pooler.get_supported_tasks())

-        if self.scheduler_config.chunked_prefill_enabled:
+        if self.scheduler_config.enable_chunked_prefill:
             if "token_embed" in supported_tasks:
                 supported_tasks.remove("token_embed")
             if "token_classify" in supported_tasks:
@@ -3825,7 +3825,7 @@ def _dummy_pooler_run(
         supported_pooling_tasks = self.get_supported_pooling_tasks()

         if not supported_pooling_tasks:
-            if self.scheduler_config.chunked_prefill_enabled:
+            if self.scheduler_config.enable_chunked_prefill:
                 raise RuntimeError(
                     f"Model {self.model_config.model} does not support "
                     "any pooling tasks with chunked prefill enabled. "