tests/v1/core/test_scheduler.py — 1 change: 0 additions & 1 deletion
@@ -2282,7 +2282,6 @@ def _validate_chunked_prefill_settings_for_encoder_decoder(
 ) -> None:
     """Validate chunked prefill settings in the scheduler config for
     encoder-decoder models."""
-    assert scheduler_config.chunked_prefill_enabled is expect_enabled
     assert scheduler_config.enable_chunked_prefill is expect_enabled
     if is_encoder_decoder:
         # Encoder-decoder models should automatically disable chunked multimodal
tests/v1/e2e/test_spec_decode.py — 10 changes: 4 additions & 6 deletions
@@ -272,7 +272,7 @@ def test_speculators_model_integration(
 
 
 @pytest.mark.parametrize(
-    ["model_setup", "mm_enabled", "chunked_prefill_enabled"],
+    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
     [
         (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False),
         pytest.param(
@@ -358,7 +358,7 @@ def test_eagle_correctness(
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
-    chunked_prefill_enabled: bool,
+    enable_chunked_prefill: bool,
     attn_backend: str,
 ):
     if attn_backend == "TREE_ATTN":
@@ -396,9 +396,7 @@ def test_eagle_correctness(
 
     method, model_name, spec_model_name, tp_size = model_setup
     max_model_len = 2048
-    max_num_batched_tokens = max_model_len
-    if chunked_prefill_enabled:
-        max_num_batched_tokens = 128
+    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
 
     ref_llm = LLM(
         model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
@@ -420,7 +418,7 @@ def test_eagle_correctness(
         },
         max_model_len=max_model_len,
         max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=chunked_prefill_enabled,
+        enable_chunked_prefill=enable_chunked_prefill,
     )
     spec_outputs = spec_llm.chat(test_prompts, sampling_config)
     matches = 0
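The parametrize rename and the signature rename above have to land together, because pytest binds each name in argnames to the test parameter with the same name. A minimal standalone sketch of that pattern (hypothetical test and placeholder model names, not from this suite):

import pytest


# pytest matches each entry in argnames to a function parameter of the same
# name, so "enable_chunked_prefill" must appear in both places.
@pytest.mark.parametrize(
    ["model_setup", "mm_enabled", "enable_chunked_prefill"],
    [
        (("eagle3", "dummy/model", "dummy/drafter", 1), False, False),
        (("eagle3", "dummy/model", "dummy/drafter", 1), False, True),
    ],
)
def test_token_budget_selection(
    model_setup: tuple[str, str, str, int],
    mm_enabled: bool,
    enable_chunked_prefill: bool,
):
    max_model_len = 2048
    # With chunked prefill on, a small budget (128) forces prefills to be
    # chunked; otherwise the budget matches the full model length.
    max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len
    assert max_num_batched_tokens == (128 if enable_chunked_prefill else 2048)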
tests/v1/engine/test_engine_core.py — 2 changes: 1 addition & 1 deletion
@@ -571,7 +571,7 @@ def test_encoder_instance_zero_kv_cache(
     )
 
     # Check 5: Verify chunked prefill is disabled
-    assert not vllm_config.scheduler_config.chunked_prefill_enabled, (
+    assert not vllm_config.scheduler_config.enable_chunked_prefill, (
         "Encoder instance should disable chunked prefill (no KV cache)"
     )
 
vllm/config/scheduler.py — 12 changes: 2 additions & 10 deletions
@@ -232,19 +232,11 @@ def __post_init__(self, is_encoder_decoder: bool) -> None:
                 self.long_prefill_token_threshold,
             )
 
-    @property
-    def chunked_prefill_enabled(self) -> bool:
-        return self.enable_chunked_prefill
-
-    @chunked_prefill_enabled.setter
-    def chunked_prefill_enabled(self, value: bool):
-        self.enable_chunked_prefill = value
-
     @model_validator(mode="after")
     def _verify_args(self) -> Self:
         if (
             self.max_num_batched_tokens < self.max_model_len
-            and not self.chunked_prefill_enabled
+            and not self.enable_chunked_prefill
         ):
             raise ValueError(
                 f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
@@ -271,7 +263,7 @@ def _verify_args(self) -> Self:
             )
 
         if self.max_num_partial_prefills > 1:
-            if not self.chunked_prefill_enabled:
+            if not self.enable_chunked_prefill:
                 raise ValueError(
                     "Chunked prefill must be enabled to set "
                     "max_num_partial_prefills > 1."
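The lines removed above were a plain read/write alias delegating to enable_chunked_prefill. An out-of-tree integration that still spells the old name could keep a shim of the same shape on its own wrapper; a minimal sketch under that assumption (hypothetical MySchedulerConfig, and the DeprecationWarning is an added illustration, not vLLM behavior):

from dataclasses import dataclass
import warnings


@dataclass
class MySchedulerConfig:  # hypothetical stand-in, not vllm.config.SchedulerConfig
    enable_chunked_prefill: bool = False

    # Same read/write delegation pattern as the removed vLLM alias.
    @property
    def chunked_prefill_enabled(self) -> bool:
        warnings.warn(
            "chunked_prefill_enabled is deprecated; use enable_chunked_prefill",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.enable_chunked_prefill

    @chunked_prefill_enabled.setter
    def chunked_prefill_enabled(self, value: bool) -> None:
        self.enable_chunked_prefill = value


cfg = MySchedulerConfig()
cfg.chunked_prefill_enabled = True  # still lands on the canonical field
assert cfg.enable_chunked_prefill is True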
vllm/config/vllm.py — 6 changes: 3 additions & 3 deletions
@@ -411,7 +411,7 @@ def __post_init__(self):
 
         if (
             self.model_config is not None
-            and self.scheduler_config.chunked_prefill_enabled
+            and self.scheduler_config.enable_chunked_prefill
             and self.model_config.dtype == torch.float32
             and current_platform.get_device_capability() == (7, 5)
         ):
@@ -584,7 +584,7 @@ def __post_init__(self):
         ):
             for reason in disable_chunked_prefill_reasons:
                 logger.info(reason)
-            self.scheduler_config.chunked_prefill_enabled = False
+            self.scheduler_config.enable_chunked_prefill = False
             self.scheduler_config.long_prefill_token_threshold = 0
 
         if self.cache_config is not None:
@@ -1026,7 +1026,7 @@ def __str__(self):
             f"seed={self.model_config.seed}, "
             f"served_model_name={self.model_config.served_model_name}, "
             f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
-            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
+            f"enable_chunked_prefill={self.scheduler_config.enable_chunked_prefill}, " # noqa
             f"pooler_config={self.model_config.pooler_config!r}, "
             f"compilation_config={self.compilation_config!r}"
         )
vllm/platforms/cpu.py — 2 changes: 1 addition & 1 deletion
@@ -192,7 +192,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         scheduler_config = vllm_config.scheduler_config
         if (
-            scheduler_config.chunked_prefill_enabled
+            scheduler_config.enable_chunked_prefill
             or cache_config.enable_prefix_caching
         ) and cache_config.cache_dtype != "auto":
             raise RuntimeError(
vllm/v1/core/sched/scheduler.py — 2 changes: 1 addition & 1 deletion
@@ -497,7 +497,7 @@ def schedule(self) -> SchedulerOutput:
                 # chunked prefill has to be enabled explicitly to allow
                 # pooling requests to be chunked
                 if (
-                    not self.scheduler_config.chunked_prefill_enabled
+                    not self.scheduler_config.enable_chunked_prefill
                     and num_new_tokens > token_budget
                 ):
                     self.waiting.pop_request()
vllm/v1/engine/core.py — 2 changes: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ def __init__(
             # Encoder models without KV cache don't support
            # chunked prefill. But do SSM models?
             logger.info("Disabling chunked prefill for model without KVCache")
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.enable_chunked_prefill = False
 
         scheduler_block_size = (
             vllm_config.cache_config.block_size
vllm/v1/worker/gpu_model_runner.py — 4 changes: 2 additions & 2 deletions
@@ -2031,7 +2031,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]:
 
         supported_tasks = list(model.pooler.get_supported_tasks())
 
-        if self.scheduler_config.chunked_prefill_enabled:
+        if self.scheduler_config.enable_chunked_prefill:
             if "token_embed" in supported_tasks:
                 supported_tasks.remove("token_embed")
             if "token_classify" in supported_tasks:
@@ -3825,7 +3825,7 @@ def _dummy_pooler_run(
         supported_pooling_tasks = self.get_supported_pooling_tasks()
 
         if not supported_pooling_tasks:
-            if self.scheduler_config.chunked_prefill_enabled:
+            if self.scheduler_config.enable_chunked_prefill:
                 raise RuntimeError(
                     f"Model {self.model_config.model} does not support "
                     "any pooling tasks with chunked prefill enabled. "
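The first model-runner hunk drops token-level pooling tasks whenever chunked prefill is enabled; a minimal standalone sketch of that filtering rule (hypothetical helper, not the vLLM method):

def filter_pooling_tasks(
    supported_tasks: list[str], enable_chunked_prefill: bool
) -> list[str]:
    """Drop token-level pooling tasks when chunked prefill is enabled."""
    if not enable_chunked_prefill:
        return list(supported_tasks)
    return [t for t in supported_tasks if t not in ("token_embed", "token_classify")]


# With chunked prefill enabled, only sequence-level tasks remain.
assert filter_pooling_tasks(["embed", "token_embed", "token_classify"], True) == ["embed"]
assert filter_pooling_tasks(["embed", "token_embed"], False) == ["embed", "token_embed"]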