diff --git a/vllm/config.py b/vllm/config.py
index 28ff22323b13..67df843d34d4 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1686,9 +1686,6 @@ def _verify_args(self) -> None:
                 and not self.use_padding_aware_scheduling:
             raise ValueError("max_num_prefill_seqs can be only "
                              "used with padding-aware-scheduling. ")
-        if self.use_padding_aware_scheduling and self.chunked_prefill_enabled:
-            raise ValueError("Padding-aware scheduling currently "
-                             "does not work with chunked prefill ")
 
     @property
     def is_multi_step(self) -> bool:
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index e1cc14a475ff..26ffc9693135 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1399,10 +1399,17 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
         inter token latency because decodes requests don't need to be blocked
         by prefill requests.
         """
-        budget = SchedulingBudget(
-            token_budget=self.scheduler_config.max_num_batched_tokens,
-            max_num_seqs=self.scheduler_config.max_num_seqs,
-        )
+        if self.scheduler_config.use_padding_aware_scheduling:
+            budget = PaddingAwareSchedulingBudget(
+                token_budget=self.scheduler_config.max_num_batched_tokens,
+                max_num_seqs=self.scheduler_config.max_num_seqs,
+                max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs
+            )
+        else:
+            budget = SchedulingBudget(
+                token_budget=self.scheduler_config.max_num_batched_tokens,
+                max_num_seqs=self.scheduler_config.max_num_seqs,
+            )
         curr_loras: Set[int] = set()
         prefills = SchedulerPrefillOutputs.create_empty()