From 739840af0533095d026d6525d2bbe8b47cdb24a1 Mon Sep 17 00:00:00 2001
From: rainyfly <1435317881@qq.com>
Date: Wed, 15 Apr 2026 11:59:40 +0800
Subject: [PATCH] Optimize scheduler for chunk prefill

---
 .../engine/sched/resource_manager_v1.py       | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 71b2d1711e2..741728efd07 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -587,6 +587,7 @@ def schedule(self):
         preempted_reqs: list[Request] = []
         error_reqs: list[tuple[str, str]] = []
         token_budget = self.config.scheduler_config.max_num_batched_tokens
+        chunk_prefill_in_running_not_satisfied = False

         # First, schedule the RUNNING requests.
         req_index = 0
@@ -694,18 +695,17 @@ def _allocate_decode_and_extend():
                 )
                 num_new_tokens = self._get_num_new_tokens(request, token_budget)
                 num_new_block = self.get_new_block_nums(request, num_new_tokens)
+                can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(num_new_block)
                 # Allocate blocks to prefill
-                if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
-                    request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
-                    # Prepare prefill task
-                    scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
-                else:  # Not enough blocks to allocate, trigger preemption
-                    can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, scheduled_reqs)
-                    if not can_schedule:
-                        break
-                    request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(num_new_block))
+                if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold):
+                    request.block_tables.extend(
+                        self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id)
+                    )
                     # Prepare prefill task
                     scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens))
+                else:  # Not enough blocks to allocate
+                    chunk_prefill_in_running_not_satisfied = True
+                    break  # For chunk prefill request, if not satisfy condition for prefill, just break
                 token_budget -= num_new_tokens
                 request.num_computed_tokens += num_new_tokens
                 if self.config.cache_config.enable_prefix_caching:
@@ -713,8 +713,9 @@ def _allocate_decode_and_extend():
                         request, self.config.cache_config.block_size, request.num_computed_tokens
                     )
                 req_index += 1
-        # schedule the WAITING requests.
-        if not preempted_reqs:
+
+        # Second, schedule the WAITING requests.
+        if (not preempted_reqs) and (not chunk_prefill_in_running_not_satisfied):
             skip_requests: list[Request] = []
             while self.waiting and token_budget > 0:
                 if len(self.running) == self.max_num_seqs: