-
Notifications
You must be signed in to change notification settings - Fork 740
[Scheduler] [Optimization] Only preempt decode requests and better manage reserved blocks in scheduler #7444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -216,18 +216,12 @@ def __init__(self, max_num_seqs, config, tensor_parallel_size, splitwise_role, l | |
| self.bos_client = None | ||
| self.async_preprocess_pool = ThreadPoolExecutor(max_workers=4) | ||
|
|
||
| self.init_reserve_output_block_num = ( | ||
| envs.FD_RESERVE_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL | ||
| ) # int | ||
| self.decay_output_block_num = ( | ||
| envs.FD_RESERVE_DECAY_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL | ||
| ) # float | ||
| self.min_reserve_output_block_num = ( | ||
| envs.FD_RESERVE_MIN_OUTPUT_BLOCK_NUM_FOR_DECODE_WHEN_SCHEDULE_NEW_PREFILL | ||
| ) # int | ||
| self.current_reserve_output_block_num = self.init_reserve_output_block_num | ||
| self.current_reserve_output_block_num_float = self.init_reserve_output_block_num | ||
| self.can_relax_prefill_strategy = True | ||
| self.init_new_token_ratio = envs.FD_INIT_NEW_TOKEN_RATIO | ||
| self.min_new_token_ratio = envs.FD_MIN_NEW_TOKEN_RATIO | ||
| self.new_token_ratio_decay = envs.FD_NEW_TOKEN_RATIO_DECAY | ||
| self.clip_max_new_tokens = envs.FD_CLIP_MAX_NEW_TOKENS | ||
| self.retract_decode_steps = envs.FD_RETRACT_DECODE_STEPS | ||
| self.new_token_ratio = self.init_new_token_ratio | ||
| # Scheduler-side requests that have not been moved into resource manager waiting queue yet. | ||
| self.scheduler_unhandled_request_num = 0 | ||
|
|
||
|
|
@@ -313,6 +307,15 @@ def _can_preempt(self): | |
| return True | ||
| return False | ||
|
|
||
| def _can_preempt_with_decode_task(self): | ||
| """ | ||
| A request is preemptable if it does NOT use extend tables AND is in decode status. | ||
| """ | ||
| for req in self.running: | ||
| if not req.use_extend_tables and req.status == RequestStatus.RUNNING_DECODE: | ||
| return True | ||
| return False | ||
|
|
||
| def preempted_all(self): | ||
| with self.lock: | ||
| preempted_reqs = [] | ||
|
|
@@ -350,14 +353,38 @@ def wait_worker_inflight_requests_finish(self, timeout=60): | |
| def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs): | ||
| """ | ||
| If the request cannot be scheduled, preempt the running requests one by one until it can be scheduled. Last in, first out. | ||
| Only requests that are in decode status can be preempted. | ||
| """ | ||
| can_schedule = False | ||
| while self._can_preempt(): | ||
| if not self.cache_manager.can_allocate_gpu_blocks(num_new_blocks): | ||
| preempted_req = self.running.pop() | ||
| if preempted_req.use_extend_tables: | ||
| self.running.insert(0, preempted_req) | ||
| continue | ||
| while self._can_preempt_with_decode_task(): | ||
| if self.cache_manager.can_allocate_gpu_blocks(num_new_blocks): | ||
| # The request can be scheduled. | ||
| can_schedule = True | ||
| break | ||
| else: | ||
| # Scan from back to front to find the last preemptable request | ||
| preempted_req = None | ||
| i = len(self.running) - 1 | ||
| while i >= 0: | ||
| candidate = self.running[i] | ||
| # Skip requests that are not in decode status | ||
| if candidate.status != RequestStatus.RUNNING_DECODE: | ||
| i -= 1 | ||
| continue | ||
| # Skip requests using extend tables | ||
| if candidate.use_extend_tables: | ||
| i -= 1 | ||
| continue | ||
| # Found a valid preempt target | ||
| preempted_req = candidate | ||
| break | ||
|
|
||
| if preempted_req is None: | ||
| # No preemptable request found (all have no output tokens or use extend tables) | ||
| return False | ||
|
|
||
| # Remove the preempted request from the running list | ||
| self.running.pop(i) | ||
| preempted_req.status = RequestStatus.PREEMPTED | ||
| preempted_req.num_computed_tokens = 0 | ||
| if self.config.scheduler_config.splitwise_role == "decode": | ||
|
|
@@ -389,33 +416,78 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re | |
| llm_logger.debug( | ||
| f"preempt {preempted_req.request_id} in idx {preempted_req.idx} with generated ids {preempted_req.output_token_ids}" | ||
| ) | ||
|
|
||
| llm_logger.debug(self.info()) | ||
| self._info_each_block() | ||
| self._recompute_new_token_ratio_on_preemption() | ||
|
|
||
| if preempted_req == request: | ||
| # No more request to preempt. | ||
| can_schedule = False | ||
| break | ||
| else: | ||
| # The request can be scheduled. | ||
| can_schedule = True | ||
| break | ||
| self.current_reserve_output_block_num = self.init_reserve_output_block_num | ||
| self.current_reserve_output_block_num_float = self.init_reserve_output_block_num | ||
| self.can_relax_prefill_strategy = False | ||
|
|
||
| return can_schedule | ||
|
|
||
| def _recompute_new_token_ratio_on_preemption(self): | ||
| """Recompute new_token_ratio based on actual decode progress of running requests. | ||
|
|
||
| Aligned with SGLang's retract_decode logic: estimate the ratio as the actual | ||
| fraction of max_tokens already decoded plus a small lookahead, rather than | ||
| naively resetting to the initial value. This avoids over-reserving when most | ||
| requests are near completion, and under-reserving when they've just started. | ||
|
|
||
| Formula: | ||
| ratio = (total_decoded + RETRACT_DECODE_STEPS * num_running) / (total_max_new_tokens + 1) | ||
| capped at init_new_token_ratio so preemption never makes the ratio more | ||
| aggressive than the initial setting. | ||
| """ | ||
| if not self.running: | ||
| self.new_token_ratio = self.init_new_token_ratio | ||
| return | ||
| total_decoded_tokens = sum(len(req.output_token_ids) for req in self.running) | ||
| total_max_new_tokens = 0 | ||
| for req in self.running: | ||
| max_tokens = req.sampling_params.max_tokens | ||
| if max_tokens is None: | ||
| max_tokens = self.config.model_config.max_model_len - req.prompt_token_ids_len | ||
| total_max_new_tokens += max_tokens | ||
| num_running_decode = sum([1 if req.num_total_tokens > req.need_prefill_tokens else 0 for req in self.running]) | ||
| new_ratio = (total_decoded_tokens + self.retract_decode_steps * num_running_decode) / ( | ||
| total_max_new_tokens + 1 | ||
| ) | ||
| self.new_token_ratio = min(new_ratio, self.init_new_token_ratio) | ||
|
|
||
| def _get_running_request_reserve_blocks(self, request: Request) -> int: | ||
| """Estimate KV-cache blocks to reserve for a running request's future decode tokens. | ||
|
|
||
| Aligned with SGLang's per-request budget estimation: | ||
| reserved_tokens = min(max_tokens - already_generated, CLIP_MAX_NEW_TOKENS) * new_token_ratio | ||
| then ceil-divided by block_size. The ratio decays each scheduling step so that | ||
| the reservation gradually relaxes; on preemption it resets to the initial value. | ||
| """ | ||
| max_tokens = request.sampling_params.max_tokens | ||
| if max_tokens is None: | ||
| max_tokens = self.config.model_config.max_model_len - request.prompt_token_ids_len | ||
| remaining_tokens = max_tokens - len(request.output_token_ids) | ||
| clipped_remaining = min(remaining_tokens, self.clip_max_new_tokens) | ||
| reserved_tokens = max(int(clipped_remaining * self.new_token_ratio), 0) | ||
| block_size = self.config.cache_config.block_size | ||
| return (reserved_tokens + block_size - 1) // block_size | ||
|
|
||
| def _get_can_schedule_prefill_threshold_block(self, num_chunk_new_block): | ||
| if self.can_relax_prefill_strategy: | ||
| can_schedule_block_num_threshold = num_chunk_new_block | ||
| else: | ||
| can_schedule_block_num_threshold = ( | ||
| num_chunk_new_block + len(self.running) * self.current_reserve_output_block_num | ||
| """Compute the minimum free blocks required to admit a new prefill request. | ||
|
|
||
| The threshold includes: (1) blocks needed for the prefill itself, and | ||
| (2) blocks reserved for all running decode requests' future output tokens, | ||
| estimated per-request via _get_running_request_reserve_blocks. This prevents | ||
| new prefills from starving ongoing decodes of KV-cache capacity. | ||
| """ | ||
| reserve_blocks = sum(self._get_running_request_reserve_blocks(req) for req in self.running) | ||
| can_schedule_block_num_threshold = num_chunk_new_block + reserve_blocks | ||
| if self.config.speculative_config.method is not None: | ||
| can_schedule_block_num_threshold = min( | ||
| can_schedule_block_num_threshold + 1, self.config.cache_config.max_block_num_per_seq | ||
| ) | ||
| if self.config.speculative_config.method is not None: | ||
| can_schedule_block_num_threshold = min( | ||
| can_schedule_block_num_threshold + 1, self.config.cache_config.max_block_num_per_seq | ||
| ) | ||
| return can_schedule_block_num_threshold | ||
|
|
||
| def _update_mm_hashes(self, request): | ||
|
|
@@ -770,6 +842,7 @@ def get_enough_request(request, scheduled_reqs): | |
| error_reqs: list[tuple[str, str]] = [] | ||
| token_budget = self.config.scheduler_config.max_num_batched_tokens | ||
| need_abort_requests = [] # users trigger abortion | ||
| chunk_prefill_in_running_not_satisfied = False | ||
|
|
||
| # First, schedule the RUNNING requests. | ||
| req_index = 0 | ||
|
|
@@ -906,22 +979,17 @@ def _allocate_decode_and_extend(): | |
| req_index += 1 | ||
| continue | ||
| num_new_block = self.get_new_block_nums(request, num_new_tokens) | ||
| can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(num_new_block) | ||
| # Allocate blocks to prefill | ||
| if self.cache_manager.can_allocate_gpu_blocks(num_new_block): | ||
| request.block_tables.extend( | ||
| self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) | ||
| ) | ||
| # Prepare prefill task | ||
| scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) | ||
| else: # Not enough blocks to allocate, trigger preemption | ||
| can_schedule = self._trigger_preempt(request, num_new_block, preempted_reqs, scheduled_reqs) | ||
| if not can_schedule: | ||
| break | ||
| if self.cache_manager.can_allocate_gpu_blocks(can_schedule_block_num_threshold): | ||
| request.block_tables.extend( | ||
| self.cache_manager.allocate_gpu_blocks(num_new_block, request.request_id) | ||
| ) | ||
| # Prepare prefill task | ||
| scheduled_reqs.append(self._prepare_prefill_task(request, num_new_tokens)) | ||
| else: # Not enough blocks to allocate | ||
| chunk_prefill_in_running_not_satisfied = True | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 chunk prefill 分配失败后阻塞全部 waiting 请求——行为变更较大,请确认是否符合预期 之前此处的逻辑是:chunk prefill 分配不足时触发 这在高负载场景下可能导致 waiting 队列饥饿。请确认:
|
||
| break # For chunk prefill request, if not satisfy condition for prefill, just break | ||
| token_budget -= num_new_tokens | ||
| request.num_computed_tokens += num_new_tokens | ||
| if ( | ||
|
|
@@ -939,7 +1007,7 @@ def _allocate_decode_and_extend(): | |
| self.running.remove(request) | ||
|
|
||
| # Second, schedule the WAITING requests. | ||
| if not preempted_reqs: | ||
| if (not preempted_reqs) and (not chunk_prefill_in_running_not_satisfied): | ||
| skip_requests: list[Request] = [] | ||
| while self.waiting and token_budget > 0: | ||
| if ( | ||
|
|
@@ -1025,7 +1093,7 @@ def _allocate_decode_and_extend(): | |
| self.cache_manager.update_cache_blocks( | ||
| request, self.config.cache_config.block_size, request.num_computed_tokens | ||
| ) | ||
| request.status = RequestStatus.RUNNING | ||
| request.status = RequestStatus.RUNNING_PREFILL | ||
| if self.config.scheduler_config.splitwise_role == "mixed": | ||
| allocated_position = self.get_available_position() | ||
| request.idx = allocated_position | ||
|
|
@@ -1094,7 +1162,7 @@ def _allocate_decode_and_extend(): | |
| self.cache_manager.update_cache_blocks( | ||
| request, self.config.cache_config.block_size, request.num_computed_tokens | ||
| ) | ||
| request.status = RequestStatus.RUNNING | ||
| request.status = RequestStatus.RUNNING_PREFILL | ||
| else: | ||
| if self.config.cache_config.enable_prefix_caching: | ||
| self._free_blocks(request) | ||
|
|
@@ -1108,14 +1176,10 @@ def _allocate_decode_and_extend(): | |
|
|
||
| if scheduled_reqs: | ||
| llm_logger.debug(f"schedued_reqs: {scheduled_reqs}") | ||
| self.current_reserve_output_block_num_float -= self.decay_output_block_num | ||
| self.current_reserve_output_block_num = max( | ||
| int(self.current_reserve_output_block_num_float), | ||
| self.min_reserve_output_block_num, | ||
| 0, | ||
| self.new_token_ratio = max( | ||
| self.new_token_ratio - self.new_token_ratio_decay, | ||
| self.min_new_token_ratio, | ||
| ) | ||
| if self.current_reserve_output_block_num == 0: | ||
| self.can_relax_prefill_strategy = True | ||
|
|
||
| self._log_console_scheduler_metrics(scheduled_reqs) | ||
|
|
||
|
|
@@ -1334,6 +1398,7 @@ def pre_recycle_resource(self, request_id: str): | |
| def add_request_in_p(self, requests: list[Request]): | ||
| with self.lock: | ||
| for request in requests: | ||
| request.status = RequestStatus.RUNNING_PREFILL | ||
| self.running.append(request) | ||
|
|
||
| def preallocate_resource_in_p(self, request: Request): | ||
|
|
@@ -1467,6 +1532,7 @@ def add_prefilled_request(self, request_output: RequestOutput): | |
| ): | ||
| request.draft_token_ids = copy.deepcopy(request_output.outputs.draft_token_ids) | ||
| request.need_prefill_tokens = len(request.prompt_token_ids) + 1 | ||
| request.status = RequestStatus.RUNNING_DECODE | ||
|
|
||
| request_output.metrics.decode_recv_req_time = request.metrics.decode_recv_req_time | ||
| request_output.metrics.decode_preallocate_req_time = request.metrics.decode_preallocate_req_time | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,6 +45,7 @@ einops | |
| setproctitle | ||
| aistudio_sdk | ||
| p2pstore | ||
| mooncake-transfer-engine>=0.3.10.post1 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❓ 疑问 本 PR 标题为 Scheduler Optimization,但此处新增了 |
||
| py-cpuinfo | ||
| flashinfer-python-paddle @ https://xly-devops.bj.bcebos.com/flashinfer/flashinfer_python_paddle-0.4.1.2-py3-none-any.whl | ||
| flash_mask @ https://xly-devops.bj.bcebos.com/flashmask/flash_mask-4.0.0%2Bg4c84f74-py3-none-any.whl | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -650,7 +650,7 @@ def test_schedule_decode_and_waiting_prefill(self): | |
|
|
||
| decode_request = _make_request(request_id="req-decode", prompt_token_ids=[1, 2]) | ||
| decode_request.idx = 0 | ||
| decode_request.status = RequestStatus.RUNNING | ||
| decode_request.status = RequestStatus.RUNNING_DECODE | ||
| decode_request.num_computed_tokens = 2 | ||
| decode_request.output_token_ids = [99] | ||
| decode_request.block_tables = [1] | ||
|
|
@@ -665,30 +665,7 @@ def test_schedule_decode_and_waiting_prefill(self): | |
| self.assertGreaterEqual(len(scheduled_reqs), 2) | ||
| self.assertEqual(error_reqs, []) | ||
| self.assertIn(decode_request.request_id, manager.using_extend_tables_req_id) | ||
| self.assertEqual(waiting_request.status, RequestStatus.RUNNING) | ||
|
|
||
| def test_trigger_preempt_records_tasks(self): | ||
| manager = _build_manager() | ||
| _register_manager_cleanup(self, manager) | ||
| manager.cache_manager = MagicMock() | ||
| manager.cache_manager.num_gpu_blocks = 8 | ||
| manager.cache_manager.gpu_free_block_list = list(range(8)) | ||
| manager.cache_manager.can_allocate_gpu_blocks.side_effect = [False, True] | ||
| manager._free_blocks = MagicMock() | ||
| preempted_req = _make_request(request_id="req-preempted") | ||
| preempted_req.idx = 0 | ||
| preempted_req.use_extend_tables = False | ||
| request = _make_request(request_id="req-target") | ||
| request.idx = 1 | ||
| manager.running = [request, preempted_req] | ||
|
|
||
| preempted_reqs = [] | ||
| scheduled_reqs = [] | ||
| can_schedule = manager._trigger_preempt(request, 2, preempted_reqs, scheduled_reqs) | ||
| self.assertTrue(can_schedule) | ||
| self.assertIn(preempted_req.request_id, manager.to_be_rescheduled_request_id_set) | ||
| self.assertEqual(preempted_reqs[0], preempted_req) | ||
| self.assertEqual(scheduled_reqs[0].request_id, preempted_req.request_id) | ||
| self.assertEqual(waiting_request.status, RequestStatus.RUNNING_PREFILL) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 建议 新的抢占逻辑缺少单元测试覆盖
|
||
|
|
||
| def test_available_position_and_real_bsz(self): | ||
| manager = _build_manager() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 建议
_can_preempt方法已成为死代码_trigger_preempt已从self._can_preempt()切换到self._can_preempt_with_decode_task(),经全仓搜索确认_can_preempt不再有任何调用方。建议删除该方法,避免后续维护者误用不带 decode 状态过滤的旧版本。