
Commit baa60de

fix finding bucket for context length (#2118)
- Works with HabanaAI/vllm-hpu-extension#385 to enable a padding ratio limit for context-length bucketing, reducing the number of buckets.
- Truncate the context length based on the bucketing in the APC block manager.
- Add an assertion that `max_num_prefill_seqs == 1` when APC is enabled.

Signed-off-by: Youlei Yang <[email protected]>
1 parent 3bd00ce commit baa60de

3 files changed (+39, -5 lines)
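The padding ratio limit itself lives in HabanaAI/vllm-hpu-extension#385 and is not part of this diff. As a rough sketch of the idea only: if consecutive context-length buckets are allowed to grow by at most a configured ratio, padding a context up to the next bucket wastes a bounded fraction of it while the total bucket count stays small. The names below (make_ctx_block_buckets, max_padding_ratio) are made up for illustration and are not the extension's API.

import math

def make_ctx_block_buckets(max_blocks: int,
                           max_padding_ratio: float = 0.25) -> list[int]:
    # Illustrative only: consecutive buckets grow by at most
    # max_padding_ratio, so padding a context length up to the next
    # bucket wastes a bounded fraction of it, while the bucket count
    # stays roughly logarithmic in max_blocks.
    buckets = [1]
    while buckets[-1] < max_blocks:
        nxt = max(buckets[-1] + 1,
                  math.floor(buckets[-1] * (1 + max_padding_ratio)))
        buckets.append(min(nxt, max_blocks))
    return buckets

# 128 possible context-block counts collapse to 22 buckets at a 25% limit.
print(len(make_ctx_block_buckets(128)), make_ctx_block_buckets(128)[:8])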

vllm/config.py

Lines changed: 8 additions & 0 deletions

@@ -4586,6 +4586,14 @@ def __post_init__(self):
                 "but the scheduler is configured to publish them."
                 "Modify KVEventsConfig.enable_kv_cache_events"
                 "to True to enable.")
+        if (current_platform.is_hpu()
+                and self.cache_config.enable_prefix_caching
+                and self.scheduler_config.max_num_prefill_seqs is not None
+                and self.scheduler_config.max_num_prefill_seqs > 1):
+            logger.warning(
+                "Prefix caching with bs > 1 is not supported on HPU."
+                " Setting max_num_prefill_seqs to 1.")
+            self.scheduler_config.max_num_prefill_seqs = 1
         current_platform.check_and_update_config(self)
 
         if not self.instance_id:
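For readability, the new guard can also be read as a standalone rule; the sketch below mirrors the added __post_init__ branch (the helper name clamp_prefill_seqs_for_apc is made up for illustration).

import logging

logger = logging.getLogger("vllm.config")

def clamp_prefill_seqs_for_apc(is_hpu: bool,
                               enable_prefix_caching: bool,
                               max_num_prefill_seqs: int | None) -> int | None:
    # Mirrors the added __post_init__ branch: with automatic prefix
    # caching on HPU, a prefill batch size above 1 is clamped to 1.
    if (is_hpu and enable_prefix_caching
            and max_num_prefill_seqs is not None
            and max_num_prefill_seqs > 1):
        logger.warning("Prefix caching with bs > 1 is not supported on HPU."
                       " Setting max_num_prefill_seqs to 1.")
        return 1
    return max_num_prefill_seqs

assert clamp_prefill_seqs_for_apc(True, True, 4) == 1
assert clamp_prefill_seqs_for_apc(True, False, 4) == 4
assert clamp_prefill_seqs_for_apc(False, True, 4) == 4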

vllm/core/block/prefix_caching_block.py

Lines changed: 14 additions & 0 deletions

@@ -15,6 +15,7 @@
                                     NaiveBlockAllocator)
 from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import Sequence
 
 PrefixHash = int

@@ -1075,8 +1076,21 @@ def get_num_cached_tokens(self, seq: Sequence) -> int:
         # This is O(logN), where N is the number of blocks.
         num_cached_blocks = len(
             self._allocator.find_cached_blocks_prefix(block_hashes))
+        if current_platform.is_hpu(
+        ) and num_cached_blocks > 0 and seq.is_prefill():
+            from vllm_hpu_extension.bucketing.common import (
+                get_bucketing_manager)
+            hpu_bucketing_manager = get_bucketing_manager()
+            seq_len = seq.get_len() - num_cached_blocks * self._block_size
+            _, _, bkt_cached_blocks = hpu_bucketing_manager.find_prompt_bucket(
+                1, seq_len, num_cached_blocks, False)
+            logger.debug("HPU bucketing adjusted cached blocks from %d to %d",
+                         num_cached_blocks, bkt_cached_blocks)
+            num_cached_blocks = bkt_cached_blocks
         num_cached_tokens = num_cached_blocks * self._block_size
         self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens
+        self._seq_id_to_blocks_hashes[
+            seq.seq_id] = block_hashes[:num_cached_blocks]
         return num_cached_tokens
 
     def remove_seq(self, seq_id: int) -> None:
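The net effect of the HPU branch in get_num_cached_tokens is that the reported prefix-cache hit is adjusted to a context bucket before being converted to tokens, so the cached-token count matches what the bucketed HPU prompt shape will actually reuse. The toy sketch below uses a made-up round-down rule over a fixed bucket list in place of the real vllm_hpu_extension find_prompt_bucket call.

def adjusted_cached_tokens(num_cached_blocks: int,
                           block_size: int,
                           ctx_buckets: list[int]) -> int:
    # Toy stand-in for find_prompt_bucket: truncate the cached-block
    # count to the largest context bucket that does not exceed it, so
    # the reported cached tokens line up with the bucketed context length.
    bkt_cached_blocks = max(
        (b for b in ctx_buckets if b <= num_cached_blocks), default=0)
    return bkt_cached_blocks * block_size

# 11 cached blocks with buckets {4, 8, 16} and block_size=128:
# only 8 blocks (1024 tokens) count as cached, the rest are recomputed.
print(adjusted_cached_tokens(11, 128, [4, 8, 16]))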

vllm/worker/hpu_model_runner.py

Lines changed: 17 additions & 5 deletions

@@ -1963,6 +1963,9 @@ def _prepare_prompt(
 
         if any(context_lens):
             assert not self.scheduler_config.chunked_prefill_enabled
+            assert self.scheduler_config.max_num_prefill_seqs == 1
+            assert bs == 1, (
+                "Prefix caching with multiple sequences is not supported yet.")
             # prefix caching
 
             max_num_block = max(len(bt) for bt in prefix_block_tables)

@@ -2836,9 +2839,8 @@ def prepare_model_input_align_worker(
         """
         with self.profiler.record_event('internal', 'prepare_input_tensors'):
             assert seq_group_metadata_list is not None
-            if self.profiler.enabled:
-                self.profiler_counter_helper.capture_seq_group_metadata_stats(
-                    seq_group_metadata_list=seq_group_metadata_list)
+            self.profiler_counter_helper.capture_seq_group_metadata_stats(
+                seq_group_metadata_list=seq_group_metadata_list)
             model_input, sampling_metadata = self.prepare_input_tensors(
                 seq_group_metadata_list, finished_requests_ids, align_worker)
             assert model_input.attn_metadata is not None

@@ -4055,7 +4057,7 @@ def execute_model(
         warmup_mode=False,
         previous_hidden_states: Optional[torch.Tensor] = None,
         seqs=None,
-        ctx_blocks: int = 1,
+        ctx_blocks: int = 0,
         is_dummy_run: bool = False,
         is_pt_profiler_run: bool = False,
     ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]:

@@ -4144,6 +4146,9 @@ def execute_model(
             if not warmup_mode:
                 ctx_blocks = seq_len
             seq_len = 1
+        elif attn_metadata.block_list is not None:
+            if not warmup_mode:
+                ctx_blocks = attn_metadata.block_list.shape[-1]
 
         if self._is_fla_model():
             use_graphs = not is_prompt

@@ -4289,8 +4294,15 @@ def try_revert_dummy_output_tokens():
                     attn_metadata,
                     kv_caches=kv_caches
                 )
+                real_seq_lens = model_input.seq_lens
+                real_seq_lens = real_seq_lens if real_seq_lens else \
+                    self.profiler_counter_helper.real_seq_lens
+                real_query_lens = model_input.query_lens
+                real_query_lens = real_query_lens if real_query_lens else \
+                    self.profiler_counter_helper.prompt_seq_lens
                 profiler_args = {
-                    'real_seq_len': model_input.seq_lens,
+                    'real_seq_lens': real_seq_lens,
+                    'real_query_lens': real_query_lens,
                     'real_batch_size': real_batch_size
                 }
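The profiler-args change is essentially a fallback: prefer the sequence and query lengths recorded on model_input and only use the profiler counter helper's captured values when they are missing. A minimal sketch with stand-in values (resolve_lens is made up for illustration):

from typing import Optional

def resolve_lens(model_input_lens: Optional[list],
                 helper_lens: list) -> list:
    # Same pattern as the diff: use model_input's lengths when present,
    # otherwise the profiler counter helper's captured values.
    return model_input_lens if model_input_lens else helper_lens

profiler_args = {
    'real_seq_lens': resolve_lens(None, [512, 384]),        # falls back
    'real_query_lens': resolve_lens([32, 32], [512, 384]),  # from model_input
    'real_batch_size': 2,
}
print(profiler_args)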
