
Commit b5e195d

[TRTLLM-6756][chore] Enhance TorchSampler with new setup_sampler_step method and fix bugs
- Introduced a setup_sampler_step method to enable the setup process for disaggregated serving in beam search.
- Updated cache indirection initialization to use torch.zeros, preventing reads of invalid values from cache_indirection.
- Updated the MTPSampler to correctly call TorchSampler functions.
- Fixed _handle_finish_reasons by wrapping finish reasons in the FinishReason class.
- Adjusted the max_lengths_tensor calculation to account for the original prompt length.

Signed-off-by: Stefan Niebler <[email protected]>
1 parent 7cc7260 commit b5e195d

3 files changed: +49 -22 lines changed


tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 32 additions & 13 deletions
@@ -639,7 +639,7 @@ def finish_reasons_list(self) -> FinishReasonsList:
 @dataclass(kw_only=True)
 class SampleStateTorch(SampleState):
     host: SampleStateTensorsHostTorch
-    beam_histories: list[BeamHistory | None]
+    beam_histories: list[BeamHistory | None] | None = None


 class TorchSampler(Sampler):
@@ -691,7 +691,9 @@ def create_store(self) -> Store:
         return self.Store(
             new_tokens=int_tensor(self.NEW_TOKENS_SHAPE),
             finish_reasons=int_tensor(self.NEW_TOKENS_SHAPE),
-            cache_indirection=int_tensor(self.CACHE_INDIRECTION_SHAPE),
+            cache_indirection=torch.zeros(
+                self.CACHE_INDIRECTION_SHAPE, device="cuda", dtype=torch.int
+            ),
             cache_indirection_buffer=int_tensor(self.CACHE_INDIRECTION_SHAPE),
             cum_log_probs=torch.zeros(
                 self.CACHE_INDIRECTION_SHAPE[:-1], device="cuda", dtype=torch.float32
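
The cache_indirection change above swaps an uninitialized integer buffer for a zero-filled one, so code that reads an indirection entry before it has been written sees a valid beam index (0) instead of arbitrary memory. Below is a minimal, self-contained sketch of the difference; the int_tensor helper here is a hypothetical stand-in assumed to allocate uninitialized storage, and the shape is illustrative rather than the real CACHE_INDIRECTION_SHAPE.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Hypothetical stand-in for the repo's int_tensor helper: uninitialized storage.
def int_tensor(shape):
    return torch.empty(shape, dtype=torch.int, device=device)

shape = (4, 2, 8)  # illustrative (seq_slots, beams, seq_len), not the real constant

uninitialized = int_tensor(shape)                            # contents are arbitrary
zeroed = torch.zeros(shape, dtype=torch.int, device=device)  # contents are defined

# Reading an entry that was never written is only well-defined for the zeroed buffer:
print(zeroed[0, :, 0].tolist())         # always [0, 0]
print(uninitialized[0, :, 0].tolist())  # whatever happened to be in that memory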
@@ -718,7 +720,7 @@ class Args:
         max_num_sequences: int
         max_beam_width: int
         max_total_draft_tokens: int
-        disable_overlap_scheduler: bool
+        disable_overlap_scheduler: bool = False
         disable_flash_infer_sampling: bool = False

     def __init__(self, args: Args):
@@ -873,7 +875,10 @@ def _handle_finish_reasons(
             request.state = LlmRequestState.GENERATION_COMPLETE
             for beam_idx in range(request.sampling_config.beam_width):
                 request.set_finished_reason(
-                    finish_reasons_list[request.py_seq_slot][DEFAULT_STEP_IDX][beam_idx], beam_idx
+                    FinishReason(
+                        finish_reasons_list[request.py_seq_slot][DEFAULT_STEP_IDX][beam_idx]
+                    ),
+                    beam_idx,
                 )
             return True
         return False
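
The fix above wraps the raw integer read out of the finish-reasons buffer in the FinishReason enum before handing it to request.set_finished_reason, which expects the enum rather than a bare int. A small standalone sketch of the pattern follows; the enum and setter are hypothetical stand-ins with illustrative values, not the repo's actual bindings.

from enum import IntEnum

# Hypothetical stand-in for the FinishReason binding; member values are illustrative.
class FinishReason(IntEnum):
    NOT_FINISHED = 0
    END_ID = 1
    STOP_WORDS = 2
    LENGTH = 3

# Hypothetical setter that, like the real one, expects the enum rather than an int.
def set_finished_reason(reason: FinishReason, beam_idx: int) -> None:
    assert isinstance(reason, FinishReason)
    print(f"beam {beam_idx} finished with {reason.name}")

raw = 3  # plain int as read from the tensor-backed finish_reasons list
set_finished_reason(FinishReason(raw), beam_idx=0)  # OK: wrapped in the enum first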
@@ -1069,7 +1074,10 @@ def _process_draft_tokens_tree(
         for idx in eagle_paths[longest_match_path_idx][:longest_accepted_len]:
             add_token(request, new_tokens_list, beam_idx=self.DEFAULT_BEAM_IDX, step=cast(int, idx.item()))
             num_accepted_draft_tokens += 1
-            if self.finish_if_reason(request, finish_reasons, step=num_accepted_draft_tokens):
+            if self.finish_if_reason(request,
+                                     finish_reasons,
+                                     step=num_accepted_draft_tokens,
+                                     beam_idx=DEFAULT_BEAM_IDX,):
                 break

         assert num_accepted_draft_tokens <= longest_accepted_len
@@ -1080,6 +1088,15 @@ def _process_draft_tokens_tree(
         return num_accepted_draft_tokens - 1


+    def setup_sampler_step(self, requests: ScheduledRequests):
+        """Setup the sampler step for the requests
+
+        Args:
+            requests: list[LlmRequest]. The requests to setup the sampler step for
+        """
+        if self._use_beam_search:
+            self._prepare_beam_search(requests)
+
     def _prepare_beam_search(
         self,
         requests: list[LlmRequest],
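
setup_sampler_step gives callers an explicit hook to run per-request preparation (currently beam-search buffer resets) before a sampling step. The sketch below shows how a caller might use it; the surrounding function and variable names are hypothetical, since the executor wiring is not part of this diff.

# Hypothetical caller (not the actual py_executor code): invoke the new hook once
# per iteration, before sampling, so that beam-search state such as
# cache_indirection and cum_log_probs is (re)initialized. With the widened
# condition in _prepare_beam_search below, this also covers disaggregated
# serving, where a generation request arrives with its KV cache transmitted
# instead of being produced by a local context chunk.
def run_sampler_iteration(sampler, scheduled_requests):
    sampler.setup_sampler_step(scheduled_requests)
    # ... sampling and update_requests would follow here ...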
@@ -1090,12 +1107,11 @@ def _prepare_beam_search(
         initialize/reset the buffers for the request
         """
         for request in requests:
-            if (
-                not request.is_finished
-                and request.is_context_init_state
-                and request.is_last_context_chunk
+            if not request.is_finished and (
+                (request.is_context_init_state and request.is_last_context_chunk)
+                or request.is_disagg_generation_transmission_complete
             ):
-                if request.py_num_logprobs > 1:
+                if request.py_return_log_probs and request.py_num_logprobs > 1:
                     raise ValueError("Beam search does not support multiple logprobs")
                 self.store.cache_indirection[request.py_seq_slot, :, request.py_prompt_len].fill_(0)
                 self.store.cum_log_probs[request.py_seq_slot].fill_(0)
@@ -1559,7 +1575,7 @@ def update_requests(
                 or req.context_remaining_length != 0
             ):
                 continue
-            if beam_histories[req_idx] is not None:
+            if beam_histories is not None and beam_histories[req_idx] is not None:
                 self._finalize_beam(
                     req,
                     beam_histories[req_idx],
@@ -1579,7 +1595,7 @@ def update_requests(
             if req.state == LlmRequestState.GENERATION_COMPLETE:
                 continue
             if req.sampling_config.beam_width > 1:
-                if beam_histories[req_idx] is not None:
+                if beam_histories is not None and beam_histories[req_idx] is not None:
                     self._finalize_beam(
                         req,
                         beam_histories[req_idx],
@@ -2206,7 +2222,10 @@ def _are_max_length(self, requests: list[LlmRequest]) -> torch.Tensor:
         )
         max_lengths_tensor = torch.tensor(
             [
-                ([min(req.py_max_new_tokens, self.max_seq_len)] * self.max_beam_width)
+                (
+                    [min(req.py_max_new_tokens, self.max_seq_len - req.orig_prompt_len)]
+                    * self.max_beam_width
+                )
                 for req in requests
             ]
             * self.max_tokens
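
The change to max_lengths_tensor subtracts the original prompt length from the sequence-length cap, so the per-request bound counts only generated tokens rather than the whole sequence. A quick worked example with illustrative numbers:

# Illustrative numbers only.
max_seq_len = 16
py_max_new_tokens = 64
orig_prompt_len = 10

old_bound = min(py_max_new_tokens, max_seq_len)                    # 16 -> prompt + 16 = 26 tokens, overshoots max_seq_len
new_bound = min(py_max_new_tokens, max_seq_len - orig_prompt_len)  # 6  -> prompt + 6 = 16 tokens, exactly max_seq_len

print(old_bound, new_bound)  # 16 6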

tensorrt_llm/_torch/speculative/mtp.py

Lines changed: 15 additions & 7 deletions
@@ -236,11 +236,12 @@ def __init__(self, args: TorchSampler.Args, *, nextn: int):

         seq_slots = args.max_num_sequences
         max_tokens = args.max_total_draft_tokens + 1
-        max_beam_width = args.max_beam_width
+        self.max_beam_width = args.max_beam_width

         self.store = self.Store(
-            new_tokens=int_tensor((max_tokens, seq_slots, max_beam_width)),
-            next_new_tokens=int_tensor((max_tokens, seq_slots, max_beam_width)),
+            new_tokens=int_tensor((max_tokens, seq_slots, self.max_beam_width)),
+            next_new_tokens=int_tensor(
+                (max_tokens, seq_slots, self.max_beam_width)),
             next_draft_tokens=int_tensor(
                 (seq_slots, args.max_total_draft_tokens)),
             new_tokens_lens=int_tensor((seq_slots, )),
@@ -271,20 +272,27 @@ def update_requests(
         for req in state.scheduled_requests.context_requests:
             if req.state == LlmRequestState.GENERATION_COMPLETE or req.context_remaining_length != 0:
                 continue
-            new_token = add_token(req, new_tokens, beam=beam_idx)
+            new_token = add_token(req, new_tokens, beam_idx=beam_idx)
             TorchSampler._handle_stop_criteria(req,
                                                new_token,
-                                               max_seq_len=self.max_seq_len)
+                                               max_seq_len=self.max_seq_len,
+                                               beam_idx=beam_idx)
             self._request_common_handling(req, next_draft_tokens_list)

         for req in state.scheduled_requests.generation_requests:
             if req.state == LlmRequestState.GENERATION_COMPLETE:
                 continue
             num_new_tokens = new_tokens_lens_list[req.py_seq_slot]
             for i in range(num_new_tokens):
-                new_token = add_token(req, new_tokens, beam=beam_idx, step=i)
+                new_token = add_token(req,
+                                      new_tokens,
+                                      beam_idx=beam_idx,
+                                      step=i)
                 if TorchSampler._handle_stop_criteria(
-                        req, new_token, max_seq_len=self.max_seq_len):
+                        req,
+                        new_token,
+                        max_seq_len=self.max_seq_len,
+                        beam_idx=beam_idx):
                     break
             req.py_num_accepted_draft_tokens = num_new_tokens - 1
             req.py_rewind_len = self.draft_len - req.py_num_accepted_draft_tokens

tests/unittest/_torch/speculative/test_draft_token_tree_verification.py

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ def run_test(eagle_model_dir, max_seq_len, beam_width, use_dynamic_tree,
     ))
     # fill with NOT_FINISHED to ensure that all finish reasons are NOT_FINISHED
     torch_sampler.store.finish_reasons.fill_(FinishReason.NOT_FINISHED.value)
-    finish_reasons_list = torch_sampler.store.finish_reasons[..., 0].to(
-        device="cpu").T.tolist()
+    finish_reasons_list = torch_sampler.store.finish_reasons.to(
+        device="cpu").permute(1, 0, 2).tolist()
     input_new_tokens_list = input_new_tokens.tolist()
     num_accepted_draft_tokens = torch_sampler._process_draft_tokens_tree(
         request=input_request,
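
The test helper previously kept only beam 0 and transposed to a [slot][step] list; it now keeps every beam and permutes to [slot][step][beam], matching how _handle_finish_reasons indexes finish_reasons_list[seq_slot][step][beam]. A small sketch of the reordering, assuming the buffer is laid out as (steps, seq_slots, beams) like the new_tokens tensors elsewhere in this diff:

import torch

steps, seq_slots, beams = 3, 4, 2  # illustrative sizes
finish_reasons = torch.arange(steps * seq_slots * beams).reshape(steps, seq_slots, beams)

old_list = finish_reasons[..., 0].T.tolist()         # beam 0 only, shape (seq_slots, steps)
new_list = finish_reasons.permute(1, 0, 2).tolist()  # all beams, shape (seq_slots, steps, beams)

print(len(new_list), len(new_list[0]), len(new_list[0][0]))  # 4 3 2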
