
Commit c9bade2

Use the sample function instead of process_logits, and apply changes based on review comments.
Signed-off-by: Wangshanshan <[email protected]>
1 parent 30e7b79 commit c9bade2

File tree

4 files changed: 44 additions & 216 deletions


tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 4 additions & 2 deletions
@@ -438,6 +438,8 @@ class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest):
     """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest`
     but detour some features to Python implementation"""
 
+    _logprob_params = None
+
     def __init__(
         self,
         *args,
@@ -797,8 +799,8 @@ def executor_request_to_llm_request(
         py_multimodal_data=getattr(executor_request, "py_multimodal_data",
                                    None),
         kv_cache_retention_config=executor_request.kv_cache_retention_config)
-    if hasattr(executor_request, "_logprob_params"):
-        llm_request._logprob_params = executor_request._logprob_params
+    llm_request._logprob_params = getattr(executor_request, "_logprob_params",
+                                          None)
     if child_req_ids:
         for child_id in child_req_ids:
            llm_request.create_child_request(child_id)
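
A minimal, self-contained sketch (not part of the diff; FakeExecutorRequest and FakeLlmRequest are hypothetical stand-ins, not TensorRT-LLM types) of why the class-level default plus getattr(..., None) replaces the hasattr check: the attribute is now defined on every request, falling back to None when the executor request never set it.

class FakeExecutorRequest:
    pass  # may or may not carry _logprob_params

class FakeLlmRequest:
    _logprob_params = None  # class-level default, mirroring the diff

def convert(executor_request: FakeExecutorRequest) -> FakeLlmRequest:
    llm_request = FakeLlmRequest()
    # Single unconditional assignment; falls back to None when the attribute is absent.
    llm_request._logprob_params = getattr(executor_request, "_logprob_params", None)
    return llm_request

req_without = FakeExecutorRequest()
req_with = FakeExecutorRequest()
req_with._logprob_params = {"logprobs": 5}

assert convert(req_without)._logprob_params is None
assert convert(req_with)._logprob_params == {"logprobs": 5}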

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 7 additions & 10 deletions
@@ -68,7 +68,6 @@
     Strategy,
     UtilsSamplingParams,
     get_rejected_indices,
-    process_logits,
     resolve_sampling_strategy,
     sample,
     sample_rejected,
@@ -975,7 +974,7 @@ def _process_draft_tokens_rejection_sampling(
                 else _request_strategy(request, vocab_size=2**31)
             )
             generator = self.get_generator(request.py_draft_logits.device)
-            _, draft_probs = sample(
+            _, draft_probs, _ = sample(
                 draft_sampling_strategy,
                 request.py_draft_logits,
                 generator=generator,
@@ -1800,21 +1799,19 @@ def _process_requests(
             if logprobs_mode == "processed_logprobs":
                 # Process logits with the same transformations as sampling (temperature, top-k, top-p)
                 # but without actually sampling
-                processed_logits_list = []
+                logprobs_list = []
                 for req_id in logprobs_req_indices:
                     req = requests[req_id]
                     strategy = _request_strategy(req, vocab_size=logits_cuda.size(1))
                     req_logits_indices = logits_cuda_indexer[req_id]
                     req_logits = logits_cuda[req_logits_indices].to(
                         dtype=torch.float32, non_blocking=True
                     )
-                    # Apply the same processing as sampling would apply
-                    processed_req_logits = process_logits(strategy, req_logits)
-                    processed_logits_list.append(processed_req_logits)
-                # Concatenate all processed logits
-                processed_logits_cuda = torch.cat(processed_logits_list, dim=0)
-                # Apply log_softmax to get log probabilities
-                logprobs_cuda = F.log_softmax(processed_logits_cuda, dim=-1)
+                    # Use sample() to get processed logprobs (after temperature, top-k, top-p applied)
+                    _, _, req_logprobs = sample(strategy, req_logits, return_probs=True)
+                    logprobs_list.append(req_logprobs)
+                # Concatenate all logprobs
+                logprobs_cuda = torch.cat(logprobs_list, dim=0)
             else:
                 # For raw_logprobs and other modes, use raw logits (before sampling modifications)
                 raw_logits_for_logprobs = raw_logits_cuda[:sum_steps]
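
As a rough illustration only, the sketch below mimics the new "processed_logprobs" path with a toy temperature_sample() helper standing in for sampling_utils.sample(); only the 3-tuple return shape (tokens, probs, logprobs) and the per-request concatenation mirror this commit, the rest is assumed.

from typing import Optional

import torch
import torch.nn.functional as F

def temperature_sample(logits: torch.Tensor, temperature: float,
                       generator: Optional[torch.Generator] = None):
    # Log probabilities come from the same scaled logits the sampler draws from,
    # rather than being recomputed separately from raw logits.
    scaled = logits / max(temperature, 1e-5)
    probs = torch.softmax(scaled, dim=-1)
    logprobs = F.log_softmax(scaled, dim=-1)
    tokens = torch.multinomial(probs, num_samples=1, generator=generator).squeeze(-1)
    return tokens, probs, logprobs

per_request_logits = [torch.randn(3, 8), torch.randn(2, 8)]
logprobs_list = []
for req_logits in per_request_logits:
    _, _, req_logprobs = temperature_sample(req_logits, temperature=0.7)
    logprobs_list.append(req_logprobs)
logprobs_all = torch.cat(logprobs_list, dim=0)  # [5, 8], one row per generated step
assert torch.allclose(logprobs_all.exp().sum(dim=-1), torch.ones(5), atol=1e-5)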

tensorrt_llm/_torch/pyexecutor/sampling_utils.py

Lines changed: 25 additions & 81 deletions
@@ -24,6 +24,7 @@
 from typing import Generic, Literal, Optional, TypeAlias, TypeVar, cast
 
 import torch
+import torch.nn.functional as F
 
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -95,7 +96,7 @@ def top_k_sampling_batch(
     top_k: int,
     temperature: float,
     generator: Optional[torch.Generator] = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     # NB: To be replaced by a more efficient implementation.
     return top_k_top_p_sampling_batch(
         logits,
@@ -112,7 +113,7 @@ def top_p_sampling_batch(
     top_p: float,
     temperature: float,
     generator: Optional[torch.Generator] = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     # NB: To be replaced by a more efficient implementation.
     return top_k_top_p_sampling_batch(
         logits,
@@ -128,7 +129,7 @@ def temperature_sampling_batch(
     *,
     temperature: float,
     generator: Optional[torch.Generator] = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     # NB: To be replaced by a more efficient implementation.
     return top_k_top_p_sampling_batch(
         logits,
@@ -146,7 +147,7 @@ def top_k_top_p_sampling_batch(
     top_p: float,
     temperature: float,
     generator: Optional[torch.Generator] = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     logits_dim = logits.dim()
     assert logits_dim == 2, "logits should be 2D: [batch_size, vocab_size]"
     assert temperature > 0, "non-greedy sampling requires valid temperature"
@@ -189,21 +190,26 @@ def top_k_top_p_sampling_batch(
     # compute probability distribution
     softmax = torch.softmax(logits, dim=-1)
 
+    # compute log probabilities
+    logprobs = F.log_softmax(logits, dim=-1)
+
     # sample from the distribution and generate result of [batch_size, 1]
     next_tokens = torch.multinomial(softmax, num_samples=1, generator=generator).squeeze(-1)
-    return next_tokens, softmax
+    return next_tokens, softmax, logprobs
 
 
 def greedy_search_sampling_batch(
     logits,
     *,
     return_probs: bool = True,
-) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
     next_tokens = torch.argmax(logits, dim=-1)
     softmax: Optional[torch.Tensor] = None
+    logprobs: Optional[torch.Tensor] = None
     if return_probs:
         softmax = torch.softmax(logits, dim=-1)
-    return next_tokens, softmax
+        logprobs = F.log_softmax(logits, dim=-1)
+    return next_tokens, softmax, logprobs
 
 
 def get_rejected_indices(
@@ -248,71 +254,6 @@ def sample_rejected(
     return cast(int, new_token.item())
 
 
-def process_logits(
-    strategy: Strategy,
-    logits: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Process logits according to the specified strategy (temperature, top-k, top-p)
-    without sampling. Returns processed logits ready for log_softmax.
-
-    Args:
-        strategy: Sampling strategy tuple (strategy_name, *params)
-        logits: Input logits tensor [batch_size, vocab_size]
-
-    Returns:
-        Processed logits tensor [batch_size, vocab_size]
-    """
-    logits = logits.clone()
-    match strategy:
-        case ("top_k", top_k, temperature):
-            logits = logits / max(temperature, 1e-5)
-            batch_size, vocab_size = logits.size()
-            if top_k < vocab_size:
-                values, _ = torch.topk(logits, top_k, dim=-1)
-                min_values = values[:, -1].unsqueeze(-1).expand(batch_size, vocab_size)
-                logits = torch.where(
-                    logits < min_values, torch.full_like(logits, float("-inf")), logits
-                )
-        case ("top_p", top_p, temperature):
-            logits = logits / max(temperature, 1e-5)
-            if top_p < 1:
-                sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
-                sorted_indices_to_remove = cumulative_probs > top_p
-                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
-                sorted_indices_to_remove[:, 0] = 0
-                indices_to_remove = sorted_indices_to_remove.scatter(
-                    1, sorted_indices, sorted_indices_to_remove
-                )
-                logits = logits.masked_fill(indices_to_remove, float("-inf"))
-        case ("top_k_top_p", top_k, top_p, temperature):
-            logits = logits / max(temperature, 1e-5)
-            batch_size, vocab_size = logits.size()
-            if top_k < vocab_size:
-                values, _ = torch.topk(logits, top_k, dim=-1)
-                min_values = values[:, -1].unsqueeze(-1).expand(batch_size, vocab_size)
-                logits = torch.where(
-                    logits < min_values, torch.full_like(logits, float("-inf")), logits
-                )
-            if top_p < 1:
-                sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-                cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
-                sorted_indices_to_remove = cumulative_probs > top_p
-                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
-                sorted_indices_to_remove[:, 0] = 0
-                indices_to_remove = sorted_indices_to_remove.scatter(
-                    1, sorted_indices, sorted_indices_to_remove
-                )
-                logits = logits.masked_fill(indices_to_remove, float("-inf"))
-        case ("temperature", temperature):
-            logits = logits / max(temperature, 1e-5)
-        case ("greedy", None):
-            # No processing needed for greedy
-            pass
-    return logits
-
-
 def sample(
     strategy: Strategy,
     logits: torch.Tensor,
@@ -327,43 +268,45 @@ def sample(
         strategy: Sampling strategy tuple (strategy_name, *params)
         logits: Input logits tensor
        generator: Optional random generator
-        return_probs: If True, return softmax probabilities
+        return_probs: If True, return softmax probabilities and log probabilities
 
     Returns:
-        Tuple of (sampled_tokens, softmax_probs)
+        Tuple of (sampled_tokens, softmax_probs, logprobs)
     """
     match strategy:
         case ("top_k", top_k, temperature):
-            tokens, softmax = top_k_sampling_batch(
+            tokens, softmax, logprobs = top_k_sampling_batch(
                logits,
                top_k=top_k,
                temperature=temperature,
                generator=generator,
            )
        case ("top_p", top_p, temperature):
-            tokens, softmax = top_p_sampling_batch(
+            tokens, softmax, logprobs = top_p_sampling_batch(
                logits,
                top_p=top_p,
                generator=generator,
                temperature=temperature,
            )
        case ("top_k_top_p", top_k, top_p, temperature):
-            tokens, softmax = top_k_top_p_sampling_batch(
+            tokens, softmax, logprobs = top_k_top_p_sampling_batch(
                logits,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                generator=generator,
            )
        case ("temperature", temperature):
-            tokens, softmax = temperature_sampling_batch(
+            tokens, softmax, logprobs = temperature_sampling_batch(
                logits,
                temperature=temperature,
                generator=generator,
            )
        case ("greedy", None):
-            tokens, softmax = greedy_search_sampling_batch(logits, return_probs=return_probs)
-    return tokens, softmax
+            tokens, softmax, logprobs = greedy_search_sampling_batch(
+                logits, return_probs=return_probs
+            )
+    return tokens, softmax, logprobs
 
 
 GenericStrategyKeyType = TypeVar("GenericStrategyKeyType")
@@ -415,12 +358,13 @@ def sample_grouped_strategies(
 
     assert all(strategy == group_key for strategy in strategies), "group must be consistent"
 
-    return sample(
+    tokens, probs, _ = sample(
        group_key,
        logits,
        generator=generator,
        return_probs=return_probs,
    )
+    return tokens, probs
 
 
 class _AcceptSyncCompute:
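
For reference, a toy sketch of the widened return contract (toy_greedy_sampling_batch is hypothetical, not the library function): every *_sampling_batch helper and sample() now return (tokens, softmax, logprobs), and callers that do not need the log probabilities discard the third element, as sample_grouped_strategies does above.

from typing import Optional

import torch
import torch.nn.functional as F

def toy_greedy_sampling_batch(
    logits: torch.Tensor, *, return_probs: bool = True
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
    # Mirrors the shape of the greedy path in this diff: argmax tokens plus
    # optional softmax probabilities and log probabilities.
    next_tokens = torch.argmax(logits, dim=-1)
    softmax: Optional[torch.Tensor] = None
    logprobs: Optional[torch.Tensor] = None
    if return_probs:
        softmax = torch.softmax(logits, dim=-1)
        logprobs = F.log_softmax(logits, dim=-1)
    return next_tokens, softmax, logprobs

logits = torch.randn(4, 16)
tokens, probs, logprobs = toy_greedy_sampling_batch(logits)
assert logprobs is not None and logprobs.shape == logits.shape
# Callers that only need tokens and probabilities drop the new third element:
tokens_only, probs_only, _ = toy_greedy_sampling_batch(logits, return_probs=False)
assert probs_only is None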
