Commit 2bc8bbf

WAR for sampled logprob
1 parent 07343bb commit 2bc8bbf

2 files changed: +93 −9 lines changed
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
from tensorrt_llm import LLM, SamplingParams


def main():
    llm = LLM(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        gather_generation_logits=True  # Required. TODO: Actual API TBD.
    )

    # Sample prompts.
    prompts = [
        "Hello, my name is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Current behavior:
    # - With return_generation_logits=True: Returns ONLY the sampled token's logprob
    # - Without return_generation_logits=True: Returns top-K tokens (sampled token NOT guaranteed)
    sampling_params = SamplingParams(
        max_tokens=10,
        temperature=0.7,
        top_p=0.95,
        logprobs=1,
        return_generation_logits=True,
    )

    for output in llm.generate(prompts, sampling_params):
        print(f"\n{'='*80}")
        print(f"Prompt: {output.prompt!r}")
        print(f"Generated text: {output.outputs[0].text!r}")
        print(f"Generated token IDs: {output.outputs[0].token_ids}")

        if output.outputs[0].logprobs:
            print(f"\nLogprobs for each generated token:")
            for i, (token_id, token_logprobs) in enumerate(
                zip(output.outputs[0].token_ids, output.outputs[0].logprobs)
            ):
                print(f"\n Token {i}: ID={token_id}, Text={llm.tokenizer.decode([token_id])!r}")

                # TODO: move to a proper test
                assert len(token_logprobs) == 1
                assert token_id in token_logprobs, f"Sampled token {token_id} not in logprobs dict."

                for tid, logprob_obj in token_logprobs.items():
                    token_text = llm.tokenizer.decode([tid])
                    is_sampled = "← SAMPLED" if tid == token_id else ""
                    print(f" • Token {tid:5d} ({token_text:15s}): "
                          f"logprob={logprob_obj.logprob:8.4f}, "
                          f"rank={logprob_obj.rank} {is_sampled}")
        print(f"{'='*80}\n")


if __name__ == '__main__':
    main()
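For comparison, a hypothetical variant of the sampling configuration above (not part of this commit): with return_generation_logits left unset, the returned logprobs follow the original top-K behavior described in the comment, and the sampled token is not guaranteed to appear among the entries.

# Hypothetical variant (illustrative only, not in this commit): without
# return_generation_logits, logprobs=K requests top-K entries per step and the
# sampled token is NOT guaranteed to be among them.
sampling_params_topk = SamplingParams(
    max_tokens=10,
    temperature=0.7,
    top_p=0.95,
    logprobs=2,  # top-2 logprobs per step
)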

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 38 additions & 9 deletions
@@ -745,15 +745,44 @@ def handle_logprobs(
     topk_log_probs_vals = request.py_topk_logprobs_vals[:count]
     topk_log_probs_indices = request.py_topk_logprobs_indices[:count]
 
-    token_log_probs = [
-        {
-            token: Logprob(logprob=logprob, rank=rank + 1)
-            for rank, (token, logprob) in enumerate(
-                zip(topk_token.tolist(), topk_logprob.tolist())
-            )
-        }
-        for topk_token, topk_logprob in zip(topk_log_probs_indices, topk_log_probs_vals)
-    ]
+    sampled_tokens = request.get_tokens(beam)[-count:]
+
+    token_log_probs = []
+    for step, (topk_token, topk_logprob) in enumerate(zip(topk_log_probs_indices, topk_log_probs_vals)):
+        sampled_token = sampled_tokens[step]
+
+        # TODO. WAR: If both gather_generation_logits and return_generation_logits are set,
+        # return ONLY the sampled token's logprob (not top-K).
+        if request.py_return_generation_logits:
+            generation_logits_storage = request.py_result._generation_logits
+            if generation_logits_storage and generation_logits_storage._storage is not None:
+                # Compute log_softmax to get logprobs for the sampled token.
+                # Internal storage tensor: [seq_length, beam_width, vocab_size]
+                logits_for_step = generation_logits_storage._storage[step]  # [beam_width, vocab_size]
+                logprobs_for_step = F.log_softmax(logits_for_step[beam].float(), dim=-1)
+                sampled_logprob = logprobs_for_step[sampled_token].item()
+
+                rank = (logprobs_for_step > sampled_logprob).sum().item() + 1
+
+                step_dict = {sampled_token: Logprob(logprob=sampled_logprob, rank=rank)}
+            else:
+                step_dict = {
+                    token: Logprob(logprob=logprob, rank=rank + 1)
+                    for rank, (token, logprob) in enumerate(
+                        zip(topk_token.tolist(), topk_logprob.tolist())
+                    )
+                }
+        else:
+            # Original behavior: return top-K.
+            step_dict = {
+                token: Logprob(logprob=logprob, rank=rank + 1)
+                for rank, (token, logprob) in enumerate(
+                    zip(topk_token.tolist(), topk_logprob.tolist())
+                )
+            }
+
+        token_log_probs.append(step_dict)
+
     assert beam == 0, (
         "The following call relies on beam_width to be 1 - hence the list with a single element"
     )
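For reference, a minimal standalone sketch of the per-step computation the WAR performs, assuming a single [vocab_size] logits row and a known sampled token id (the names below are illustrative, not the request/result API used in sampler.py):

import torch
import torch.nn.functional as F

def sampled_token_logprob(step_logits: torch.Tensor, sampled_token: int):
    # step_logits: raw logits for one generation step, shape [vocab_size].
    # Normalize over the full vocabulary so the value is a true log-probability.
    logprobs = F.log_softmax(step_logits.float(), dim=-1)
    sampled_logprob = logprobs[sampled_token].item()
    # 1-based rank: tokens with strictly higher logprob, plus one.
    rank = int((logprobs > sampled_logprob).sum().item()) + 1
    return sampled_logprob, rank

As in the branch added above, the rank is computed against the full vocabulary, so the single returned entry can carry a rank larger than the requested logprobs count even though only the sampled token is reported.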
