
Commit ede907b

fix step
1 parent 2bc8bbf commit ede907b

2 files changed: +28 -8 lines changed


examples/llm-api/llm_inference_logprob.py

Lines changed: 22 additions & 5 deletions
@@ -1,12 +1,21 @@
+import torch
+
 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm._tensorrt_engine import LLM as TrtLLM


 def main():
     llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        gather_generation_logits=True  # Required. TODO: Acutal API TBD.
+        model="/scratch/llm-models/llama-models-v2/TinyLlama-1.1B-Chat-v1.0",
+        gather_generation_logits=True,  # Required. TODO: Actual API TBD.
+        orchestrator_type="ray"
     )

+    # llm = TrtLLM(
+    #     model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    #     gather_generation_logits=True
+    # )
+
     # Sample prompts.
     prompts = [
         "Hello, my name is",
@@ -19,8 +28,8 @@ def main():
     # - Without return_generation_logits=True: Returns top-K tokens (sampled token NOT guaranteed)
     sampling_params = SamplingParams(
         max_tokens=10,
-        temperature=0.7,
-        top_p=0.95,
+        # temperature=0.7,
+        # top_p=0.95,
         logprobs=1,
         return_generation_logits=True,
     )
@@ -31,6 +40,14 @@ def main():
         print(f"Generated text: {output.outputs[0].text!r}")
         print(f"Generated token IDs: {output.outputs[0].token_ids}")

+        if output.outputs[0].generation_logits is not None:
+            logits = output.outputs[0].generation_logits
+
+            # sanity check on sampled logits
+            num_logits = logits.shape[0]
+            sampled_logits = [logits[i, token_id].item() for i, token_id in enumerate(output.outputs[0].token_ids[:num_logits])]
+            print(f"Logits of sampled tokens: {sampled_logits}")
+
         if output.outputs[0].logprobs:
             print(f"\nLogprobs for each generated token:")
             for i, (token_id, token_logprobs) in enumerate(
@@ -39,7 +56,7 @@ def main():
                 print(f"\n Token {i}: ID={token_id}, Text={llm.tokenizer.decode([token_id])!r}")

                 # TODO. move to proper test
-                assert len(token_logprobs) == 1
+                # assert len(token_logprobs) == 1
                 assert token_id in token_logprobs, f"Sampled token {token_id} not in logprobs dict."

                 for tid, logprob_obj in token_logprobs.items():
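
For context, the sanity check added above can be pushed one step further: cross-check the reported logprobs against log_softmax of the raw generation logits. The sketch below is not part of the commit and rests on the same assumptions the example makes (generation_logits shaped [num_generated_tokens, vocab_size], logprobs as a list of {token_id: logprob_obj} dicts, and a .logprob attribute on the logprob objects):

import torch.nn.functional as F

def cross_check(completion, atol=1e-3):
    """Compare reported logprobs with log_softmax of the stored generation logits."""
    logits = completion.generation_logits  # assumed shape: [num_generated_tokens, vocab_size]
    if logits is None or not completion.logprobs:
        return
    num_logits = logits.shape[0]
    for i, (token_id, token_logprobs) in enumerate(
            zip(completion.token_ids[:num_logits], completion.logprobs)):
        # Recompute the sampled token's logprob from the corresponding logits row
        recomputed = F.log_softmax(logits[i].float(), dim=-1)[token_id].item()
        reported = token_logprobs[token_id].logprob  # assumed attribute name
        assert abs(recomputed - reported) < atol, (i, recomputed, reported)

Called as cross_check(output.outputs[0]) inside the loop above, a check like this would fail whenever the sampler reads logits from the wrong step, which is what the sampler.py change below corrects.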

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 6 additions & 3 deletions
@@ -756,9 +756,12 @@ def handle_logprobs(
     if request.py_return_generation_logits:
         generation_logits_storage = request.py_result._generation_logits
         if generation_logits_storage and generation_logits_storage._storage is not None:
-            # Compute log_softmax to get logprobs for the sampled token
-            # Iinternal storage tensor: [seq_length, beam_width, vocab_size]
-            logits_for_step = generation_logits_storage._storage[step]  # [beam_width, vocab_size]
+            # Internal storage tensor: [seq_length, beam_width, vocab_size]
+            # Calculate absolute step index in the generation sequence
+            num_generated_tokens = len(request.get_tokens(beam)) - request.py_prompt_len
+            absolute_step = num_generated_tokens - count + step
+
+            logits_for_step = generation_logits_storage._storage[absolute_step]  # [beam_width, vocab_size]
             logprobs_for_step = F.log_softmax(logits_for_step[beam].float(), dim=-1)
             sampled_logprob = logprobs_for_step[sampled_token].item()
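
The fix replaces the call-relative step with an absolute index into the generation-logits storage, whose first dimension spans the whole generated sequence. A small illustration of the arithmetic with hypothetical numbers (variable names mirror the diff; the values and the calling pattern are assumptions, not taken from the code):

# Assume handle_logprobs() is invoked with `count` freshly sampled tokens
# at a time, after earlier tokens have already been written to storage.
py_prompt_len = 8                # prompt tokens (not stored as generation logits)
tokens_in_request = 8 + 6        # prompt + 6 generated tokens after this call
count = 2                        # tokens handled in this call: step = 0, 1

num_generated_tokens = tokens_in_request - py_prompt_len    # 6
for step in range(count):
    absolute_step = num_generated_tokens - count + step     # 4, then 5
    # The old code indexed storage[step] (rows 0 and 1); the fixed code reads
    # rows 4 and 5, i.e. the logits of the 5th and 6th generated tokens.
    print(step, "->", absolute_step)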
