@@ -1604,16 +1604,16 @@ def previous_seq_slots_device():
 
         # The order of requests in a batch: [context requests, generation requests]
         # generation requests: ['requests that do not have previous batch', 'requests that already have previous batch', 'dummy requests']
-        # 1) 'requests that do not have previous batch': disable overlap scheduler or the first step in the generation server of disaggregated serving.
-        # 2) 'requests that already have previous batch': previous iteration's requests.
-        # 3) 'dummy requests': pad dummy requests for CUDA graph or attention dp.
+        # 1) 'requests that do not have previous batch': the overlap scheduler is disabled, or this is the first step in the generation server of disaggregated serving.
+        # 2) 'requests that already have previous batch': requests carried over from the previous iteration.
+        # 3) 'dummy requests': dummy requests padded in for CUDA graph or attention DP.
         # Therefore, both self.previous_pos_id_offsets_cuda and self.previous_kv_lens_offsets_cuda also have 3 segments.
-        # For 1) 'requests that do not have previous batch': disable overlap scheduler or the first step in the generation server of disaggregated serving.
+        # For 1) 'requests that do not have previous batch':
         # Set these requests' previous_pos_id_offsets and previous_kv_lens_offsets to '0' to skip the value changes in _preprocess_inputs.
         # Already set to '0' during initialization.
-        # For 2) 'requests that already have previous batch': enable overlap scheduler.
+        # For 2) 'requests that already have previous batch' (overlap scheduler enabled):
         # Set their previous_pos_id_offsets and previous_kv_lens_offsets according to new_tokens_lens_device and kv_len_offsets_device.
-        # For 3) 'dummy requests': pad dummy requests for CUDA graph or attention dp.
+        # For 3) 'dummy requests':
         # Already set to '0' during initialization.
 
         num_extend_reqeust_wo_dummy = len(extend_requests) - len(
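
To make the three-segment bookkeeping above concrete, the sketch below shows the two offset buffers being maintained. The buffer and tensor names follow the comment block; the capacity, slot range, and numeric values are hypothetical stand-ins, not the engine's real shapes.

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
max_num_requests = 8  # hypothetical batch capacity

# Zero-initialization already covers segments 1) and 3): requests without a
# previous batch and padded dummy requests keep offset 0, so
# _preprocess_inputs leaves their inputs untouched.
previous_pos_id_offsets_cuda = torch.zeros(
    max_num_requests, dtype=torch.int32, device=device)
previous_kv_lens_offsets_cuda = torch.zeros(
    max_num_requests, dtype=torch.int32, device=device)

# Segment 2): requests carried over from the previous iteration when the
# overlap scheduler is enabled. Suppose slots 2..4 hold such requests and
# the previous step produced these per-request values (stand-in numbers):
start, end = 2, 5
new_tokens_lens_device = torch.tensor([1, 2, 1], dtype=torch.int32, device=device)
kv_len_offsets_device = torch.tensor([0, 1, 0], dtype=torch.int32, device=device)

# Only the middle segment is overwritten; the other two segments stay at 0.
previous_pos_id_offsets_cuda[start:end] = new_tokens_lens_device
previous_kv_lens_offsets_cuda[start:end] = kv_len_offsets_device
```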
@@ -2401,8 +2401,7 @@ def capture_postprocess_fn(inputs: Dict[str, Any]):
             outputs = self.cuda_graph_runner.replay(key, inputs)
         else:
             with MoeLoadBalancerIterContext(moe_load_balancer):
-                outputs = self._forward_step(
-                    inputs, gather_ids, gather_context_logits)
+                outputs = self.cuda_graph_runner.replay(key, inputs)
 
         if self.forward_pass_callable is not None:
             self.forward_pass_callable()
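
For context, here is a condensed, self-contained sketch of the capture-vs-replay dispatch this hunk touches. Only cuda_graph_runner.replay and _forward_step appear in the diff; needs_capture and capture are assumed helper methods, MoeLoadBalancerIterContext is omitted for brevity, and the toy runner stands in for the real CUDA-graph machinery.

```python
from typing import Any, Callable, Dict

class SketchGraphRunner:
    """Hypothetical stand-in for cuda_graph_runner: caches one 'graph'
    (here just a callable) per batch-shape key."""

    def __init__(self) -> None:
        self._graphs: Dict[Any, Callable[[Dict[str, Any]], Any]] = {}

    def needs_capture(self, key: Any) -> bool:
        return key not in self._graphs

    def capture(self, key: Any, forward_fn: Callable) -> None:
        # The real runner would record forward_fn with torch.cuda.graph();
        # this sketch just remembers the callable.
        self._graphs[key] = forward_fn

    def replay(self, key: Any, inputs: Dict[str, Any]) -> Any:
        # The real runner launches the captured graph on static buffers;
        # this sketch invokes the recorded callable.
        return self._graphs[key](inputs)

def forward(runner: SketchGraphRunner, key: Any, inputs: Dict[str, Any],
            forward_step: Callable[[Dict[str, Any]], Any]) -> Any:
    # Mirrors the two branches in the hunk: capture-then-replay the first
    # time a batch-shape key is seen, plain replay afterwards. Before this
    # change, the second branch re-ran the eager forward_step instead.
    if runner.needs_capture(key):
        runner.capture(key, forward_step)
        return runner.replay(key, inputs)
    return runner.replay(key, inputs)

def toy_forward_step(inputs: Dict[str, Any]) -> Any:
    return [t + 1 for t in inputs["tokens"]]

runner = SketchGraphRunner()
print(forward(runner, ("decode", 2), {"tokens": [1, 2]}, toy_forward_step))  # captures, then replays
print(forward(runner, ("decode", 2), {"tokens": [5, 6]}, toy_forward_step))  # replays only
```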