 from ...inputs.multimodal import MultimodalParams
 from ..expert_statistic import ExpertStatistic
 from ..modules.multi_stream_utils import with_multi_stream
+from ..speculative.eagle3 import Eagle3ResourceManager
 from ..utils import make_weak_ref, piecewise_cuda_graph
-from .resource_manager import ResourceManager, ResourceManagerType
+from .resource_manager import (BaseResourceManager, ResourceManager,
+                               ResourceManagerType)
 from .scheduler import ScheduledRequests

 if TYPE_CHECKING:
@@ -25,7 +27,7 @@ class CUDAGraphRunner:

     This unified class handles high-level orchestration (padding, eligibility)
     and low-level execution (capturing, resource management, replaying) for
-    multiple graphs, keyed by (batch size, draft_len).
+    multiple graphs, keyed by (batch size, draft_len, is_first_draft).
     """
     WARMUP_STEPS = 2

@@ -41,10 +43,10 @@ def __init__(self, engine: "PyTorchModelEngine"):
         self.max_beam_width = engine.max_beam_width
         self.spec_config = engine.spec_config

-        self.graphs: Dict[Tuple[int, int], torch.cuda.CUDAGraph] = {}
-        self.graph_outputs: Dict[Tuple[int, int],
+        self.graphs: Dict[Tuple[int, int, int], torch.cuda.CUDAGraph] = {}
+        self.graph_outputs: Dict[Tuple[int, int, int],
                                  Callable[[], Optional[torch.Tensor]]] = {}
-        self.graph_metadata: Dict[Tuple[int, int], Dict[str, Any]] = {}
+        self.graph_metadata: Dict[Tuple[int, int, int], Dict[str, Any]] = {}
         self.memory_pool = engine._cuda_graph_mem_pool
         self.padding_dummy_request: Optional["Request"] = None

@@ -56,7 +58,7 @@ def _create_shared_static_tensors(self):
5658 """Allocates static tensors sized for the largest possible batch."""
5759 engine = self ._get_engine ()
5860
59- token_per_request = self .draft_len + 1
61+ token_per_request = self .max_possible_draft_len + 1
6062 max_total_tokens = (self .max_supported_batch_size *
6163 self .max_beam_width * token_per_request )
6264 max_total_tokens = min (max_total_tokens , engine .max_num_tokens )
@@ -87,8 +89,23 @@ def enable_spec_decode(self):
         return self._get_engine().enable_spec_decode

     @property
-    def draft_len(self):
-        return self.spec_config.max_draft_len if self.enable_spec_decode else 0
+    def max_possible_draft_len(self):
+        engine = self._get_engine()
+        return (engine.original_max_draft_len if self.enable_spec_decode else 0)
+
+    def get_graph_key(
+            self,
+            batch_size,
+            spec_resource_manager: Optional[BaseResourceManager] = None):
+        engine = self._get_engine()
+        if engine.is_draft_model and spec_resource_manager is not None and isinstance(
+                spec_resource_manager, Eagle3ResourceManager):
+            draft_len = engine.original_max_draft_len if spec_resource_manager.is_first_draft else 0
+            key = (batch_size, draft_len, spec_resource_manager.is_first_draft)
+        else:
+            draft_len = self.spec_config.max_draft_len if self.enable_spec_decode else 0
+            key = (batch_size, draft_len, False)
+        return key

     @property
     def spec_metadata(self):
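
For orientation, here is a minimal sketch (values are illustrative, not part of this change) of the keys the new `get_graph_key` logic produces:

```python
# Hypothetical example assuming batch_size == 8 and original_max_draft_len == 3.
# Eagle3 draft model, first draft forward:    key == (8, 3, True)
# Eagle3 draft model, later draft forwards:   key == (8, 0, False)
# Target model with spec decode enabled:      key == (8, max_draft_len, False)
# Spec decode disabled:                       key == (8, 0, False)
```
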
@@ -113,21 +130,25 @@ def _get_engine(self) -> "PyTorchModelEngine":
113130 "The parent PyTorchModelEngine has been garbage collected." )
114131 return engine
115132
116- def maybe_get_cuda_graph (self , batch : ScheduledRequests ):
133+ def maybe_get_cuda_graph (
134+ self ,
135+ batch : ScheduledRequests ,
136+ spec_resource_manager : Optional [BaseResourceManager ] = None ):
117137 """
118138 Determines if the current batch can be run with a CUDA graph.
119139
120140 Returns a tuple containing:
121141 - A boolean indicating if a graph can be used.
122142 - The attn_metadata for the graph, if applicable.
123143 - The spec_metadata for the graph, if applicable.
144+ - The key for the graph.
124145 """
125146 engine = self ._get_engine ()
126147
127148 # disable when doing statistic
128149 if hasattr (engine , 'iter_counter' ) and ExpertStatistic .set_iter (
129150 engine .iter_counter ):
130- return False , None , None
151+ return False , None , None , None
131152
132153 can_run_cuda_graph = batch .can_run_cuda_graph
133154 batch_size = batch .batch_size
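
A hedged sketch of how a caller might consume the widened return value; `runner`, `forward_fn`, and `inputs` are placeholder names, only the method names and the four-element tuple come from this diff:

```python
# Hypothetical call site (names other than the runner methods are made up).
can_run, attn_metadata, spec_metadata, key = runner.maybe_get_cuda_graph(
    batch, spec_resource_manager)
if can_run:
    if runner.needs_capture(key):
        runner.capture(key, forward_fn, inputs)
    output = runner.replay(key, inputs)
else:
    output = forward_fn(inputs)  # fall back to an eager forward pass
```
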
@@ -141,22 +162,22 @@ def maybe_get_cuda_graph(self, batch: ScheduledRequests):
             for all_gen_only in all_can_graph_batch)

         if not is_all_gen_only or not all_batch_size_equal:
-            return False, None, None
+            return False, None, None, None

         if not self.enabled or not can_run_cuda_graph:
-            return False, None, None
+            return False, None, None, None
+        key = self.get_graph_key(batch_size, spec_resource_manager)

-        key = (batch_size, self.draft_len)
         if key in self.graphs:
             return True, self.graph_metadata[key][
-                "attn_metadata"], self.graph_metadata[key]["spec_metadata"]
+                "attn_metadata"], self.graph_metadata[key]["spec_metadata"], key

         if batch_size not in self.supported_batch_sizes:
-            return False, None, None
+            return False, None, None, None

         num_sequences_in_batch = batch_size * self.max_beam_width
         attn_metadata = self.attn_metadata.create_cuda_graph_metadata(
-            num_sequences_in_batch, False, self.draft_len)
+            num_sequences_in_batch, False, key[1])
         assert attn_metadata.is_cuda_graph

         if self.enable_spec_decode:
@@ -165,23 +186,25 @@ def maybe_get_cuda_graph(self, batch: ScheduledRequests):
             spec_metadata.draft_tokens = self.draft_tokens_cuda
         else:
             spec_metadata = None
-        return True, attn_metadata, spec_metadata
+        return True, attn_metadata, spec_metadata, key
+
+    def needs_capture(self, key: Tuple[int, int, int]):

-    def needs_capture(self, batch_size: int):
-        return (batch_size, self.draft_len) not in self.graph_outputs
+        return key not in self.graph_outputs

     def capture(self,
-                batch_size: int,
+                key: Tuple[int, int, int],
                 forward_fn: Callable,
                 initial_inputs: Dict[str, Any],
                 postprocess_fn: Optional[Callable] = None):
         """Captures the forward pass for a given batch size."""
         engine = self._get_engine()
-        key = (batch_size, self.draft_len)
+        batch_size = key[0]
         # [CUDA graph spec decode padding]
         # We pad input IDs/position IDs to the maximum draft length (token per request).
         # We're forced to do this because we cannot reallocate inputs over many graph runs.
-        token_per_request = self.draft_len + 1
+        max_draft_len = key[1]
+        token_per_request = max_draft_len + 1
         num_tokens_for_capture = (batch_size * self.max_beam_width *
                                   token_per_request)

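To make the padding comment above concrete, a small worked example (numbers are made up):

```python
# Suppose key == (4, 3, False) and max_beam_width == 1.
max_draft_len = 3                      # key[1]
token_per_request = max_draft_len + 1  # 4 tokens per request
num_tokens_for_capture = 4 * 1 * 4     # batch_size * beam_width * tokens = 16
# The graph is captured at this padded size because the static input tensors
# cannot be reallocated between replays.
```
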
@@ -207,30 +230,43 @@ def capture(self,
207230 "spec_metadata" : initial_inputs .get ("spec_metadata" , None ),
208231 }
209232
233+ def _setup_spec_decoding_and_forward (key : Tuple [int , int , int ],
234+ forward_fn : Callable ,
235+ capture_inputs : Dict [str , Any ]):
236+ engine = self ._get_engine ()
237+ # for the first inference of draft model, we need to set the use_spec_decoding to True when capture the graph for multiple runs.
238+ is_first_draft = key [2 ]
239+ needs_kv_cache_recompute = True if engine .enable_spec_decode and engine .spec_config .spec_dec_mode .needs_kv_cache_recompute (
240+ ) else False
241+ if is_first_draft and engine .is_draft_model and needs_kv_cache_recompute :
242+ capture_inputs ['attn_metadata' ].use_spec_decoding = True
243+ return forward_fn (capture_inputs )
244+
210245 # We have to do warm up runs to initialize PyTorch's
211246 # internal states according to the docs:
212247 # https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
213248 # This also lets us initialize states in the attn_metadata.
214249 graph = torch .cuda .CUDAGraph ()
215250 with with_multi_stream (True ), piecewise_cuda_graph (False ):
216251 for _ in range (self .WARMUP_STEPS ):
217- forward_fn (capture_inputs )
252+ _setup_spec_decoding_and_forward (key , forward_fn ,
253+ capture_inputs )
218254 if postprocess_fn is not None :
219255 postprocess_fn (capture_inputs )
220256 with torch .cuda .graph (graph , pool = self .memory_pool ):
221- output = forward_fn (capture_inputs )
257+ output = _setup_spec_decoding_and_forward (
258+ key , forward_fn , capture_inputs )
222259 if postprocess_fn is not None :
223260 postprocess_fn (capture_inputs )
224261
225262 self .graphs [key ] = graph
226263 self .graph_outputs [key ] = make_weak_ref (output )
227264 self .memory_pool = graph .pool ()
228265
229- def replay (self , batch_size : int ,
266+ def replay (self , key : Tuple [ int , int , int ] ,
230267 current_inputs : Dict [str , Any ]) -> Optional [torch .Tensor ]:
231268 """Replays a previously captured graph."""
232269 engine = self ._get_engine ()
233- key = (batch_size , self .draft_len )
234270 stored_meta = self .graph_metadata [key ]
235271 assert current_inputs ["attn_metadata" ] is stored_meta ["attn_metadata" ]
236272 if stored_meta ["spec_metadata" ] is not None :