
Commit ec8dadc

fix: fix cuda graph padding for spec decoding (#4853)

Signed-off-by: Fanrong Li <[email protected]>

1 parent 9f45e80 · commit ec8dadc

5 files changed: 81 additions & 41 deletions


tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 4 additions & 0 deletions
@@ -268,6 +268,10 @@ def create_response(
         return LlmResponse(response,
                            self.py_result) if response is not None else None

+    @property
+    def is_dummy(self):
+        return self.is_attention_dp_dummy or self.is_cuda_graph_dummy
+

 def convert_wordlist(word_list) -> List[List[int]]:
     """Converts a wordlist from format:

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 34 additions & 38 deletions
@@ -1114,13 +1114,18 @@ def _prepare_tp_inputs(
         new_tokens_lens_device = new_tensors_device.new_tokens_lens  # [batch]
         next_draft_tokens_device = new_tensors_device.next_draft_tokens  # [batch, draft_len]

-        # Requests with draft tokens are treated like extend requests.
+        # Requests with draft tokens are treated like extend requests. Dummy extend requests should be
+        # at the end of extend_requests.
         extend_requests = []
+        extend_dummy_requests = []
         generation_requests = []
         for request in scheduled_requests.generation_requests:
             if len(request.py_draft_tokens
                    ) > 0 or next_draft_tokens_device is not None:
-                extend_requests.append(request)
+                if request.is_dummy:
+                    extend_dummy_requests.append(request)
+                else:
+                    extend_requests.append(request)
             else:
                 generation_requests.append(request)

@@ -1130,6 +1135,7 @@ def _prepare_tp_inputs(
                     torch.tensor([mrope_position_deltas],
                                  dtype=torch.int32).to('cuda',
                                                        non_blocking=True))
+        extend_requests += extend_dummy_requests

         if not self._disable_overlap_scheduler and self.is_spec_decode:
             spec_dec_mode = self.spec_config.spec_dec_mode
@@ -1139,21 +1145,18 @@
         # will contain previous batch incices of generation requests
         previous_batch_indices = []
         previous_pos_indices = []
-        request_ids_with_previous_batch = []
-        num_extend_reqs_wo_previous_batch = 0
         for request in extend_requests:
-            if next_draft_tokens_device is None or request.py_batch_idx is None:
-                # the request has no previous device tensors:
-                # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or
-                # (2) request.py_batch_idx is None, which means the request has no previous batch.
-                # the second condition includes dummy generation requests created for CUDA graph padding or
-                # attention DP. These dummy generation requests should be at the head of generation_requests.
-                # TODO: move the dummy generation requests to the end of generation_requests to align with
-                # the logic for those requests in generation_requests.
-                # get token ids, including input token ids and draft token ids
-                input_ids.append(request.get_last_tokens(0))
-                input_ids.extend(request.py_draft_tokens)
-                draft_tokens.extend(request.py_draft_tokens)
+            # the request has no previous tensor:
+            # (1) next_draft_tokens_device is None, which means overlap scheduler is disabled; or
+            # (2) a dummy request; or
+            # (3) the first step in the generation server of disaggregated serving
+            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
+                # get token ids, including input token ids and draft token ids. For these dummy requests,
+                # no need to copy the token ids.
+                if not request.is_dummy:
+                    input_ids.append(request.get_last_tokens(0))
+                    input_ids.extend(request.py_draft_tokens)
+                    draft_tokens.extend(request.py_draft_tokens)
                 # get other ids and lengths
                 num_draft_tokens = len(request.py_draft_tokens)
                 past_seen_token_num = request.max_beam_num_tokens - 1
@@ -1173,7 +1176,6 @@
                 # update batch index
                 request.py_batch_idx = batch_idx
                 batch_idx += 1
-                num_extend_reqs_wo_previous_batch += 1
             else:
                 # update batch index
                 previous_batch_idx = request.py_batch_idx
@@ -1200,10 +1202,7 @@
                 num_cached_tokens_per_seq.append(past_seen_token_num +
                                                  self.max_draft_len + 1)
                 prompt_lengths.append(request.py_prompt_len)
-                request_ids_with_previous_batch.append(request.py_request_id)
-
-        # move requests with previous batch to the end of the list
-        request_ids.extend(request_ids_with_previous_batch)
+            request_ids.append(request.py_request_id)

         sequence_lengths.extend([1] * len(generation_requests))
         gather_ids.extend(
@@ -1238,6 +1237,7 @@
         num_tokens = len(input_ids)
         num_draft_tokens = len(draft_tokens)
         previous_batchs = len(previous_batch_indices)
+        num_requests = len(request_ids)
         # if exist requests that do not have previous batch, copy input_ids and draft_tokens
         if num_tokens > 0:
             input_ids = torch.tensor(input_ids,
@@ -1276,31 +1276,27 @@
                                                   non_blocking=True)
                 # prepare data for the preprocess inputs
                 kv_len_offsets_device = new_tokens_lens_device - self.max_draft_len - 1
-                pre_tokens_start_idx = num_extend_reqs_wo_previous_batch * (
-                    1 + self.max_draft_len)
-                pre_tokens_end_idx = pre_tokens_start_idx + previous_batch_tokens
-                pre_batch_start_idx = num_extend_reqs_wo_previous_batch
-                pre_batch_end_idx = pre_batch_start_idx + previous_batchs
                 previous_pos_indices = torch.tensor(previous_pos_indices,
                                                     dtype=torch.int,
                                                     pin_memory=True)
-                self.previous_pos_indices_cuda[
-                    pre_tokens_start_idx:pre_tokens_end_idx].copy_(
-                        previous_pos_indices, non_blocking=True)
+                self.previous_pos_indices_cuda[0:previous_batch_tokens].copy_(
+                    previous_pos_indices, non_blocking=True)
                 self.previous_pos_id_offsets_cuda[
-                    pre_tokens_start_idx:pre_tokens_end_idx].copy_(
+                    0:previous_batch_tokens].copy_(
                         new_tokens_lens_device[self.previous_pos_indices_cuda[
-                            pre_tokens_start_idx:pre_tokens_end_idx]],
-                        non_blocking=True)
-                self.previous_kv_lens_offsets_cuda[
-                    pre_batch_start_idx:pre_batch_end_idx].copy_(
-                        kv_len_offsets_device[
-                            self.previous_batch_indices_cuda[:previous_batchs]],
+                            0:previous_batch_tokens]],
                         non_blocking=True)
+                self.previous_kv_lens_offsets_cuda[0:previous_batchs].copy_(
+                    kv_len_offsets_device[
+                        self.previous_batch_indices_cuda[:previous_batchs]],
+                    non_blocking=True)
                 # for the requests that do not have previous batch, set the previous_pos_id_offsets and
                 # previous_kv_lens_offsets to zeros to skip the value changes in _preprocess_inputs
-                self.previous_pos_id_offsets_cuda[:pre_tokens_start_idx] *= 0
-                self.previous_kv_lens_offsets_cuda[:pre_batch_start_idx] *= 0
+                self.previous_pos_id_offsets_cuda[
+                    previous_batch_tokens:num_requests *
+                    (1 + self.max_draft_len)] *= 0
+                self.previous_kv_lens_offsets_cuda[
+                    previous_batchs:num_requests] *= 0
             else:
                 # change the data to zeros to skip the value changes in _preprocess_inputs
                 self.previous_pos_id_offsets_cuda *= 0
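
Because dummy extend requests now sit at the end of extend_requests, the requests that do have a previous batch pack the front of the preallocated buffers, and the old pre_tokens_start_idx / pre_batch_start_idx bookkeeping collapses to "fill [0:previous_batch_tokens], then zero the tail up to num_requests * (1 + max_draft_len)". A simplified CPU-only sketch of that fill-then-zero layout (illustrative sizes and stand-in values; the buffer names are merely patterned after previous_pos_id_offsets_cuda and previous_kv_lens_offsets_cuda, this is not the engine code):

# Simplified CPU-only sketch of the new buffer layout; sizes and values are illustrative.
import torch

max_draft_len = 2                   # draft tokens per request
tokens_per_req = 1 + max_draft_len  # one accepted token + draft tokens
max_batch = 8                       # capacity of the preallocated buffers

# Preallocated buffers (device buffers in the real engine, CPU tensors here).
previous_pos_id_offsets = torch.full((max_batch * tokens_per_req,), -1, dtype=torch.int32)
previous_kv_lens_offsets = torch.full((max_batch,), -1, dtype=torch.int32)

# This step's batch: 3 requests with a previous batch, then 2 without (e.g. CUDA graph dummies).
previous_batchs = 3
num_requests = 5
previous_batch_tokens = previous_batchs * tokens_per_req

# 1) Requests with a previous batch occupy the front of the buffers.
previous_pos_id_offsets[0:previous_batch_tokens] = torch.arange(
    previous_batch_tokens, dtype=torch.int32)  # stand-in for the real copy_ from device data
previous_kv_lens_offsets[0:previous_batchs] = torch.tensor([1, 2, 0], dtype=torch.int32)

# 2) Requests without a previous batch (including dummies) get zeros, so
#    _preprocess_inputs leaves their values untouched.
previous_pos_id_offsets[previous_batch_tokens:num_requests * tokens_per_req] *= 0
previous_kv_lens_offsets[previous_batchs:num_requests] *= 0

print(previous_pos_id_offsets[:num_requests * tokens_per_req].tolist())
# -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0]
print(previous_kv_lens_offsets[:num_requests].tolist())
# -> [1, 2, 0, 0, 0]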

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 39 additions & 2 deletions
@@ -568,16 +568,53 @@ def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
             task.evaluate(llm)

     @pytest.mark.skip_device_not_contain(["H100"])
-    def test_fp8_block_scales_cuda_graph_padding(self):
+    @parametrize_with_ids("mtp_nextn", [0, 2])
+    def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         pytorch_config = PyTorchConfig(disable_overlap_scheduler=False,
                                        use_cuda_graph=True,
                                        cuda_graph_max_batch_size=512,
                                        cuda_graph_padding_enabled=True)
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                   kv_cache_config=kv_cache_config,
-                  pytorch_backend_config=pytorch_config)
+                  pytorch_backend_config=pytorch_config,
+                  speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        with llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_device_not_contain(["H100", "H200"])
+    @parametrize_with_ids("mtp_nextn", [0, 2])
+    @parametrize_with_ids("attention_dp", [False, True])
+    def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
+                                                       attention_dp):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        pytorch_config = PyTorchConfig(
+            disable_overlap_scheduler=False,
+            use_cuda_graph=True,
+            cuda_graph_padding_enabled=True,
+        )
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
+                  tensor_parallel_size=4,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
         assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
         with llm:
             task = CnnDailymail(self.MODEL_NAME)

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 2 additions & 0 deletions
@@ -101,6 +101,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 1 deletion
@@ -51,7 +51,8 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=fp8-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=0]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_tp1_single_gpu[DeepSeek-V3-Lite-fp8]
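
For context, the bracketed suffixes in the entries above (e.g. [mtp_nextn=2] or [attention_dp=True-mtp_nextn=0]) are pytest parametrization IDs generated from the new test parameters. A rough stand-alone approximation using plain pytest.mark.parametrize (the repo's parametrize_with_ids helper is assumed to produce IDs of this shape; the test body here is only a placeholder):

# Rough approximation of how the parametrized IDs in the test lists are formed.
import pytest


def _ids(name, values):
    # Render "name=value" IDs, e.g. mtp_nextn=0 or attention_dp=True.
    return [f"{name}={v}" for v in values]


class TestDeepSeekV3LiteSketch:

    @pytest.mark.parametrize("mtp_nextn", [0, 2], ids=_ids("mtp_nextn", [0, 2]))
    def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
        # Placeholder body; the real test builds an LLM with an optional
        # MTPDecodingConfig and runs accuracy tasks.
        assert mtp_nextn in (0, 2)


# Collected node IDs then look like:
#   test_fp8_block_scales_cuda_graph_padding[mtp_nextn=0]
#   test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2]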
