
Commit b10137f

[None][feat] Support MLA chunked prefill for DeepSeek V3.2 model (NVIDIA#9376)
Signed-off-by: Chang Liu (Enterprise Products) <[email protected]>
1 parent 1bf2d75 commit b10137f

File tree: 8 files changed, +751 -127 lines changed

cpp/tensorrt_llm/kernels/mlaKernels.cu

Lines changed: 23 additions & 9 deletions
@@ -230,15 +230,22 @@ __global__ void applyMLARopeAndAssignQKVKernelOptContext(T* q_ptr, T* q_pe, T* k

     int const global_token_offset = cu_q_seqlens[batch_idx];
     int const cache_seq_len = kv_cache_lengths[batch_idx];
-    int token_idx_in_kv_cache = local_token_idx;
-    bool const valid_token = token_idx_in_kv_cache < cache_seq_len;
+
+    // Derive cached offset and current input length
+    int const current_seq_len = cu_q_seqlens[batch_idx + 1] - global_token_offset;
+    int const cached_offset = cache_seq_len - current_seq_len;
+
+    int token_idx_in_kv_cache = local_token_idx + cached_offset;
+    // Check against BOTH total cache length (valid slot) AND input length (valid read)
+    bool const valid_token = (token_idx_in_kv_cache < cache_seq_len) && (local_token_idx < current_seq_len);
+
     // Limit the token_idx to cache seq length (we need all threads in this block to be involved).
     token_idx_in_kv_cache = std::min(token_idx_in_kv_cache, cache_seq_len - 1);
-    local_token_idx = std::min(local_token_idx, cache_seq_len - 1);
-    int const global_token_idx = local_token_idx + global_token_offset;
+    int const safe_local_token_idx = std::min(local_token_idx, current_seq_len - 1);
+    int const global_token_idx = safe_local_token_idx + global_token_offset;

     auto const position_id
-        = helix_position_offsets ? helix_position_offsets[global_token_idx] : local_token_idx;
+        = helix_position_offsets ? helix_position_offsets[global_token_idx] : token_idx_in_kv_cache;
     float2 const* rotary_coef_cache_buffer
         = cos_sin_cache + static_cast<size_t>(ROPE_DIM) * position_id + (head_dim_idx / 2);

@@ -317,12 +324,19 @@ __global__ void applyMLARopeAndAssignQKVKernelOptContext(T* q_ptr, T* q_pe, T* k

     int const global_token_offset = cu_q_seqlens[batch_idx];
     int const cache_seq_len = kv_cache_lengths[batch_idx];
-    int token_idx_in_kv_cache = local_token_idx;
-    bool const valid_token = token_idx_in_kv_cache < cache_seq_len;
+
+    // Derive cached offset and current input length (same as first loop)
+    int const current_seq_len = cu_q_seqlens[batch_idx + 1] - global_token_offset;
+    int const cached_offset = cache_seq_len - current_seq_len;
+
+    int token_idx_in_kv_cache = local_token_idx + cached_offset;
+    // Check against BOTH total cache length (valid slot) AND input length (valid read)
+    bool const valid_token = (token_idx_in_kv_cache < cache_seq_len) && (local_token_idx < current_seq_len);
+
     // Limit the token_idx to cache seq length (we need all threads in this block to be involved).
     token_idx_in_kv_cache = std::min(token_idx_in_kv_cache, cache_seq_len - 1);
-    local_token_idx = std::min(local_token_idx, cache_seq_len - 1);
-    int const global_token_idx = local_token_idx + global_token_offset;
+    int const safe_local_token_idx = std::min(local_token_idx, current_seq_len - 1);
+    int const global_token_idx = safe_local_token_idx + global_token_offset;

     if (valid_token)
     {
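
The heart of the change: with chunked prefill, a chunk's tokens no longer start at KV-cache position 0, so the kernel shifts each token by the number of tokens cached from earlier chunks and takes the RoPE position from that absolute cache position. Below is a minimal host-side sketch (Python, with hypothetical sequence lengths) of the same index arithmetic; the real logic runs per thread inside the CUDA kernel.

# Hypothetical single-request batch: 96 tokens were prefilled by earlier chunks
# and the current chunk adds 32 more, so the KV cache now spans 128 tokens.
cu_q_seqlens = [0, 32]      # cumulative query lengths for the current chunk
kv_cache_lengths = [128]    # cached tokens (96) + current chunk (32)

batch_idx = 0
global_token_offset = cu_q_seqlens[batch_idx]
cache_seq_len = kv_cache_lengths[batch_idx]

# Derive the chunk length and how many tokens are already cached.
current_seq_len = cu_q_seqlens[batch_idx + 1] - global_token_offset  # 32
cached_offset = cache_seq_len - current_seq_len                      # 96

for local_token_idx in range(current_seq_len):
    # Each new token lands after the cached prefix (slots 96..127 here).
    token_idx_in_kv_cache = local_token_idx + cached_offset
    # Valid only if it fits in the cache AND reads a real input token.
    valid_token = (token_idx_in_kv_cache < cache_seq_len
                   and local_token_idx < current_seq_len)
    # Without helix offsets, RoPE now uses the absolute cache position, so the
    # second chunk continues at position 96 instead of restarting at 0.
    position_id = token_idx_in_kv_cache
    assert valid_token and 96 <= position_id < 128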

examples/llm-api/llm_sparse_attention.py

Lines changed: 6 additions & 0 deletions
@@ -121,6 +121,10 @@ def parse_arguments():
                         nargs='+',
                         type=int,
                         default=None)
+    parser.add_argument('--enable_chunked_prefill',
+                        default=False,
+                        action='store_true',
+                        help='Enable chunked prefill')
     args = parser.parse_args()
     return args

@@ -136,6 +140,7 @@ def run_llm(args, sparse_attention_config):
         False,  # sparse attention does not support kv cache reuse now
         free_gpu_memory_fraction=args.kv_cache_fraction,
         dtype=args.kv_cache_dtype,
+        tokens_per_block=64,
     )

     cuda_graph_config = CudaGraphConfig(
@@ -159,6 +164,7 @@
         print_iter_log=args.print_iter_log,
         enable_iter_perf_stats=args.print_iter_log,
         moe_config=MoeConfig(backend=args.moe_backend),
+        enable_chunked_prefill=args.enable_chunked_prefill,
     )

     prompts = []
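
Taken together, the example needs only two LLM API settings for chunked prefill: a 64-token KV-cache block size and the enable_chunked_prefill switch. A minimal sketch of that configuration, assuming the tensorrt_llm LLM API used by this example (the checkpoint path and memory fraction below are placeholders):

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Mirror the example's settings relevant to chunked prefill.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,      # sparse attention does not support KV cache reuse
    free_gpu_memory_fraction=0.7,  # placeholder fraction
    tokens_per_block=64,           # block size the example now pins to 64
)

llm = LLM(
    "DeepSeek-V3.2-Exp",           # hypothetical local checkpoint path
    kv_cache_config=kv_cache_config,
    enable_chunked_prefill=True,   # what --enable_chunked_prefill toggles
)

From the command line, the same effect comes from passing --enable_chunked_prefill to llm_sparse_attention.py alongside its existing arguments.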

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 232 additions & 114 deletions
Large diffs are not rendered by default.

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 62 additions & 0 deletions
@@ -2530,6 +2530,68 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)

+    @pytest.mark.skip_less_mpi_world_size(8)
+    @pytest.mark.skip_less_device(8)
+    @skip_pre_blackwell
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [
+            (8, 1, 8, 0, True, True, True, True, 32, "CUTLASS"),
+            (8, 1, 8, 3, False, False, True, True, 1, "TRTLLM"),
+        ],
+        ids=["baseline_fp8kv", "latency"])
+    def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
+                                              mtp_nextn, fp8kv, attention_dp,
+                                              cuda_graph, overlap_scheduler,
+                                              max_batch_size, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
+        moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.7,
+                                        tokens_per_block=64)
+        cuda_graph_config = CudaGraphConfig(
+            enable_padding=True,
+            max_batch_size=max_batch_size) if cuda_graph else None
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=cuda_graph_config,
+            moe_config=moe_config,
+        )
+
+        if fp8kv:
+            kv_cache_config.dtype = "fp8"
+        mtp_config = None
+        if mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        with LLM(f"{llm_models_root()}/DeepSeek-V3.2-Exp-FP4-v2",
+                 max_batch_size=max_batch_size,
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 kv_cache_config=kv_cache_config,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 speculative_config=mtp_config,
+                 enable_chunked_prefill=True,
+                 max_num_tokens=512) as llm:
+
+            # GPQA Diamond takes too long to run, we enable it only for fp8kv.
+            if fp8kv:
+                task = GPQADiamond(self.MODEL_NAME)
+                task.evaluate(llm,
+                              extra_evaluator_kwargs=dict(
+                                  apply_chat_template=True,
+                                  chat_template_kwargs=dict(thinking=True)))
+            else:
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = GSM8K(self.MODEL_NAME)
+                task.evaluate(llm)
+

 @skip_pre_blackwell
 class TestGLM4_6(LlmapiAccuracyTestHarness):
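
In short, the new test reuses the NVFP4 multi-GPU DeepSeek V3.2 setup but prefills in 512-token chunks (enable_chunked_prefill=True with max_num_tokens=512) on 64-token KV-cache blocks, evaluating GPQA Diamond for the baseline_fp8kv variant and MMLU plus GSM8K for the latency variant.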

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 2 additions & 0 deletions
@@ -501,6 +501,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baselin
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
 accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 2 additions & 0 deletions
@@ -56,6 +56,8 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baselin
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=0-moe_backend=WIDEEP]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 1 addition & 0 deletions
@@ -123,6 +123,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:
