
Commit 233e38a

Add support for KVCache reuse for DSAv32
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent e77a939 commit 233e38a

3 files changed, +4 -15 lines changed


cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 1 addition & 8 deletions
@@ -876,14 +876,7 @@ void WindowBlockManager::allocatePools(bool useUvm)
         }

         nvinfer1::Dims cacheShape;
-        if (pool.containsIndexerKCache)
-        {
-            cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, blockSize});
-        }
-        else
-        {
-            cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});
-        }
+        cacheShape = ITensor::makeShape({mNumPrimaryBlocks, pool.numLayers, mKVFactor, blockSize});

         TLLM_LOG_DEBUG("[%s] Allocating primary pool with %d blocks for %d layers with %d kv heads", mLogPrefix.c_str(),
             mNumPrimaryBlocks, pool.numLayers, pool.numKvHeads);

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 2 additions & 6 deletions
@@ -972,7 +972,7 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
             and host_cached_tokens.sum().item() > 0
             and metadata.runtime_features.chunked_prefill)

-        if has_mla_chunked_prefill:
+        if has_mla_chunked_prefill or metadata.kv_cache_manager.enable_block_reuse:
             # MLA chunked prefill mode: prepare single indexer chunk for current MLA chunk
             # The MLA has already split the sequence, we just process what's given
             chunk_specs = [(i, 0, host_seq_lens[i].item(),
@@ -1009,7 +1009,7 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):

         # Compute causal attention bounds accounting for cached KV tokens
         # For chunked prefill: Q has new tokens, K has cached + new tokens
-        if has_mla_chunked_prefill:
+        if has_mla_chunked_prefill or metadata.kv_cache_manager.enable_block_reuse:
             # Chunked prefill mode: adjust bounds for cached KV
             host_cu_seqlen_ks, host_cu_seqlen_ke = compute_cu_seqlen_kv_bounds_with_cache(
                 host_seq_lens, host_cached_tokens, num_contexts,
@@ -1639,10 +1639,6 @@ def __init__(
         sparse_attn_config: "SparseAttentionConfig",
         **kwargs,
     ) -> None:
-
-        if kv_cache_config.enable_block_reuse:
-            raise NotImplementedError(
-                "DSA indexer K-cache manager does not support block reuse yet")
         self.quant_block_size = 128
         self.index_head_dim = sparse_attn_config.index_head_dim
         # Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
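For context on the bounds adjustment above: with block reuse enabled, a context request may begin with KV tokens that are already cached, so the K range for that request must span cached plus new tokens while Q covers only the new tokens. The sketch below illustrates that bookkeeping with a hypothetical helper and simplified per-request outputs; the actual compute_cu_seqlen_kv_bounds_with_cache may take additional arguments and produce per-token bounds.

import torch

def cu_seqlen_kv_bounds_with_cache(seq_lens: torch.Tensor,
                                    cached_tokens: torch.Tensor,
                                    num_contexts: int):
    """Per-request K start/end offsets when cached KV precedes the new tokens (illustrative)."""
    # Total K length per context request = reused (cached) tokens + newly appended tokens.
    total_kv = cached_tokens[:num_contexts] + seq_lens[:num_contexts]
    # Exclusive prefix sum gives each request's starting offset in the packed K layout.
    ks = torch.zeros(num_contexts, dtype=seq_lens.dtype)
    ks[1:] = torch.cumsum(total_kv, dim=0)[:-1]
    # The causal end bound covers all cached tokens plus the request's new tokens.
    ke = ks + total_kv
    return ks, ke

# Example: request 0 reuses 64 cached tokens and appends 16 new ones;
# request 1 has no cache hit and appends 32 new tokens.
print(cu_seqlen_kv_bounds_with_cache(torch.tensor([16, 32]), torch.tensor([64, 0]), 2))
# (tensor([ 0, 80]), tensor([ 80, 112]))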

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 1 addition & 1 deletion
@@ -2490,7 +2490,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                 "MOE TRTLLM backend does not support SM version 120 or 121")

         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+        kv_cache_config = KvCacheConfig(enable_block_reuse=True,
                                         free_gpu_memory_fraction=0.7,
                                         tokens_per_block=64)
         cuda_graph_config = CudaGraphConfig(
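The test now exercises block reuse for the DSA path by flipping enable_block_reuse to True. For reference, a minimal sketch of passing the same configuration through the LLM API; the import paths and the placeholder model path are assumptions and may differ by TensorRT-LLM version.

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Enable KV-cache block reuse, mirroring the updated test configuration.
kv_cache_config = KvCacheConfig(enable_block_reuse=True,
                                free_gpu_memory_fraction=0.7,
                                tokens_per_block=64)

# "<model-checkpoint-path>" is a placeholder; substitute an actual checkpoint directory.
llm = LLM(model="<model-checkpoint-path>", kv_cache_config=kv_cache_config)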
