
Commit bdb9fe2

Merge branch 'main' into user/nzmora/add_mem_logs
2 parents: 547c718 + a6d20f6

File tree

125 files changed: +8221 / -1411 lines


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -47,6 +47,9 @@ tensorrt_llm/deep_gemm/
 tensorrt_llm/deep_gemm_cpp_tllm.*.so
 tensorrt_llm/deep_gemm_cpp_tllm.pyi
 tensorrt_llm/pg_utils_bindings.*.so
+tensorrt_llm/flash_mla/
+tensorrt_llm/flash_mla_cpp_tllm.*.so
+tensorrt_llm/flash_mla_cpp_tllm.pyi
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
 docs/source/**/*.rst

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -30,3 +30,6 @@
 	path = 3rdparty/DeepGEMM
 	url = https://github.com/ruoqianguo/DeepGEMM.git
 	branch = swapab_sm100
+[submodule "3rdparty/flash-mla"]
+	path = 3rdparty/flash-mla
+	url = https://github.com/deepseek-ai/FlashMLA.git

3rdparty/flash-mla

Submodule flash-mla added at 1408756

cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ option(BUILD_TESTS "Build Google tests" ON)
 option(BUILD_BENCHMARKS "Build benchmarks" ON)
 option(BUILD_DEEP_EP "Build the Deep EP module" ON)
 option(BUILD_DEEP_GEMM "Build the DeepGEMM module" ON)
+option(BUILD_FLASH_MLA "Build the FlashMLA module" ON)
 option(BUILD_MICRO_BENCHMARKS "Build C++ micro benchmarks" OFF)
 option(NVTX_DISABLE "Disable all NVTX features" ON)
 option(WARNING_IS_ERROR "Treat all warnings as errors" OFF)

cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h

Lines changed: 125 additions & 55 deletions
@@ -17,33 +17,66 @@
 #pragma once
 
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/runtime/iTensor.h"
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
 class BlockIterator;
 
-class BlockRange
+class BlockRangeForWindow
 {
 public:
-    // C++20 std::default_sentinel_t equivalent
+    BlockRangeForWindow(BaseKVCacheManager const* cacheManager, SizeType32 windowSize, std::vector<SizeType32> blockIds,
+        runtime::ITensor::SharedPtr pool)
+        : mCacheManager(cacheManager)
+        , mWindowSize(windowSize)
+        , mBlockIds(std::move(blockIds))
+        , mPool(std::move(pool))
+    {
+    }
+
     struct Sentinel
     {
     };
 
-    static BlockRange fromAllBlockIds(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId,
-        SizeType32 beam = kFIRST_AND_ONLY_BEAM)
+    friend class BlockIterator;
+    BlockIterator begin() const;
+
+    [[nodiscard]] Sentinel end() const
+    {
+        return {};
+    }
+
+    [[nodiscard]] size_t size() const
+    {
+        return mBlockIds.size();
+    }
+
+private:
+    BaseKVCacheManager const* mCacheManager;
+    SizeType32 mWindowSize;
+    std::vector<SizeType32> mBlockIds;
+    runtime::ITensor::SharedPtr mPool;
+};
+
+class BlockRange
+{
+public:
+    static BlockRange fromAllBlockIds(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)
     {
-        assert(kFIRST_AND_ONLY_BEAM == beam);
-        auto const windowSize = firstWindowSize(cacheManager);
-        auto const blockIds = cacheManager.getSequence(requestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
-        return BlockRange(cacheManager, blockIds, requestId);
+
+        return BlockRange(cacheManager, requestId);
     }
 
     static BlockRange fromReuseTree(
         BaseKVCacheManager& cacheManager, BlockKey const& lastBlockKey, int32_t indexFromEnd)
     {
-        auto const windowSize = firstWindowSize(cacheManager);
+
+        auto poolNum = cacheManager.getNumPools();
+        TLLM_CHECK_WITH_INFO(poolNum == 1, "Reuse tree is not supported for multiple pools or variable window size");
+
+        auto windowSize = cacheManager.getBlockManager().getWindowSizesMetadata().begin()->first;
         // Find the last block in the reuse tree for the provided full sequence of block keys
         auto lastBlock = cacheManager.findBlocksInReuseTreeByBlockKey(lastBlockKey, windowSize);
         // TODO: handle the case where the last block is not found
@@ -65,78 +98,104 @@ class BlockRange
         }
         // Reverse to chronological order: oldest to newest
         std::reverse(blockIds.begin(), blockIds.end());
-        return BlockRange(cacheManager, blockIds, 0);
-    }
-
-    BlockRange(runtime::ITensor::SharedPtr pool, std::vector<SizeType32> const& blockIds) // Only used in tests
-        : mManager{nullptr}
-        , mPool{std::move(pool)}
-        , mWindowSize{0}
-        , mRequestId{0}
-        , mBlockIds{blockIds}
-    {
-        TLLM_CHECK(mPool);
+        std::unordered_map<SizeType32, std::vector<SizeType32>> blockIdsPerWindow;
+        blockIdsPerWindow[windowSize] = blockIds;
+        return BlockRange(cacheManager, blockIdsPerWindow, 0);
     }
 
-    [[nodiscard]] BlockIterator begin() const;
-
-    [[nodiscard]] Sentinel end() const
+    void setBlockIdsForWindow(SizeType32 windowSize, std::vector<SizeType32> blockIds)
     {
-        return {};
+        TLLM_CHECK_WITH_INFO(mBlockIdsPerWindow.find(windowSize) != mBlockIdsPerWindow.end(),
+            "Window size %d should exists", windowSize);
+        mBlockIdsPerWindow[windowSize] = std::move(blockIds);
     }
 
-    [[nodiscard]] size_t size() const
+    void setBlockIdsForAllWindows(std::unordered_map<SizeType32, std::vector<SizeType32>> blockIdsPerWindow)
     {
-        return mBlockIds.size();
+        for (auto const& [windowSize, blockIds] : blockIdsPerWindow)
+        {
+            TLLM_CHECK_WITH_INFO(
+                mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d should exists", windowSize);
+        }
+        mBlockIdsPerWindow = std::move(blockIdsPerWindow);
     }
 
-    [[nodiscard]] std::vector<SizeType32> const& getBlockIds() const
+    [[nodiscard]] std::unordered_map<SizeType32, std::vector<size_t>> getBlockHashesPerWindow() const
     {
-        return mBlockIds;
+        TLLM_CHECK(mManager);
+        std::unordered_map<SizeType32, std::vector<size_t>> blockHashesPerWindow;
+        auto& blockManager = mManager->getBlockManager();
+        for (auto const& [windowSize, blockIds] : mBlockIdsPerWindow)
+        {
+            for (auto const& blockId : blockIds)
+            {
+                blockHashesPerWindow[windowSize].emplace_back(
+                    blockManager.getBlockById(blockId, windowSize)->getHash());
+            }
+        }
+        return blockHashesPerWindow;
     }
 
-    void setBlockIds(std::vector<SizeType32> blockIds)
+    BlockRangeForWindow getBlockRangeForWindow(SizeType32 windowSize) const
     {
-        mBlockIds = std::move(blockIds);
+        TLLM_CHECK_WITH_INFO(
+            mPoolsPerWindow.find(windowSize) != mPoolsPerWindow.end(), "Window size %d not found", windowSize);
+        auto pool = mPoolsPerWindow.at(windowSize).front();
+        auto blockIds = mBlockIdsPerWindow.at(windowSize);
+        return BlockRangeForWindow(mManager, windowSize, std::move(blockIds), std::move(pool));
     }
 
-    void updatePoolIdx(SizeType32 poolIdx)
+    std::vector<SizeType32> getWindowSizes() const
     {
-        TLLM_CHECK(mManager);
-        mPool = mManager->getBlockManager().getPrimaryPool(poolIdx);
-        auto const newWindowSize = mManager->getBlockManager().getPoolWindowSize(poolIdx);
-        if (newWindowSize != mWindowSize)
+        std::vector<SizeType32> windowSizes;
+        for (auto const& [windowSize, _] : mPoolsPerWindow)
        {
-            mWindowSize = newWindowSize;
-            mBlockIds = mManager->getSequence(mRequestId).getCacheBlockIds(mWindowSize).at(kFIRST_AND_ONLY_BEAM);
+            windowSizes.push_back(windowSize);
         }
+        return windowSizes;
     }
 
-    friend class BlockIterator;
+    std::unordered_map<SizeType32, std::vector<SizeType32>> const& getBlockIdsPerWindow() const
+    {
+        return mBlockIdsPerWindow;
+    }
 
 private:
-    BlockRange(
-        BaseKVCacheManager const& cacheManager, std::vector<SizeType32> blockIds, LlmRequest::RequestIdType requestId)
+    BlockRange(BaseKVCacheManager const& cacheManager,
+        std::unordered_map<SizeType32, std::vector<SizeType32>> blockIdsPerWindow, LlmRequest::RequestIdType requestId)
         : mManager(&cacheManager)
-        , mPool(cacheManager.getBlockManager().getPrimaryPool(kFIRST_POOL_INDEX))
-        , mWindowSize(firstWindowSize(cacheManager))
         , mRequestId(requestId)
-        , mBlockIds(std::move(blockIds))
+        , mBlockIdsPerWindow(std::move(blockIdsPerWindow))
     {
+
+        // cacheManager.getBlockManager.getPrimaryPool(0);
+        auto poolNum = mManager->getNumPools();
+        for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
+        {
+            auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
+            mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
+        }
     }
 
-    static SizeType32 firstWindowSize(BaseKVCacheManager const& cacheManager)
+    BlockRange(BaseKVCacheManager const& cacheManager, LlmRequest::RequestIdType requestId)
+        : mManager(&cacheManager)
+        , mRequestId(requestId)
     {
-        constexpr SizeType32 FIRST_POOL_IDX = 0;
-        return cacheManager.getBlockManager().getPoolWindowSize(FIRST_POOL_IDX);
+        auto poolNum = mManager->getNumPools();
+        for (SizeType32 poolIdx = 0; poolIdx < poolNum; ++poolIdx)
+        {
+            auto windowSize = cacheManager.getBlockManager().getPoolWindowSize(poolIdx);
+            mPoolsPerWindow[windowSize].push_back(cacheManager.getBlockManager().getPrimaryPool(poolIdx));
+            mBlockIdsPerWindow[windowSize]
+                = cacheManager.getSequence(mRequestId).getCacheBlockIds(windowSize).at(kFIRST_AND_ONLY_BEAM);
+        }
     }
 
 private:
     BaseKVCacheManager const* mManager;
-    runtime::ITensor::SharedPtr mPool;
-    SizeType32 mWindowSize;
-    const LlmRequest::RequestIdType mRequestId;
-    std::vector<SizeType32> mBlockIds;
+    LlmRequest::RequestIdType const mRequestId;
+    std::unordered_map<SizeType32, std::vector<SizeType32>> mBlockIdsPerWindow;
+    std::unordered_map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> mPoolsPerWindow;
 
     static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
     static constexpr SizeType32 kFIRST_POOL_INDEX = 0;
@@ -151,7 +210,7 @@ class BlockIterator
     using reference = value_type&;
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
 
-    BlockIterator(BlockRange const* range, size_t idx)
+    BlockIterator(BlockRangeForWindow const* range, size_t idx)
         : mRange{range}
         , mIdx{idx}
     {
@@ -194,7 +253,7 @@ class BlockIterator
         return mIdx == other.mIdx && mRange == other.mRange;
     }
 
-    [[nodiscard]] bool operator==(BlockRange::Sentinel other) const
+    [[nodiscard]] bool operator==(BlockRangeForWindow::Sentinel other) const
     {
         return mIdx == mRange->mBlockIds.size();
     }
@@ -210,16 +269,27 @@ class BlockIterator
     {
         if (mIdx < mRange->mBlockIds.size())
        {
-            mCurrent = runtime::ITensor::slice(mRange->mPool, mRange->mBlockIds.at(mIdx), 1);
+            if (mRange->mCacheManager != nullptr)
+            {
+                BlockPtr const& block = mRange->mCacheManager->getBlockManager().getBlockById(
+                    mRange->mBlockIds.at(mIdx), mRange->mWindowSize);
+                TLLM_CHECK_WITH_INFO(block->isPrimary(), "cache transceiver only supports primary blocks");
+                auto const blockOffset = block->getMemoryPoolBlockIndex();
+                mCurrent = runtime::ITensor::slice(mRange->mPool, blockOffset, 1);
+            }
+            else
+            {
+                mCurrent = runtime::ITensor::slice(mRange->mPool, mRange->mBlockIds.at(mIdx), 1);
+            }
         }
     }
 
-    BlockRange const* mRange;
+    BlockRangeForWindow const* mRange;
     runtime::ITensor::SharedPtr mCurrent;
     size_t mIdx;
 };
 
-inline BlockIterator BlockRange::begin() const
+inline BlockIterator BlockRangeForWindow::begin() const
 {
     return {this, 0};
 }
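
For orientation, a minimal caller-side sketch of the per-window iteration that the reworked BlockRange / BlockRangeForWindow API enables, assuming a live BaseKVCacheManager and a valid request id; the wrapper function forEachPrimaryBlock and the way each block is consumed are illustrative only and not part of this commit.

#include "tensorrt_llm/batch_manager/kvCacheUtils.h"

namespace kvc = tensorrt_llm::batch_manager::kv_cache_manager;

// Walk every primary-pool KV-cache block of one request, grouped by attention-window size.
void forEachPrimaryBlock(kvc::BaseKVCacheManager const& cacheManager,
    tensorrt_llm::batch_manager::LlmRequest::RequestIdType requestId)
{
    // One BlockRange now spans all pools; block ids are tracked per window size internally.
    auto range = kvc::BlockRange::fromAllBlockIds(cacheManager, requestId);
    for (auto const windowSize : range.getWindowSizes())
    {
        // A BlockRangeForWindow pairs one window's block ids with that window's primary pool.
        auto const windowRange = range.getBlockRangeForWindow(windowSize);
        for (auto it = windowRange.begin(); !(it == windowRange.end()); ++it)
        {
            auto const& block = *it; // ITensor view of one primary-pool block for this window
            (void) block;            // e.g. hand the block to the cache transceiver or a host copy
        }
    }
}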

cpp/tensorrt_llm/CMakeLists.txt

Lines changed: 4 additions & 22 deletions
@@ -147,24 +147,6 @@ add_subdirectory(runtime)
 add_subdirectory(testing)
 add_subdirectory(executor_worker)
 
-if(ENABLE_CUFILE)
-  find_library(
-    CUFILE_LIBRARY cufile HINTS ${CUDAToolkit_LIBRARY_DIR}
-    /usr/lib/${TARGET_ARCH} /usr/local/lib)
-  if(NOT CUFILE_LIBRARY)
-    # FATAL_ERROR if user explicitly requests with GDS if CUDA's libcufile.so is
-    # not found.
-    message(
-      FATAL_ERROR
-        "cuFile library not found. Set -DENABLE_CUFILE=OFF if cufile isn't required."
-    )
-  else()
-    message(STATUS "Linking with cufile: ${CUFILE_LIBRARY}")
-  endif()
-else()
-  message(STATUS "ENABLE_CUFILE=OFF, skipping GDS linkage.")
-endif()
-
 set(BATCH_MANAGER_TARGET tensorrt_llm_batch_manager_static)
 set(BATCH_MANAGER_TARGET_ARCH ${TARGET_ARCH})
 add_subdirectory(batch_manager)
@@ -263,10 +245,6 @@ set_target_properties(
 
 target_link_libraries(${SHARED_TARGET} PUBLIC ${TRTLLM_LINK_LIBS})
 
-if(ENABLE_CUFILE)
-  target_link_libraries(${SHARED_TARGET} PUBLIC ${CUFILE_LIBRARY})
-endif()
-
 target_link_libraries(
   ${SHARED_TARGET}
   PRIVATE $<LINK_LIBRARY:WHOLE_ARCHIVE,${BATCH_MANAGER_TARGET}>
@@ -320,4 +298,8 @@ if(BUILD_DEEP_GEMM)
   add_subdirectory(deep_gemm)
 endif()
 
+if(BUILD_FLASH_MLA)
+  add_subdirectory(flash_mla)
+endif()
+
 add_subdirectory(plugins)

cpp/tensorrt_llm/batch_manager/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -94,10 +94,6 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
 target_compile_definitions(${BATCH_MANAGER_STATIC_TARGET}
                            PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")
 
-if(ENABLE_CUFILE)
-  target_link_libraries(${BATCH_MANAGER_STATIC_TARGET} PUBLIC ${CUFILE_LIBRARY})
-endif()
-
 if(ENABLE_UCX)
   find_package(ucx REQUIRED)
   find_package(ucxx REQUIRED)
