
Commit bd0a47a

Merge branch 'main' into model_loader
2 parents 41291db + 9eb8084

File tree: 33 files changed (+1776, -1194 lines)


cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 23 additions & 15 deletions
@@ -549,6 +549,9 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     ActivationType mActType = ActivationType::Relu;
 
     constexpr static int64_t NUM_BUFFERS = 32;
+    int64_t mNumWorkspaceBuffers = NUM_BUFFERS;
+    int64_t mNumInputBuffers = NUM_BUFFERS;
+    int64_t mNumGemmProfilerBuffers = NUM_BUFFERS;
 
     std::array<QuantParams, NUM_BUFFERS> mQuantParams{};
     bool mUseLora = false;
@@ -619,12 +622,12 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
         if (gemm_to_profile == GemmToProfile::LAYER)
         {
-
             mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK,
                 mActType, parallelism_config, mUseLora, /*use_deepseek_fp8_block_scale=*/false,
                 /*min_latency_mode=*/false, mUsePrequantScale);
 
-            mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
+            mNumWorkspaceBuffers = mWorkspaceSize > 1024 * 1024 * 1024 ? 2 : NUM_BUFFERS;
+            mWorkspace = allocBuffer<char>(mWorkspaceSize * mNumWorkspaceBuffers);
 
             mExpertBias1 = nullptr;
             mExpertBias2 = nullptr;
@@ -690,9 +693,10 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mScaleProbsSize = padSize(mTotalTokens * mK);
         mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
         mInputTensorSize = padSize(mTotalTokens * mHiddenSize);
-        mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
+        mNumInputBuffers = mInputTensorSize > 1024 * 1024 * 1024 ? 2 : NUM_BUFFERS;
+        mInputTensor = allocBuffer<DataType>(mInputTensorSize * mNumInputBuffers);
         mFinalOutputSize = padSize(mTotalTokens * mHiddenSize);
-        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);
+        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * mNumInputBuffers);
 
         mSourceToExpandedMapSize = padSize(mTotalTokens * mK);
         mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);
@@ -732,10 +736,11 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             = std::max(mGemmProfilerWorkspaceSize, mGemmProfilerBackend.getWorkspaceSize(mTotalTokens));
         }
 
-        int64_t num_gemm_buffers = gemm_to_profile == GemmToProfile::LAYER ? 1 : NUM_BUFFERS;
         mGemmProfilerWorkspaceSize = padSize(mGemmProfilerWorkspaceSize);
+        mNumGemmProfilerBuffers = mGemmProfilerWorkspaceSize > 1024 * 1024 * 1024 ? 2 : NUM_BUFFERS;
+        mNumGemmProfilerBuffers = gemm_to_profile == GemmToProfile::LAYER ? 1 : mNumGemmProfilerBuffers;
         mGemmProfilerWorkspace = mGemmProfilerWorkspaceSize > 0
-            ? allocBuffer<char>(mGemmProfilerWorkspaceSize * num_gemm_buffers)
+            ? allocBuffer<char>(mGemmProfilerWorkspaceSize * mNumGemmProfilerBuffers)
             : nullptr;
 
         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
@@ -748,7 +753,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mGemmProfilerBackend.mGemmToProfile = static_cast<GemmProfilerBackend::GemmToProfile>(gemm_to_profile);
         auto* expert_weights = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1 : mExpertWeight2;
         auto expert_weights_size = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size;
-        mGemmProfilerBackend.prepare(mTotalTokens, mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * mBufferIndex,
+        mGemmProfilerBackend.prepare(mTotalTokens,
+            mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * (mBufferIndex % mNumGemmProfilerBuffers),
            /*expert_weights=*/expert_weights + expert_weights_size * mBufferIndex, streamPtr->get());
     }
 
@@ -865,7 +871,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         }
 
         // Profile all samples or for 1 sec
-        int const max_iters = mGemmProfilerBackend.NUM_ROUTING_SAMPLES;
+        int const max_iters = mGemmProfilerBackend.NUM_ROUTING_SAMPLES * 2;
         float const max_time_ms = 1000.f;
 
         float time = 0.f;
@@ -974,7 +980,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
            }
            mGemmProfilerBackend.mSampleIndex = mBufferIndex % mGemmProfilerBackend.NUM_ROUTING_SAMPLES;
            mGemmProfilerBackend.runProfiler(mTotalTokens, tactics,
-               mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * mBufferIndex,
+               mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * (mBufferIndex % mNumGemmProfilerBuffers),
               /*expert_weights=*/expert_weights + expert_weights_size * mBufferIndex, streamPtr->get());
            break;
        }
@@ -983,26 +989,28 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
        auto stream = streamPtr->get();
        MoeMinLatencyParams min_latency_params;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
-       mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, true,
+       mMoERunner.runMoe(mInputTensor + mInputTensorSize * (mBufferIndex % mNumInputBuffers), nullptr, true,
           mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
           mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
           mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
           ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
           mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
-          mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
-          mFinalOutput + mFinalOutputSize * mBufferIndex,
+          mHiddenSize, mInterSize, mNumExperts, mK,
+          mWorkspace + mWorkspaceSize * (mBufferIndex % mNumWorkspaceBuffers),
+          mFinalOutput + mFinalOutputSize * (mBufferIndex % mNumInputBuffers),
           mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
          /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
          /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #else
-       mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr, true,
+       mMoERunner.runMoe(mInputTensor + mInputTensorSize * (mBufferIndex % mNumInputBuffers), nullptr, true,
          mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
          mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
          mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
         ActivationParams(mActType), mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
         mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
-        mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
-        mFinalOutput + mFinalOutputSize * mBufferIndex,
+        mHiddenSize, mInterSize, mNumExperts, mK,
+        mWorkspace + mWorkspaceSize * (mBufferIndex % mNumWorkspaceBuffers),
+        mFinalOutput + mFinalOutputSize * (mBufferIndex % mNumInputBuffers),
        mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
       /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
       /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
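
The pattern in the hunks above: the benchmark normally rotates through NUM_BUFFERS (32) copies of each tensor, presumably so consecutive iterations do not reuse the same memory, but when a single workspace, input/output, or profiler buffer would exceed 1 GiB the ring is capped at 2 copies and the per-iteration offset wraps with `mBufferIndex % mNum...Buffers`. A minimal sketch of that capping rule follows; the `BufferRing` type and its members are illustrative only, not the fixture's real API.

// Sketch only: capped buffer ring as assumed from the diff above.
#include <cstdint>
#include <vector>

constexpr int64_t kNumBuffers = 32;                      // mirrors NUM_BUFFERS
constexpr int64_t kCapThreshold = 1024LL * 1024 * 1024;  // 1 GiB per buffer

struct BufferRing
{
    int64_t sizePerBuffer;
    int64_t numBuffers;
    std::vector<char> storage;

    explicit BufferRing(int64_t sizeBytes)
        : sizePerBuffer(sizeBytes)
        , numBuffers(sizeBytes > kCapThreshold ? 2 : kNumBuffers) // cap the ring for huge buffers
        , storage(static_cast<size_t>(sizePerBuffer * numBuffers))
    {
    }

    // The iteration counter keeps running past numBuffers; the modulo wraps it onto the ring.
    char* slot(int64_t bufferIndex)
    {
        return storage.data() + sizePerBuffer * (bufferIndex % numBuffers);
    }
};

Buffers that stay small (biases, selected experts, routing maps, and so on) keep the full 32-slot ring and are still indexed with `mBufferIndex` directly, so only the oversized allocations pay the reduced rotation.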

cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp

Lines changed: 65 additions & 44 deletions
@@ -37,12 +37,39 @@
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
+int getBlockNumAccountingForCP(int cpRank, int cpSize, int numTotalBlocks, bool strict)
+{
+    TLLM_CHECK(cpRank >= 0 && cpRank < cpSize);
+    if (cpSize == 1)
+    {
+        return numTotalBlocks;
+    }
+    // NOTE: Non-strict mode may over-allocate blocks when numTotalBlocks is not divisible by cpSize.
+    // This is a known limitation and will be addressed in a future MR.
+    if (!strict)
+    {
+        // Simple ceiling division.
+        return (numTotalBlocks + cpSize - 1) / cpSize;
+    }
+    // In strict mode, blocks are distributed among CP ranks in a round-robin fashion, as evenly as possible.
+    // When the number of blocks is not divisible by cpSize, the remainder is distributed among the
+    // lowest-indexed CP ranks (the overflow ranks).
+    int numBlocksCurrRank = numTotalBlocks / cpSize;
+    if (numTotalBlocks % cpSize > cpRank)
+    {
+        numBlocksCurrRank++;
+    }
+    return numBlocksCurrRank;
+}
+
 // some context rank in connection
 std::vector<size_t> MLACacheFormatter::pickRecvConnections(
     size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
 {
 
     auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
+    // This function is called only by the gen side, and we only support CPSize=1 on the context side.
+    TLLM_CHECK(targetInfo.mDomainCPSize == 1);
     TLLM_CHECK(numConnections == targetInfo.mIRanks.size());
     std::vector<size_t> ret;
     // targetInfo, mRanks [tpranks, dpranks]
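
To make the two distribution modes concrete, here is a small standalone check of the function as implemented above, with hand-computed values; it assumes the declaration added in mlaCacheFormatter.h is visible and the object file is linked in.

#include <cassert>

// Declaration as added in mlaCacheFormatter.h.
int getBlockNumAccountingForCP(int cpRank, int cpSize, int numTotalBlocks, bool strict);

int main()
{
    // Strict mode: 10 blocks over 4 ranks -> 3, 3, 2, 2 (remainder goes to the lowest-indexed ranks; total 10).
    assert(getBlockNumAccountingForCP(/*cpRank=*/0, /*cpSize=*/4, /*numTotalBlocks=*/10, /*strict=*/true) == 3);
    assert(getBlockNumAccountingForCP(1, 4, 10, true) == 3);
    assert(getBlockNumAccountingForCP(2, 4, 10, true) == 2);
    assert(getBlockNumAccountingForCP(3, 4, 10, true) == 2);

    // Non-strict mode: ceiling division gives every rank 3 blocks (total 12, over-allocating by 2),
    // which is the known limitation noted in the comment above.
    assert(getBlockNumAccountingForCP(0, 4, 10, /*strict=*/false) == 3);
    assert(getBlockNumAccountingForCP(3, 4, 10, /*strict=*/false) == 3);
    return 0;
}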
@@ -97,14 +124,11 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& session,
     auto& bufferManager = session.getBufferManager();
     TLLM_CHECK_WITH_INFO(llmRequest.mSamplingConfig.beamWidth == 1, "Currently only supports beam width 1.");
     TLLM_CHECK(!connections.empty());
-    // diff start
     if (!needSendCache(selfConfig, destConfig, selfIdx))
     {
         return;
     }
 
-    // diff end
-
     auto const numPools = mCacheManager->getBlockManager().getNumPools();
     auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest);
 
@@ -147,43 +171,48 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& session,
         return;
     }
 
-    auto cacheBlockSize = inputKvCacheBlocks.at(0)->getSize();
-
-    auto cacheBufferId = mCacheTransBufferManager->assignBufferIndexForSend();
-    // diff start
-
     auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
-    auto ppRank = selfIdx
-        / (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
-    int selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
     size_t pPDomainSize = targetInfo.mDomainPPSize;
+    size_t cPDomainSize = targetInfo.mDomainCPSize;
+
     auto getBufferSizeForTarget = [&]()
     {
-        std::vector<size_t> bufferSizeForTarget(pPDomainSize, 0);
-        size_t cacheSizePerLayer = cacheBlockSize * blockNum / selfAttentionLayerNum;
-        for (size_t i = 0; i < pPDomainSize; i++)
+        auto const ppRank = selfIdx
+            / (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
+        auto const selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
+        auto const cacheBlockSize = inputKvCacheBlocks.at(0)->getSize();
+        auto const blockSizePerLayer = cacheBlockSize / selfAttentionLayerNum;
+        std::vector<size_t> bufferSizeForTarget(pPDomainSize * cPDomainSize, 0);
+        for (size_t ppDomainIdx = 0; ppDomainIdx < pPDomainSize; ppDomainIdx++)
         {
-            auto layerNum = targetInfo.getPeerPPDomainLayerNum(i);
-            bufferSizeForTarget[i] = cacheSizePerLayer * layerNum;
+            auto const peerAttentionLayerNum = targetInfo.getPeerPPDomainLayerNum(ppDomainIdx);
+            for (size_t cpDomainIdx = 0; cpDomainIdx < cPDomainSize; cpDomainIdx++)
+            {
+                auto const idx = cpDomainIdx * pPDomainSize + ppDomainIdx;
+                // Note: contextCP is always 1. So, cpDomainSize == genCPSize and cpDomainIdx == genCPRank.
+                auto const peerBlockNum
+                    = getBlockNumAccountingForCP(cpDomainIdx, cPDomainSize, blockNum, /*strict=*/false);
+                bufferSizeForTarget[idx] = blockSizePerLayer * peerAttentionLayerNum * peerBlockNum;
+            }
         }
         return bufferSizeForTarget;
     };
     auto bufferEleSizes = getBufferSizeForTarget();
+    auto cacheBufferId = mCacheTransBufferManager->assignBufferIndexForSend();
     auto result = mCacheTransBufferManager->getOrAllocateSendBuffers(
-        cacheBufferId, static_cast<int>(pPDomainSize), bufferEleSizes, bufferManager);
+        cacheBufferId, static_cast<int>(pPDomainSize * cPDomainSize), bufferEleSizes, bufferManager);
     auto& outputSplitCaches = std::get<0>(result);
     auto& bufferCoverTargetNum = std::get<1>(result);
     auto& onlyUseDynamicBuffer = std::get<2>(result);
     auto* agentConnnecion = dynamic_cast<executor::kv_cache::AgentConnection const*>(connections[0]);
     if (agentConnnecion != nullptr)
     {
-        TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == pPDomainSize, "Agent need all buffer pre-allocated");
+        TLLM_CHECK_WITH_INFO(
+            bufferCoverTargetNum == pPDomainSize * cPDomainSize, "Agent need all buffer pre-allocated");
         TLLM_CHECK(onlyUseDynamicBuffer == false);
     }
-    // diff end
-
-    // The size of outputSplitCaches should be equal to pPDomainSize
 
+    // The size of outputSplitCaches should be equal to pPDomainSize * cPDomainSize.
     SizeType32 window = mCacheManager->getBlockManager().getPoolWindowSize(0);
     std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> inputKvCacheBlocksPerWindow;
     inputKvCacheBlocksPerWindow.emplace(window, inputKvCacheBlocks);
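
For reference, the send buffers built above are laid out CP-major with PP as the inner axis: entry `idx = cpDomainIdx * pPDomainSize + ppDomainIdx` holds the cache destined for that (generation CP rank, PP domain) pair, sized by the peer's layer count and CP-adjusted block count. A small sketch of the indexing assumed here, illustrative only:

#include <cstddef>

// Flat index into the send buffers: CP-major, PP-minor.
// e.g. pPDomainSize = 2, cPDomainSize = 3 orders the buffers as
//   0:(cp0,pp0) 1:(cp0,pp1) 2:(cp1,pp0) 3:(cp1,pp1) 4:(cp2,pp0) 5:(cp2,pp1)
inline std::size_t flatSendBufferIndex(std::size_t cpDomainIdx, std::size_t ppDomainIdx, std::size_t pPDomainSize)
{
    return cpDomainIdx * pPDomainSize + ppDomainIdx;
}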
@@ -203,7 +232,7 @@
 
         TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
         auto startTime = std::chrono::steady_clock::now();
-        auto cacheIdx = processIdx % pPDomainSize;
+        auto cacheIdx = processIdx % (pPDomainSize * cPDomainSize);
         if (cacheIdx < bufferCoverTargetNum)
         {
             size_t size = outputSplitCaches.at(cacheIdx)->getSizeInBytes();
@@ -259,7 +288,8 @@ void MLACacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& session,
     else
     {
         // concurrency num
-        auto concurrencyNum = std::min(std::max(static_cast<size_t>(1), bufferCoverTargetNum), pPDomainSize);
+        auto concurrencyNum
+            = std::min(std::max(static_cast<size_t>(1), bufferCoverTargetNum), pPDomainSize * cPDomainSize);
 
         auto remainSendNum = connections.size();
 
@@ -307,9 +337,7 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& session,
     auto& bufferManager = session.getBufferManager();
     auto arrivalTime = llmRequest.getPerfMetrics().timingMetrics.arrivalTime;
     bool recordDelay = arrivalTime != std::chrono::steady_clock::time_point();
-    // diff start
     auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
-    // diff end
     auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest);
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
     std::vector<runtime::ITensor::SharedPtr> outputBuffers;
@@ -364,23 +392,24 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& session,
         cacheBufferId = mCacheTransBufferManager->assignBufferIndexForRecv();
     }
 
-    auto cacheBlockSize = outputBuffers.at(0)->getSize();
-
     auto targetNum = pickUpConnections.size();
-    auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
-    auto ppRank = selfIdx
-        / (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
-    auto selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
-    TLLM_CHECK_WITH_INFO(selfAttentionLayerNum != 0, "selfAttentionLayerNum should not be 0");
 
     auto getBufferSizeForTarget = [&]()
     {
+        auto const targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
+        auto const cacheBlockSize = outputBuffers.at(0)->getSize();
+        auto const ppRank = selfIdx
+            / (selfConfig.getParallelConfig().mTensorParallelism
+                * selfConfig.getParallelConfig().mContextParallelism);
+        auto const selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
+        TLLM_CHECK_WITH_INFO(selfAttentionLayerNum != 0, "selfAttentionLayerNum should not be 0");
         std::vector<size_t> bufferEleSizes(targetNum, 0);
-        auto cacheSizePerLayer = cacheBlockSize * blockNum / selfAttentionLayerNum;
+        auto const cacheSizePerLayer = cacheBlockSize * blockNum / selfAttentionLayerNum;
         for (size_t i = 0; i < targetNum; i++)
         {
-            auto layerNum = targetInfo.getPeerPPDomainLayerNum(static_cast<SizeType32>(pickUpConnections[i]));
-            bufferEleSizes[i] = cacheSizePerLayer * layerNum;
+            auto const peerAttentionLayerNum
+                = targetInfo.getPeerPPDomainLayerNum(static_cast<SizeType32>(pickUpConnections[i]));
+            bufferEleSizes[i] = cacheSizePerLayer * peerAttentionLayerNum;
         }
         return bufferEleSizes;
     };
@@ -506,7 +535,7 @@ void MLACacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& session,
         outputCachesPerWindow.emplace(window, outputBuffers);
         NVTX3_SCOPED_RANGE(formatInputConcatenate);
 
-        // recvSplitCaches size == ppdomainsize
+        // recvSplitCaches size == ppdomainsize * cpdomainsize.
         executor::kv_cache::concatKvCacheV2Dispatch(
             recvSplitCaches, outputCachesPerWindow, destConfig, selfConfig, selfIdx, bufferManager);
     }
@@ -581,14 +610,6 @@
         TLLM_LOG_WARNING("MLACacheFormatter::inquireSupport: TP size must be divisible by DP size");
         return false;
     }
-    if (selfConfig.getParallelConfig().mContextParallelism != 1
-        || destConfig.getParallelConfig().mContextParallelism != 1)
-    {
-        TLLM_LOG_WARNING(
-            "MLACacheFormatter::inquireSupport: context parallelism is not currently supported (selfCP=%d, destCP=%d).",
-            selfConfig.getParallelConfig().mContextParallelism, destConfig.getParallelConfig().mContextParallelism);
-        return false;
-    }
     if (destConfig.getParallelConfig().mEnableAttentionDP
         && (destConfig.getParallelConfig().mTensorParallelism % destConfig.getParallelConfig().mDPsize != 0))
     {

cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h

Lines changed: 18 additions & 0 deletions
@@ -22,6 +22,24 @@
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
+/**
+ * @brief Calculate the number of blocks allocated to a specific Context Parallelism (CP) rank.
+ *
+ * This function determines how many blocks should be allocated to a given CP rank when
+ * distributing a total number of blocks across multiple CP ranks. It supports two distribution
+ * modes: strict and non-strict.
+ *
+ * @param cpRank The rank (index) of the current CP process. Must be in range [0, cpSize).
+ * @param cpSize The total number of CP ranks/processes in the parallel group.
+ * @param numTotalBlocks The total number of blocks to be distributed across all CP ranks.
+ * @param strict Flag controlling the distribution strategy:
+ *               - true: Use strict round-robin distribution with exact allocation
+ *               - false: Use ceiling division which may over-allocate
+ *
+ * @return The number of blocks allocated to the specified CP rank.
+ */
+int getBlockNumAccountingForCP(int cpRank, int cpSize, int numTotalBlocks, bool strict);
+
 // Simple cache block copy. Because it does not involve data splitting or merging, it performs best when the
 // parallel topology is completely identical, making it the preferred method.
 class MLACacheFormatter final : public BaseCacheFormatter
