
Commit 4a05fb8

Merge branch 'main' into model_loader
2 parents: cf13d0f + fadce99

75 files changed (+2321, -414 lines)


cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -558,6 +558,7 @@ if(ENABLE_UCX)
     find_package(ucxx REQUIRED PATHS ${CMAKE_BINARY_DIR}/ucxx/build
                  NO_DEFAULT_PATH)
   endif()
+  find_package(NIXL)
 endif()
 if(ENABLE_UCX)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_UCX=1")

cpp/cmake/modules/FindNIXL.cmake

Lines changed: 11 additions & 1 deletion
@@ -23,6 +23,16 @@ endif()

 find_package(ucx REQUIRED)

+# Set default NIXL_ROOT if not provided
+if(NOT NIXL_ROOT)
+  set(NIXL_ROOT
+      "/opt/nvidia/nvda_nixl"
+      CACHE PATH "NIXL installation directory" FORCE)
+  message(STATUS "NIXL_ROOT not set, using default: ${NIXL_ROOT}")
+else()
+  message(STATUS "Using provided NIXL_ROOT: ${NIXL_ROOT}")
+endif()
+
 find_path(NIXL_INCLUDE_DIR nixl.h HINTS ${NIXL_ROOT}/include)

 if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")

@@ -69,5 +79,5 @@ else()
   message(STATUS "NIXL_LIBRARY: ${NIXL_LIBRARY}")
   message(STATUS "NIXL_BUILD_LIBRARY: ${NIXL_BUILD_LIBRARY}")
   message(STATUS "SERDES_LIBRARY: ${SERDES_LIBRARY}")
-  message(FATAL_ERROR "NIXL not found after installation attempt.")
+  unset(NIXL_ROOT CACHE)
 endif()
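
With this change NIXL discovery degrades gracefully: when NIXL_ROOT is not provided the module falls back to the default install prefix /opt/nvidia/nvda_nixl, and a failed lookup now clears the cached NIXL_ROOT instead of aborting the configure step with FATAL_ERROR, so configurations without a NIXL installation can proceed. A custom install location can still be supplied at configure time, for example with cmake -DNIXL_ROOT=/path/to/nixl (illustrative invocation; the project's own build scripts may pass this flag differently).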

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 35 additions & 23 deletions
@@ -85,6 +85,40 @@ using tensorrt_llm::batch_manager::CacheTransceiverFactory;
 namespace tensorrt_llm::batch_manager
 {

+std::map<SizeType32, SizeType32> TrtGptModelInflightBatching::calculateCacheSizePerTokenForDisagg(
+    ModelConfig const& modelConfig, WorldConfig const& worldConfig,
+    std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor)
+{
+    // These are the number of attention layers on this PP rank.
+    auto const numLocalAttnLayers
+        = modelConfig.getNbAttentionLayers(worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
+    // These are the number of attention layers on all previous PP ranks.
+    auto const numLowerRankAttnLayers = modelConfig.countLowerRankLayers(ModelConfig::LayerType::kATTENTION,
+        worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank());
+    // Use global ranks of attention layers to look up from maxAttentionWindowVec.
+    auto const startAttnLayerId = numLowerRankAttnLayers;
+    auto const endAttnLayerId = numLowerRankAttnLayers + numLocalAttnLayers;
+    auto const numNonUniqueWindowSizes = static_cast<SizeType32>(maxAttentionWindowVec.size());
+    std::map<SizeType32, std::vector<SizeType32>> uniqueWindowSizeToLayers;
+    for (SizeType32 layerIdx = startAttnLayerId; layerIdx < endAttnLayerId; layerIdx++)
+    {
+        // maxAttentionWindowVec may or may not be stretched to the length of numLayers yet.
+        // If not stretched yet, we cycle through the window sizes.
+        auto const windowSize = maxAttentionWindowVec.at(layerIdx % numNonUniqueWindowSizes);
+        uniqueWindowSizeToLayers[windowSize].push_back(layerIdx);
+    }
+    std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow;
+    for (auto const& [windowSize, globalLayerIds] : uniqueWindowSizeToLayers)
+    {
+        auto const cacheSizePerToken = BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize(
+            modelConfig, globalLayerIds, isCrossAttention, kvFactor);
+        auto const cacheSizeBytesPerToken = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
+        cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
+    }
+
+    return cacheSizeBytesPerTokenPerWindow;
+}
+
 bool TrtGptModelInflightBatching::executorConfigIsValid(
     ModelConfig const& modelConfig, executor::ExecutorConfig const& executorConfig)
 {

@@ -266,32 +300,10 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
     }
     if (mModelConfig.isTransformerBased() && modelConfig.isKVCacheEnabled())
     {
-
-        auto calculateCacheSizePerToken
-            = [](ModelConfig const& modelConfig, WorldConfig const& worldConfig,
-                  std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor)
-        {
-            auto [numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd] = modelConfig.getNumKvHeadsPerLayerLocalRange(
-                worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention);
-            auto numKvHeadsPerLayer = std::vector<SizeType32>(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd);
-            auto windowSizeLayers
-                = BaseKVCacheManager::groupLayersByWindowSize(maxAttentionWindowVec, modelConfig.getNbLayers());
-            std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow;
-            for (auto const& [windowSize, managedLayers] : windowSizeLayers)
-            {
-                auto const cacheSizePerToken = BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize(
-                    modelConfig, managedLayers, isCrossAttention, kvFactor);
-                auto const cacheSizeBytesPerToken
-                    = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
-                cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
-            }
-
-            return cacheSizeBytesPerTokenPerWindow;
-        };
         auto cacheTransceiverConfig
             = executorConfig.getCacheTransceiverConfig().value_or(executor::CacheTransceiverConfig());

-        auto const cacheSizeBytesPerTokenPerWindow = calculateCacheSizePerToken(
+        auto const cacheSizeBytesPerTokenPerWindow = calculateCacheSizePerTokenForDisagg(
            mModelConfig, mWorldConfig, getMaxAttentionWindowVec(), mModelConfig.useCrossAttention(), 2);
         auto cacheTransPreAllocaSize = kv_cache_manager::CacheTransBufferManager::preAllocBufferSize(
             cacheSizeBytesPerTokenPerWindow, cacheTransceiverConfig);
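
For intuition, the following is a minimal, self-contained sketch of the grouping and sizing logic the new function performs; it is not TensorRT-LLM code, and the names and toy configuration (5 attention layers, 8 KV heads of size 64, fp32 KV cache, kvFactor = 2) are made up to mirror the unit test added later in this commit. When the window vector is shorter than the number of attention layers, layers cycle through the window sizes, and each window's entry is the per-token KV-cache byte count of the layers assigned to it.

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main()
{
    // Assumed toy configuration: 5 attention layers, 8 KV heads of size 64
    // (hidden size 512), fp32 KV entries (4 bytes), kvFactor = 2 (keys and values).
    int const numLayers = 5;
    int const numKvHeads = 8;
    int const headSize = 64;
    int const bytesPerElement = 4;
    int const kvFactor = 2;
    std::vector<int> const windowSizes = {128, 256};

    // Group layers by window size, cycling when windowSizes is shorter than
    // numLayers: layers 0, 2, 4 -> 128; layers 1, 3 -> 256.
    std::map<int, std::vector<int>> layersPerWindow;
    for (int layer = 0; layer < numLayers; ++layer)
    {
        layersPerWindow[windowSizes[layer % windowSizes.size()]].push_back(layer);
    }

    // Bytes per token for each window
    //   = (#layers in window) * kvFactor * numKvHeads * headSize * bytesPerElement.
    for (auto const& [window, layers] : layersPerWindow)
    {
        std::int64_t const bytesPerToken
            = static_cast<std::int64_t>(layers.size()) * kvFactor * numKvHeads * headSize * bytesPerElement;
        std::cout << "window " << window << ": " << bytesPerToken << " bytes per token\n";
    }
    return 0;
}

Under these assumptions the sketch prints 12288 bytes per token for the 128 window (3 layers) and 8192 bytes for the 256 window (2 layers), which is the same arithmetic the new unit tests check.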

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h

Lines changed: 13 additions & 0 deletions
@@ -152,6 +152,19 @@ class TrtGptModelInflightBatching : public TrtGptModel

     ~TrtGptModelInflightBatching() override;

+    /// @brief Calculate the cache size per token for disaggregated serving.
+    /// @param modelConfig Model configuration.
+    /// @param worldConfig World configuration.
+    /// @param maxAttentionWindowVec Maximum attention window vector (it may have fewer elements than numLayers, in
+    /// which case the window sizes cycle across layers).
+    /// @param isCrossAttention Whether the attention is cross attention.
+    /// @param kvFactor KV factor.
+    /// @return Map from window size to cache size in bytes per token. Note that the window size itself is not
+    /// multiplied into the result.
+    [[nodiscard]] static std::map<SizeType32, SizeType32> calculateCacheSizePerTokenForDisagg(
+        runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
+        std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor);
+
     void terminateRequest(LlmRequestPtr const& llmRequest, bool pause = false) override;

     /// @brief Terminate request in the next forwardSync call that includes the request.

cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
@@ -9,13 +9,6 @@
 # license agreement from NVIDIA CORPORATION or its affiliates is strictly
 # prohibited.

-if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
-  message(
-    STATUS
-      "The NIXL backend is temporarily unavailable on the aarch64 platform.")
-  unset(NIXL_ROOT)
-endif()
-
 if(NIXL_ROOT)
   find_package(NIXL REQUIRED)
   # Check if all required packages were found

cpp/tests/unit_tests/executor/executorTestSmall.cpp

Lines changed: 85 additions & 0 deletions
@@ -11,6 +11,7 @@

 #include <random>
 #include <tuple>
+#include <unordered_map>

 namespace tensorrt_llm::testing
 {

@@ -201,4 +202,88 @@ INSTANTIATE_TEST_SUITE_P(Float, DecoderFloatTest, paramGenerator,
         return nameStringStream.str();
     });

+// Helper function to test calculateCacheSizePerTokenForDisagg with given parameters.
+std::map<runtime::SizeType32, runtime::SizeType32> calculateCacheSizePerTokenHelper(
+    std::vector<runtime::SizeType32> const& maxAttentionWindowVec, runtime::SizeType32 kvFactor = 2,
+    runtime::SizeType32 vocabSize = 32, runtime::SizeType32 nbLayers = 4, runtime::SizeType32 nbAttentionLayers = 4,
+    runtime::SizeType32 nbRnnLayers = 0, runtime::SizeType32 nbHeads = 8, runtime::SizeType32 hiddenSize = 512,
+    bool isCrossAttention = false)
+{
+    // Create minimal ModelConfig for testing.
+    auto modelConfig = runtime::ModelConfig(
+        vocabSize, nbLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, nvinfer1::DataType::kFLOAT);
+    modelConfig.useGptAttentionPlugin(true);
+    modelConfig.setModelVariant(runtime::ModelConfig::ModelVariant::kGpt);
+    modelConfig.setKVCacheType(runtime::ModelConfig::KVCacheType::kPAGED);
+
+    auto const worldConfig = runtime::WorldConfig();
+
+    return batch_manager::TrtGptModelInflightBatching::calculateCacheSizePerTokenForDisagg(
+        modelConfig, worldConfig, maxAttentionWindowVec, isCrossAttention, kvFactor);
+}
+
+// Test for TrtGptModelInflightBatching::calculateCacheSizePerTokenForDisagg with different layer types.
+TEST(TrtInflightBatchingTest, CalculateCacheSizePerTokenForDisagg)
+{
+    // Common parameters.
+    constexpr runtime::SizeType32 nbLayers = 5;
+    constexpr runtime::SizeType32 hiddenSize = 512;
+    constexpr runtime::SizeType32 kvFactor = 2;
+    constexpr runtime::SizeType32 vocabSize = 32;
+    constexpr runtime::SizeType32 nbHeads = 8;
+    // Test case 1: Single attention window size - attention layers only.
+    {
+        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128};
+        constexpr runtime::SizeType32 nbAttentionLayers = 5;
+        constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
+        constexpr runtime::SizeType32 nbRnnLayers = 0;
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        EXPECT_EQ(result.size(), 1);
+        EXPECT_EQ(result.at(128), nbAttentionLayers * kvFactor * hiddenSize * numBytesPerFloatElement);
+    }
+
+    // Test case 2: Multiple attention window sizes - attention layers only.
+    {
+        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128, 256};
+        constexpr runtime::SizeType32 nbAttentionLayers = 5;
+        constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
+        constexpr runtime::SizeType32 nbRnnLayers = 0;
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        EXPECT_EQ(result.size(), 2);
+        auto const nbAttentionLayersIn128Window = 3;
+        auto const nbAttentionLayersIn256Window = 2;
+        EXPECT_EQ(result.at(128), nbAttentionLayersIn128Window * kvFactor * hiddenSize * numBytesPerFloatElement);
+        EXPECT_EQ(result.at(256), nbAttentionLayersIn256Window * kvFactor * hiddenSize * numBytesPerFloatElement);
+    }
+
+    // Test case 3: Single attention window size - attention and rnn layers.
+    {
+        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128};
+        constexpr runtime::SizeType32 nbAttentionLayers = 3;
+        constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
+        constexpr runtime::SizeType32 nbRnnLayers = 2;
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        EXPECT_EQ(result.size(), 1);
+        EXPECT_EQ(result.at(128), nbAttentionLayers * kvFactor * hiddenSize * numBytesPerFloatElement);
+    }
+
+    // Test case 4: Multiple attention window sizes - attention and rnn layers.
+    {
+        std::vector<runtime::SizeType32> maxAttentionWindowVec = {128, 256};
+        constexpr runtime::SizeType32 nbAttentionLayers = 3;
+        constexpr runtime::SizeType32 numBytesPerFloatElement = 4;
+        constexpr runtime::SizeType32 nbRnnLayers = 2;
+        auto result = calculateCacheSizePerTokenHelper(maxAttentionWindowVec, kvFactor, vocabSize, nbLayers,
+            nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, false);
+        EXPECT_EQ(result.size(), 2);
+        auto const nbAttentionLayersIn128Window = 2;
+        auto const nbAttentionLayersIn256Window = 1;
+        EXPECT_EQ(result.at(128), nbAttentionLayersIn128Window * kvFactor * hiddenSize * numBytesPerFloatElement);
+        EXPECT_EQ(result.at(256), nbAttentionLayersIn256Window * kvFactor * hiddenSize * numBytesPerFloatElement);
+    }
+}
+
 } // namespace tensorrt_llm::testing
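
As a sanity check on the expected values: with hiddenSize = 512, kvFactor = 2, and fp32 KV entries (4 bytes), each attention layer contributes 2 * 512 * 4 = 4096 bytes per token. In test case 2 the five layers cycle over {128, 256} as 128, 256, 128, 256, 128, so the 128 window gets 3 layers (3 * 4096 = 12288 bytes per token) and the 256 window gets 2 layers (2 * 4096 = 8192 bytes per token), matching the EXPECT_EQ expressions above.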

docs/source/_static/custom.css

Lines changed: 9 additions & 3 deletions
@@ -1,6 +1,6 @@
 .tag {
-  padding: 2px 5px;
-  border-radius: 4px;
+  padding: 2px 3px;
+  border-radius: 3px;
   font-size: 0.8em;
   margin-right: 5px;
   color: #000;

@@ -9,7 +9,7 @@
 code.beta {
   display: inline-block;
   background-color: #6c757d;
-  color: #999;
+  color: #fff;
 }

 code.prototype {

@@ -23,3 +23,9 @@ code.deprecated {
   background-color: red;
   color: #fff;
 }
+
+code.stable {
+  display: inline-block;
+  background-color: #28a745;
+  color: #fff;
+}
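
The new code.stable rule follows the existing beta/prototype/deprecated badge styles and renders white text on a green (#28a745) background; presumably it targets docs markup along the lines of <code class="stable">stable</code> (illustrative; the exact way the docs emit this class may differ).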

docs/source/conf.py

Lines changed: 2 additions & 3 deletions
@@ -16,10 +16,9 @@

 sys.path.insert(0, os.path.abspath('.'))

-project = 'TensorRT-LLM'
+project = 'TensorRT LLM'
 copyright = '2025, NVidia'
 author = 'NVidia'
-branch_name = pygit2.Repository('.').head.shorthand
 html_show_sphinx = False

 # Get the git commit hash

@@ -79,7 +78,7 @@
     "https":
         None,
     "source":
-        "https://github.com/NVIDIA/TensorRT-LLM/tree/" + branch_name + "/{{path}}",
+        "https://github.com/NVIDIA/TensorRT-LLM/tree/" + commit_hash + "/{{path}}",
 }

 myst_heading_anchors = 4
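
Dropping branch_name in favor of commit_hash pins the generated "source" links to the exact commit being documented, for example https://github.com/NVIDIA/TensorRT-LLM/tree/<commit_hash>/docs/source/conf.py (illustrative expansion of the {{path}} template), so links stay valid after the branch advances; commit_hash is presumably the value computed under the "Get the git commit hash" comment visible in the context above.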

docs/source/developer-guide/perf-benchmarking.md

Lines changed: 1 addition & 1 deletion
@@ -423,7 +423,7 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp
 - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
 - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)

-To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/3_quantization.html).
+To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/deployment/1_tensorrt_llm.html).

 `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
 file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
