Skip to content

Commit 814c810

Browse files
committed
commit 1: save the state of the repository
Signed-off-by: raayandhar <[email protected]>
1 parent f156221 commit 814c810

File tree

12 files changed

+2411
-36
lines changed

12 files changed

+2411
-36
lines changed

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,13 +1454,15 @@ class CacheTransceiverConfig
14541454
UCX = 2,
14551455
NIXL = 3
14561456
};
1457-
explicit CacheTransceiverConfig(
1458-
std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
1457+
explicit CacheTransceiverConfig(std::optional<BackendType> backendType = std::nullopt,
1458+
std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt);
14591459

14601460
bool operator==(CacheTransceiverConfig const& other) const;
14611461
void setBackendType(std::optional<BackendType> backendType);
14621462
void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
1463+
void setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs);
14631464

1465+
[[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
14641466
[[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
14651467
[[nodiscard]] std::optional<BackendType> getBackendType() const;
14661468

@@ -1470,6 +1472,7 @@ class CacheTransceiverConfig
14701472
/// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
14711473
/// transfer may be degraded.
14721474
std::optional<size_t> mMaxTokensInBuffer;
1475+
std::optional<int> mKvTransferTimeoutMs;
14731476
};
14741477

14751478
/// @brief Configuration class for the model executor

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,14 @@ void CacheTransceiver::setContextState(LlmRequest* llmRequest)
238238
void CacheTransceiver::respondAndSendAsync(LlmRequest* llmRequest)
239239
{
240240
TLLM_CHECK(llmRequest && llmRequest->isContextOnlyRequest());
241+
// TEST HOOK: Skip creating responder future for a specific request ID to validate HOL blocking theory
242+
if (llmRequest->mRequestId == 2049)
243+
{
244+
llmRequest->setState(LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS);
245+
TLLM_LOG_WARNING("TEST: Skipping responder future for context request 2049");
246+
setContextState(llmRequest);
247+
return;
248+
}
241249
llmRequest->setState(LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS);
242250
// If context phase params is already set, it means that the KV cache
243251
// transfer is already in progress.
@@ -252,6 +260,8 @@ void CacheTransceiver::respondAndSendAsync(LlmRequest* llmRequest)
252260
setContextState(llmRequest);
253261
auto future = mDataResponder->respondAndSendAsync(*llmRequest);
254262
mResponderFutures.emplace_back(llmRequest, std::move(future));
263+
TLLM_LOG_DEBUG("respondAndSendAsync: enqueued context request %ld, mResponderFutures.size()=%zu",
264+
llmRequest->mRequestId, mResponderFutures.size());
255265
}
256266

257267
void CacheTransceiver::respondAndSendLayerWise(
@@ -301,20 +311,34 @@ void CacheTransceiver::requestAndReceiveAsync(LlmRequest* llmRequest)
301311
std::vector<LlmRequest::RequestIdType> gatherRequestIds(
302312
mpi::MpiComm const& mpiComm, std::vector<LlmRequest::RequestIdType> const& requestIds)
303313
{
314+
TLLM_LOG_DEBUG("gatherRequestIds: Entry - rank %d, localSize=%zu, worldSize=%d", mpiComm.getRank(),
315+
requestIds.size(), mpiComm.getSize());
316+
304317
int localSize = static_cast<int>(requestIds.size());
305318
std::vector<int> sizes(mpiComm.getSize());
319+
320+
TLLM_LOG_DEBUG("gatherRequestIds: Starting allgather for sizes");
306321
mpiComm.allgather(&localSize, sizes.data(), 1, mpi::MpiType::kINT32);
322+
TLLM_LOG_DEBUG("gatherRequestIds: allgather for sizes completed");
323+
307324
// std::vector<LlmRequest::RequestIdType> all_data(total_size);
308325
std::vector<int> displs(mpiComm.getSize());
309326
int totalSize = 0;
310327
for (int i = 0; i < mpiComm.getSize(); i++)
311328
{
312329
displs[i] = totalSize;
313330
totalSize += sizes[i];
331+
TLLM_LOG_DEBUG("gatherRequestIds: Rank %d has %d request IDs", i, sizes[i]);
314332
}
333+
334+
TLLM_LOG_DEBUG("gatherRequestIds: Total size across all ranks: %d", totalSize);
315335
std::vector<LlmRequest::RequestIdType> retData(totalSize);
336+
337+
TLLM_LOG_DEBUG("gatherRequestIds: Starting allgatherv for request IDs");
316338
mpiComm.allgatherv(requestIds.data(), static_cast<int>(requestIds.size()), mpi::MpiType::kUINT64, retData.data(),
317339
sizes, displs, mpi::MpiType::kUINT64);
340+
TLLM_LOG_DEBUG("gatherRequestIds: allgatherv for request IDs completed, returning %zu total IDs", retData.size());
341+
318342
return retData;
319343
}
320344

@@ -370,72 +394,153 @@ void updateKVCacheTransferBW(mpi::MpiComm const& mpiComm, LlmRequest* request)
370394

371395
void CacheTransceiver::checkContextTransferStatus(std::optional<int> const& atLeastRequestNum)
372396
{
397+
TLLM_LOG_DEBUG("checkContextTransferStatus: Entry with atLeastRequestNum=%s, mResponderFutures.size()=%zu",
398+
atLeastRequestNum.has_value() ? std::to_string(atLeastRequestNum.value()).c_str() : "nullopt",
399+
mResponderFutures.size());
400+
401+
// Dump current responder future queue order for diagnostics
402+
{
403+
std::ostringstream oss;
404+
oss << "[";
405+
bool first = true;
406+
for (auto const& pair : mResponderFutures)
407+
{
408+
if (!first)
409+
{
410+
oss << ", ";
411+
}
412+
first = false;
413+
oss << pair.first->mRequestId;
414+
}
415+
oss << "]";
416+
TLLM_LOG_DEBUG("checkContextTransferStatus: mResponderFutures order: %s", oss.str().c_str());
417+
}
418+
373419
bool blockAll = !atLeastRequestNum.has_value();
374420
auto syncComm = mCacheState->getParallelConfig().mEnableAttentionDP ? mMpiGroupTPInDPComm : mMpiGroupTensorParaComm;
421+
422+
TLLM_LOG_DEBUG("checkContextTransferStatus: blockAll=%s, syncComm=%s, syncComm.size=%d",
423+
blockAll ? "true" : "false", syncComm ? "valid" : "null", syncComm ? syncComm->getSize() : 0);
424+
375425
std::vector<LlmRequest::RequestIdType> contextCompleteRequestIds;
426+
TLLM_LOG_DEBUG(
427+
"checkContextTransferStatus: Checking %zu responder futures for completion", mResponderFutures.size());
428+
376429
for (auto&& [request, future] : mResponderFutures)
377430
{
431+
TLLM_LOG_DEBUG("checkContextTransferStatus: Checking request %ld future status", request->mRequestId);
378432
if (future.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready)
379433
{
434+
TLLM_LOG_DEBUG("checkContextTransferStatus: Request %ld is ready", request->mRequestId);
380435
contextCompleteRequestIds.push_back(request->mRequestId);
381436
}
437+
else
438+
{
439+
TLLM_LOG_DEBUG("checkContextTransferStatus: Request %ld is not ready", request->mRequestId);
440+
}
382441
}
383442

443+
TLLM_LOG_DEBUG("checkContextTransferStatus: Found %zu ready requests", contextCompleteRequestIds.size());
444+
384445
std::unordered_map<LlmRequest::RequestIdType, int> frequencyMap;
385446
if ((syncComm) && syncComm->getSize() > 1)
386447
{
448+
TLLM_LOG_DEBUG(
449+
"checkContextTransferStatus: Gathering request IDs across %d ranks via MPI", syncComm->getSize());
387450
auto gatherRequestIdVec = gatherRequestIds(*syncComm, contextCompleteRequestIds);
451+
TLLM_LOG_DEBUG("checkContextTransferStatus: MPI gather completed, received %zu total request IDs",
452+
gatherRequestIdVec.size());
453+
388454
for (auto&& requestId : gatherRequestIdVec)
389455
{
390456
frequencyMap[requestId]++;
391457
}
458+
TLLM_LOG_DEBUG(
459+
"checkContextTransferStatus: Built frequency map with %zu unique request IDs", frequencyMap.size());
392460
}
393461
else
394462
{
463+
TLLM_LOG_DEBUG("checkContextTransferStatus: Single rank mode, building frequency map locally");
395464
for (auto&& requestId : contextCompleteRequestIds)
396465
{
397466
frequencyMap[requestId]++;
398467
}
468+
TLLM_LOG_DEBUG(
469+
"checkContextTransferStatus: Local frequency map built with %zu unique request IDs", frequencyMap.size());
399470
}
400471
std::vector<std::pair<LlmRequest::RequestIdType, int>> freqVec(frequencyMap.begin(), frequencyMap.end());
401472

402473
std::sort(freqVec.begin(), freqVec.end(),
403474
[](std::pair<LlmRequest::RequestIdType, int> const& left,
404475
std::pair<LlmRequest::RequestIdType, int> const& right) { return left.second > right.second; });
476+
477+
TLLM_LOG_DEBUG("checkContextTransferStatus: Sorted frequency vector, processing %zu entries", freqVec.size());
478+
405479
std::unordered_set<LlmRequest::RequestIdType> toCompleteIdSet;
480+
int expectedFreq = (syncComm) ? syncComm->getSize() : 1;
481+
TLLM_LOG_DEBUG("checkContextTransferStatus: Expected frequency for completion: %d", expectedFreq);
482+
406483
for (auto&& [requestId, freq] : freqVec)
407484
{
408-
if (freq == ((syncComm) ? syncComm->getSize() : 1))
485+
TLLM_LOG_DEBUG(
486+
"checkContextTransferStatus: Request %ld has frequency %d (expected %d)", requestId, freq, expectedFreq);
487+
if (freq == expectedFreq)
409488
{
410489
toCompleteIdSet.insert(requestId);
490+
TLLM_LOG_DEBUG("checkContextTransferStatus: Request %ld added to completion set (freq match)", requestId);
411491
}
412492
}
413493

494+
TLLM_LOG_DEBUG("checkContextTransferStatus: toCompleteIdSet.size()=%zu, atLeastRequestNum=%d",
495+
toCompleteIdSet.size(), atLeastRequestNum.value_or(0));
496+
414497
// Make sure there are at least atLeastRequestNum requests in toCompleteIdSet.
415498
// This will preserve the order of insertion for KVCache transfer requests.
416499
for (auto it = mResponderFutures.begin();
417500
atLeastRequestNum.value_or(0) > static_cast<int>(toCompleteIdSet.size()) && it != mResponderFutures.end();
418501
++it)
419502
{
420503
auto& [request, future] = *it;
504+
TLLM_LOG_DEBUG(
505+
"checkContextTransferStatus: Adding request %ld to completion set (min requirement)", request->mRequestId);
421506
toCompleteIdSet.insert(request->mRequestId);
422507
}
423508

509+
TLLM_LOG_DEBUG("checkContextTransferStatus: Final toCompleteIdSet.size()=%zu", toCompleteIdSet.size());
510+
424511
// Complete all the requests in toCompleteIdSet
512+
TLLM_LOG_DEBUG("checkContextTransferStatus: Starting completion phase, blockAll=%s", blockAll ? "true" : "false");
513+
514+
size_t completedCount = 0;
425515
for (auto it = mResponderFutures.begin(); it != mResponderFutures.end();)
426516
{
427517
auto& [request, future] = *it;
428-
if (blockAll || (toCompleteIdSet.find(request->mRequestId) != toCompleteIdSet.end()))
518+
bool shouldComplete = blockAll || (toCompleteIdSet.find(request->mRequestId) != toCompleteIdSet.end());
519+
520+
TLLM_LOG_DEBUG("checkContextTransferStatus: Request %ld shouldComplete=%s", request->mRequestId,
521+
shouldComplete ? "true" : "false");
522+
523+
if (shouldComplete)
429524
{
525+
TLLM_LOG_DEBUG("checkContextTransferStatus: Blocking on future.get() for request %ld", request->mRequestId);
430526
future.get();
527+
TLLM_LOG_DEBUG("checkContextTransferStatus: future.get() completed for request %ld", request->mRequestId);
528+
431529
request->setState(LlmRequestState::kDISAGG_CONTEXT_COMPLETE);
432530
it = mResponderFutures.erase(it);
531+
completedCount++;
532+
533+
TLLM_LOG_DEBUG(
534+
"checkContextTransferStatus: Request %ld completed and removed from futures", request->mRequestId);
433535
}
434536
else
435537
{
436538
++it;
437539
}
438540
}
541+
542+
TLLM_LOG_DEBUG("checkContextTransferStatus: Exit - completed %zu requests, remaining futures: %zu", completedCount,
543+
mResponderFutures.size());
439544
}
440545

441546
void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastRequestNum)

cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,17 @@ namespace tensorrt_llm::executor
2222
{
2323

2424
CacheTransceiverConfig::CacheTransceiverConfig(
25-
std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)
25+
std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)
2626
: mBackendType(backendType)
2727
, mMaxTokensInBuffer(maxNumTokens)
28+
, mKvTransferTimeoutMs(kvTransferTimeoutMs)
2829
{
2930
}
3031

3132
bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const
3233
{
33-
return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;
34+
return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType
35+
&& mKvTransferTimeoutMs == other.mKvTransferTimeoutMs;
3436
}
3537

3638
void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)
@@ -53,4 +55,14 @@ std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const
5355
return mMaxTokensInBuffer;
5456
}
5557

58+
void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)
59+
{
60+
mKvTransferTimeoutMs = kvTransferTimeoutMs;
61+
}
62+
63+
std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const
64+
{
65+
return mKvTransferTimeoutMs;
66+
}
67+
5668
} // namespace tensorrt_llm::executor

cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -433,15 +433,15 @@ void initConfigBindings(nb::module_& m)
433433
.def("__setstate__", guidedDecodingConfigSetstate);
434434

435435
auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
436-
{ return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
436+
{ return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
437437
auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
438438
{
439-
if (state.size() != 2)
439+
if (state.size() != 3)
440440
{
441441
throw std::runtime_error("Invalid CacheTransceiverConfig state!");
442442
}
443-
new (&self) tle::CacheTransceiverConfig(
444-
nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]), nb::cast<std::optional<size_t>>(state[1]));
443+
new (&self) tle::CacheTransceiverConfig(nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]),
444+
nb::cast<std::optional<size_t>>(state[1]), nb::cast<std::optional<int>>(state[2]));
445445
};
446446

447447
nb::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -464,12 +464,16 @@ void initConfigBindings(nb::module_& m)
464464
});
465465

466466
nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
467-
.def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
468-
nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt)
467+
.def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
468+
std::optional<int>>(),
469+
nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt,
470+
nb::arg("kv_transfer_timeout_ms") = std::nullopt)
469471
.def_prop_rw(
470472
"backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
471473
.def_prop_rw("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
472474
&tle::CacheTransceiverConfig::setMaxTokensInBuffer)
475+
.def_prop_rw("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
476+
&tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
473477
.def("__getstate__", cacheTransceiverConfigGetstate)
474478
.def("__setstate__", cacheTransceiverConfigSetstate);
475479

cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -415,15 +415,15 @@ void initConfigBindings(pybind11::module_& m)
415415
.def(py::pickle(guidedDecodingConfigGetstate, guidedDecodingConfigSetstate));
416416

417417
auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
418-
{ return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
418+
{ return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
419419
auto cacheTransceiverConfigSetstate = [](py::tuple const& state)
420420
{
421-
if (state.size() != 2)
421+
if (state.size() != 3)
422422
{
423423
throw std::runtime_error("Invalid CacheTransceiverConfig state!");
424424
}
425-
return tle::CacheTransceiverConfig(
426-
state[0].cast<tle::CacheTransceiverConfig::BackendType>(), state[1].cast<std::optional<size_t>>());
425+
return tle::CacheTransceiverConfig(state[0].cast<tle::CacheTransceiverConfig::BackendType>(),
426+
state[1].cast<std::optional<size_t>>(), state[2].cast<std::optional<int>>());
427427
};
428428

429429
py::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -446,12 +446,16 @@ void initConfigBindings(pybind11::module_& m)
446446
});
447447

448448
py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
449-
.def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
450-
py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt)
449+
.def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
450+
std::optional<int>>(),
451+
py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt,
452+
py::arg("kv_transfer_timeout_ms") = std::nullopt)
451453
.def_property(
452454
"backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
453455
.def_property("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
454456
&tle::CacheTransceiverConfig::setMaxTokensInBuffer)
457+
.def_property("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
458+
&tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
455459
.def(py::pickle(cacheTransceiverConfigGetstate, cacheTransceiverConfigSetstate));
456460

457461
auto executorConfigGetState = [](py::object const& self)

examples/disaggregated/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ cache_transceiver_config:
1616
backend: <str>
1717
# KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance.
1818
max_tokens_in_buffer: <int>
19+
# KV cache transfer timeout in milliseconds
20+
# If a request does not finish sending/receiving its KV cache within this timeout, it is cancelled and cleaned up.
21+
kv_transfer_timeout_ms: <int>
1922
```
2023
2124
The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below.

0 commit comments

Comments
 (0)