
Commit 0574624

update to ToM, clean up a bit, move to cancel_request
Signed-off-by: raayandhar <[email protected]>
1 parent 2b8722b commit 0574624

File tree: 9 files changed, +109 −18 lines changed

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 5 additions & 2 deletions
@@ -1456,13 +1456,15 @@ class CacheTransceiverConfig
         UCX = 2,
         NIXL = 3
     };
-    explicit CacheTransceiverConfig(
-        std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
+    explicit CacheTransceiverConfig(std::optional<BackendType> backendType = std::nullopt,
+        std::optional<size_t> maxNumTokens = std::nullopt, std::optional<int> kvTransferTimeoutMs = std::nullopt);
 
     bool operator==(CacheTransceiverConfig const& other) const;
     void setBackendType(std::optional<BackendType> backendType);
     void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
+    void setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs);
 
+    [[nodiscard]] std::optional<int> getKvTransferTimeoutMs() const;
     [[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
     [[nodiscard]] std::optional<BackendType> getBackendType() const;
 
@@ -1472,6 +1474,7 @@ class CacheTransceiverConfig
     /// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
     /// transfer may be degraded.
     std::optional<size_t> mMaxTokensInBuffer;
+    std::optional<int> mKvTransferTimeoutMs;
 };
 
 /// @brief Configuration class for the model executor

cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp

Lines changed: 15 additions & 3 deletions
@@ -22,15 +22,17 @@ namespace tensorrt_llm::executor
 {
 
 CacheTransceiverConfig::CacheTransceiverConfig(
-    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)
+    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens, std::optional<int> kvTransferTimeoutMs)
     : mBackendType(backendType)
-    , mMaxTokensInBuffer(maxNumTokens)
+    , mMaxTokensInBuffer(maxNumTokens)
+    , mKvTransferTimeoutMs(kvTransferTimeoutMs)
 {
 }
 
 bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const
 {
-    return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;
+    return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType
+        && mKvTransferTimeoutMs == other.mKvTransferTimeoutMs;
 }
 
 void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)
@@ -43,6 +45,11 @@ void CacheTransceiverConfig::setMaxTokensInBuffer(std::optional<size_t> maxToken
     mMaxTokensInBuffer = maxTokensInBuffer;
 }
 
+void CacheTransceiverConfig::setKvTransferTimeoutMs(std::optional<int> kvTransferTimeoutMs)
+{
+    mKvTransferTimeoutMs = kvTransferTimeoutMs;
+}
+
 std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const
 {
     return mBackendType;
@@ -53,4 +60,9 @@ std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const
     return mMaxTokensInBuffer;
 }
 
+std::optional<int> CacheTransceiverConfig::getKvTransferTimeoutMs() const
+{
+    return mKvTransferTimeoutMs;
+}
+
 } // namespace tensorrt_llm::executor

cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp

Lines changed: 9 additions & 6 deletions
@@ -433,15 +433,15 @@ void initConfigBindings(nb::module_& m)
         .def("__setstate__", guidedDecodingConfigSetstate);
 
     auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
-    { return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
+    { return nb::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
     auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
     {
-        if (state.size() != 2)
+        if (state.size() != 3)
         {
             throw std::runtime_error("Invalid CacheTransceiverConfig state!");
         }
-        new (&self) tle::CacheTransceiverConfig(
-            nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]), nb::cast<std::optional<size_t>>(state[1]));
+        new (&self) tle::CacheTransceiverConfig(nb::cast<tle::CacheTransceiverConfig::BackendType>(state[0]),
+            nb::cast<std::optional<size_t>>(state[1]), nb::cast<std::optional<int>>(state[2]));
     };
 
     nb::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -464,12 +464,15 @@ void initConfigBindings(nb::module_& m)
         });
 
     nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
-            nb::arg("backend") = std::nullopt, nb::arg("max_tokens_in_buffer") = std::nullopt)
+        .def(nb::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
+                 std::optional<int>>(), nb::arg("backend") = std::nullopt,
+            nb::arg("max_tokens_in_buffer") = std::nullopt, nb::arg("kv_transfer_timeout_ms") = std::nullopt)
         .def_prop_rw(
             "backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
         .def_prop_rw("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
             &tle::CacheTransceiverConfig::setMaxTokensInBuffer)
+        .def_prop_rw("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
+            &tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
         .def("__getstate__", cacheTransceiverConfigGetstate)
         .def("__setstate__", cacheTransceiverConfigSetstate);
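
For reference, a minimal usage sketch of the extended binding surface. This is not part of the commit; it assumes the bindings are importable as `tensorrt_llm.bindings.executor`, while the keyword names come from the `nb::arg` declarations above.

```python
# Minimal sketch, assuming the module path tensorrt_llm.bindings.executor.
from tensorrt_llm.bindings import executor as tle

cfg = tle.CacheTransceiverConfig(
    backend=tle.CacheTransceiverBackendType.UCX,
    max_tokens_in_buffer=8192,
    kv_transfer_timeout_ms=5000,  # cancel transfers stuck for more than 5 s
)
assert cfg.kv_transfer_timeout_ms == 5000
cfg.kv_transfer_timeout_ms = None  # read-write property via def_prop_rw
```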

cpp/tensorrt_llm/pybind/executor/executorConfig.cpp

Lines changed: 9 additions & 6 deletions
@@ -415,15 +415,15 @@ void initConfigBindings(pybind11::module_& m)
         .def(py::pickle(guidedDecodingConfigGetstate, guidedDecodingConfigSetstate));
 
     auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
-    { return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
+    { return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer(), self.getKvTransferTimeoutMs()); };
     auto cacheTransceiverConfigSetstate = [](py::tuple const& state)
     {
-        if (state.size() != 2)
+        if (state.size() != 3)
         {
             throw std::runtime_error("Invalid CacheTransceiverConfig state!");
         }
-        return tle::CacheTransceiverConfig(
-            state[0].cast<tle::CacheTransceiverConfig::BackendType>(), state[1].cast<std::optional<size_t>>());
+        return tle::CacheTransceiverConfig(state[0].cast<tle::CacheTransceiverConfig::BackendType>(),
+            state[1].cast<std::optional<size_t>>(), state[2].cast<std::optional<int>>());
     };
 
     py::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
@@ -446,12 +446,15 @@ void initConfigBindings(pybind11::module_& m)
         });
 
     py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
-            py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt)
+        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>,
+                 std::optional<int>>(), py::arg("backend") = std::nullopt,
+            py::arg("max_tokens_in_buffer") = std::nullopt, py::arg("kv_transfer_timeout_ms") = std::nullopt)
         .def_property(
             "backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
         .def_property("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
             &tle::CacheTransceiverConfig::setMaxTokensInBuffer)
+        .def_property("kv_transfer_timeout_ms", &tle::CacheTransceiverConfig::getKvTransferTimeoutMs,
+            &tle::CacheTransceiverConfig::setKvTransferTimeoutMs)
         .def(py::pickle(cacheTransceiverConfigGetstate, cacheTransceiverConfigSetstate));
 
     auto executorConfigGetState = [](py::object const& self)
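
Because the pickle state now carries three fields, a round-trip sketch looks like this (same module-path assumption as the sketch above):

```python
# Minimal sketch: the extended getstate/setstate round-trips the new field.
import pickle

from tensorrt_llm.bindings import executor as tle

cfg = tle.CacheTransceiverConfig(backend=tle.CacheTransceiverBackendType.NIXL,
                                 max_tokens_in_buffer=4096,
                                 kv_transfer_timeout_ms=2000)
restored = pickle.loads(pickle.dumps(cfg))
assert restored.kv_transfer_timeout_ms == 2000
```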

examples/disaggregated/README.md

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,9 @@ cache_transceiver_config:
   backend: <str>
   # KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance.
   max_tokens_in_buffer: <int>
+  # KV cache transfer timeout in milliseconds.
+  # Requests that do not finish sending or receiving the KV cache within this window are cancelled and cleaned up.
+  kv_transfer_timeout_ms: <int>
 ```
 
 The following is an example, consisting of the `ctx_extra-llm-api-config.yaml` and `gen_extra-llm-api-config.yaml` files needed in the sections below.

tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py

Lines changed: 2 additions & 0 deletions
@@ -109,6 +109,8 @@ def __init__(self, mapping: Mapping, dist: Distributed,
         # get the layer num per pp rank, which is required by cache transceiver.
         pp_layer_num = len(kv_cache_manager.pp_layers)
         pp_layer_num_per_pp_rank = dist.pp_allgather(pp_layer_num)
+
+        self.kv_transfer_timeout_ms = cache_transceiver_config.kv_transfer_timeout_ms
         self.impl = CacheTransceiverCpp(kv_cache_manager.impl,
                                         total_num_kv_heads_per_layer, head_dim,
                                         tokens_per_block, world_config,

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 2 additions & 0 deletions
@@ -442,6 +442,8 @@ def __init__(
         self.py_lora_task_layer_module_configs: list[
             tensorrt_llm.bindings.internal.runtime.
             TaskLayerModuleConfig] | None = None
+        self.py_kv_transfer_start_time = None
+        self.py_kv_transfer_timed_out = False
 
         self.py_num_logprobs = num_logprobs
         self.py_return_log_probs = return_log_probs

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 56 additions & 0 deletions
@@ -978,6 +978,7 @@ def _executor_loop_pp(self):
                     self.micro_batches[prev_microbatch_id] = None
 
                 if self.kv_cache_transceiver and self.ctx_in_transmission_requests:
+                    self._check_kv_transfer_timeout()
                     self._terminate_ctx_finished_requests()
 
                 if self._disagg_pp_termination_handler is not None:
@@ -1006,6 +1007,7 @@ def _prepare_and_schedule_batch(self):
 
         if self.kv_cache_transceiver:
             self._check_disagg_gen_transfer_status()
+            self._check_kv_transfer_timeout()
 
         iter_stats = None
         if self.enable_iter_perf_stats:
@@ -1179,6 +1181,7 @@ def _executor_loop(self):
                 self._add_kv_cache_events()
 
                 if self.kv_cache_transceiver and self.ctx_in_transmission_requests:
+                    self._check_kv_transfer_timeout()
                     self._terminate_ctx_finished_requests()
 
                 self._kv_connector_terminate_requests()
@@ -1364,6 +1367,7 @@ def _executor_loop_overlap(self):
                     ctx_transmission_reqs=ctx_transmission_reqs)
 
                 if self.kv_cache_transceiver and self.ctx_in_transmission_requests:
+                    self._check_kv_transfer_timeout()
                     self._terminate_ctx_finished_requests()
 
                 self._kv_connector_terminate_requests()
@@ -1572,6 +1576,38 @@ def _check_disagg_gen_transfer_status(self):
 
         return
 
+    @nvtx_range("_check_kv_transfer_timeout")
+    def _check_kv_transfer_timeout(self):
+        if not self.kv_cache_transceiver:
+            return
+        timeout_ms = self.kv_cache_transceiver.kv_transfer_timeout_ms
+        if timeout_ms is None or timeout_ms <= 0:
+            return
+
+        current_time = time.time()
+
+        # ctx_in_transmission_requests holds (request, block_id) pairs.
+        for req, _ in self.ctx_in_transmission_requests:
+            if req.py_kv_transfer_start_time is None:
+                continue
+            elapsed_time = (current_time -
+                            req.py_kv_transfer_start_time) * 1000
+            if elapsed_time > timeout_ms and not req.py_kv_transfer_timed_out:
+                logger.warning(
+                    f"Terminating context request {req.py_request_id} due to KV cache transfer timeout"
+                )
+                req.py_kv_transfer_timed_out = True
+
+        for req in self.active_requests:
+            if req.is_disagg_generation_transmission_in_progress and req.py_kv_transfer_start_time is not None:
+                elapsed_time = (current_time -
+                                req.py_kv_transfer_start_time) * 1000
+                if elapsed_time > timeout_ms and not req.py_kv_transfer_timed_out:
+                    logger.warning(
+                        f"Terminating generation request {req.py_request_id} due to KV cache transfer timeout"
+                    )
+                    req.py_kv_transfer_timed_out = True
+
+        return
+
     @nvtx_range("_pad_attention_dp_dummy_request")
     def _pad_attention_dp_dummy_request(self):
         """
@@ -1646,6 +1682,7 @@ def _prepare_disagg_gen_transmission_complete(self, scheduled_batch):
             req.context_current_position = req.prompt_len
             req.decoding_iter = 1
             req.py_decoding_iter = 1
+            req.py_kv_transfer_start_time = None
             first_gen_tokens = req.context_phase_params.first_gen_tokens
             ctx_draft_tokens = req.context_phase_params.draft_tokens
             req.py_draft_tokens = [] if ctx_draft_tokens is None else ctx_draft_tokens
@@ -1669,6 +1706,11 @@ def _recv_disagg_gen_cache(self, new_gen_reqs):
         for req in new_gen_reqs:
             self.kv_cache_transceiver.request_and_receive_async(req)
 
+        if self.kv_cache_transceiver.kv_transfer_timeout_ms is not None:
+            for req in new_gen_reqs:
+                if req.state == LlmRequestState.DISAGG_GENERATION_TRANS_IN_PROGRESS:
+                    req.py_kv_transfer_start_time = time.time()
+
         block_transfer = all([
             req.is_disagg_generation_transmission_in_progress
             for req in self.active_requests
@@ -1701,6 +1743,11 @@ def _send_disagg_ctx_cache(self, scheduled_ctx_requests):
             if req.state == LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS
         ]
 
+        if self.kv_cache_transceiver.kv_transfer_timeout_ms is not None:
+            for req in ctx_in_transmission_requests:
+                if req.state == LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS:
+                    req.py_kv_transfer_start_time = time.time()
+
         return ctx_transmission_reqs
 
     def _get_disagg_reqs_in_error_state(self):
@@ -2018,6 +2065,12 @@ def _handle_responses(self):
                 requests_to_terminate.append(request)
                 continue
 
+            # Check if the generation request needs cleanup due to a KV cache transfer timeout.
+            if request.py_kv_transfer_timed_out:
+                # Previously this went through _handle_errors, which sends an error
+                # response; we should decide how the timeout should be reported now.
+                self.kv_cache_transceiver.cancel_request(request)
+
             if request.is_generation_only_request():
                 # If the request is in transmission, we don't need to emit a response.
                 # Also, for the first iteration with overlap, we should skip since first
@@ -2068,6 +2121,9 @@ def _handle_responses(self):
     def _terminate_ctx_finished_requests(self):
        for request, block_id in self.ctx_in_transmission_requests[:]:
            if request.is_disagg_context_complete_state:
+                if request.py_kv_transfer_timed_out:
+                    request.py_kv_transfer_start_time = None
+                    self.kv_cache_transceiver.cancel_request(request)
                if not self.block_reuse_enabled or self.kv_cache_manager.is_vswa:
                    self._terminate_request(request)
                else:
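
The timeout machinery above reduces to a stamp-then-poll pattern: stamp `time.time()` when a transfer enters the in-progress state, flag the request once the elapsed milliseconds exceed the configured budget, and let the response-handling path call `cancel_request`. A self-contained sketch, where `_Request` is a hypothetical stand-in for `LlmRequest`:

```python
# Self-contained sketch of the stamp-then-poll pattern; _Request is a
# hypothetical stand-in that models only the two py_kv_* attributes
# introduced in llm_request.py.
import time


class _Request:

    def __init__(self, request_id):
        self.py_request_id = request_id
        self.py_kv_transfer_start_time = None  # stamped when the transfer starts
        self.py_kv_transfer_timed_out = False  # flipped once past the deadline


def check_kv_transfer_timeout(requests, timeout_ms):
    """Flag requests whose KV transfer has run longer than timeout_ms."""
    if timeout_ms is None or timeout_ms <= 0:
        return
    now = time.time()
    for req in requests:
        if req.py_kv_transfer_start_time is None or req.py_kv_transfer_timed_out:
            continue
        if (now - req.py_kv_transfer_start_time) * 1000 > timeout_ms:
            req.py_kv_transfer_timed_out = True  # cancellation happens later


req = _Request(42)
req.py_kv_transfer_start_time = time.time() - 10  # pretend it started 10 s ago
check_kv_transfer_timeout([req], timeout_ms=5000)
assert req.py_kv_transfer_timed_out
```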

tensorrt_llm/llmapi/llm_args.py

Lines changed: 8 additions & 1 deletion
@@ -1286,10 +1286,17 @@ class CacheTransceiverConfig(StrictBaseModel, PybindMirror):
         default=None,
         description="The max number of tokens the transfer buffer can fit.")
 
+    kv_transfer_timeout_ms: Optional[int] = Field(
+        default=None,
+        description=
+        "Timeout in milliseconds for KV cache transfer. Requests exceeding this timeout will be cancelled."
+    )
+
     def _to_pybind(self):
         return _CacheTransceiverConfig(
             backend=_CacheTransceiverBackendType.from_string(self.backend),
-            max_tokens_in_buffer=self.max_tokens_in_buffer)
+            max_tokens_in_buffer=self.max_tokens_in_buffer,
+            kv_transfer_timeout_ms=self.kv_transfer_timeout_ms)
 
 
 @dataclass
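
A hedged usage sketch for the new llmapi field; the import path is taken from this diff, while the backend string and values are illustrative:

```python
# Minimal sketch: populating the new field on the llmapi config.
# _to_pybind() forwards all three fields to the executor-side config.
from tensorrt_llm.llmapi.llm_args import CacheTransceiverConfig

cfg = CacheTransceiverConfig(
    backend="UCX",
    max_tokens_in_buffer=8192,
    kv_transfer_timeout_ms=5000,
)
```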
