NVIDIA
diff --git a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
Lines changed: 4 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
Lines changed: 13 additions & 0 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
Lines changed: 174 additions & 9 deletions
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h
Lines changed: 22 additions & 0 deletions
@@ -71,6 +71,8 @@ class BaseCacheTransceiver
     virtual void checkGenTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) = 0;
 
     [[nodiscard]] virtual bool checkGenTransferComplete() const = 0;
+
+    virtual bool cancelRequest(LlmRequest* llmRequest) = 0; // Overridden by CacheTransceiver; result = "was cancelled".
 };
 
 class CacheTransceiver : public BaseCacheTransceiver
@@ -111,6 +113,8 @@ class CacheTransceiver : public BaseCacheTransceiver
 
     [[nodiscard]] bool checkGenTransferComplete() const override;
 
+    /// @brief Try to cancel a pending cache transfer for the given request.
+    /// @param llmRequest The request to cancel. It is dereferenced without a null check, so it must not be null.
+    /// @return Whether the request is cancelled.
+    bool cancelRequest(LlmRequest* llmRequest) override;
+
 private:
     void initializeCommState();
 
 
@@ -572,4 +572,17 @@ bool CacheTransceiver::checkGenTransferComplete() const
     return mRequesterFutures.empty();
 }
 
+/// Route a cancellation to the side of the transfer that owns the request.
+/// Returns the sender's / receiver's result, or false for any other request kind.
+bool CacheTransceiver::cancelRequest(LlmRequest* llmRequest)
+{
+    LlmRequest& request = *llmRequest;
+
+    // Context-only requests are handled by the cache sender.
+    if (request.isContextOnlyRequest())
+    {
+        return mCacheSender->cancelRequest(request);
+    }
+
+    // Generation-only requests are handled by the cache receiver.
+    if (request.isGenerationOnlyRequest())
+    {
+        return mCacheReceiver->cancelRequest(request);
+    }
+
+    // Neither context-only nor generation-only: nothing to cancel here.
+    return false;
+}
+
 } // namespace tensorrt_llm::batch_manager
@@ -348,6 +348,48 @@ class CacheSender::Impl
         mFormatter->format(session);
     }
 
+    /// @brief Mark a queued request as cancelled.
+    /// @details The request ID is added to mCancelledRequests only when a response for it is present in
+    /// mReadyResponses and it is not the request reported by getCurrentRequestId() while isSending() is true.
+    /// The entry in mReadyResponses itself is left untouched here.
+    /// @param llmRequest The request to cancel; only its mRequestId is read.
+    /// @return true if the request was recorded as cancelled, false (after logging a warning) otherwise.
+    bool cancelRequest(LlmRequest const& llmRequest)
+    {
+        std::unique_lock senderLock(mSenderMutex);
+
+        bool const isQueued = mReadyResponses.find(llmRequest.mRequestId) != mReadyResponses.end();
+        // Short-circuits in the same order as before: the sending state is only queried for a queued request,
+        // and the current request ID is only read while a send is in progress.
+        bool const isInFlight = isQueued && isSending() && getCurrentRequestId() == llmRequest.mRequestId;
+
+        if (!isQueued || isInFlight)
+        {
+            TLLM_LOG_WARNING("Cannot cancel request %zu", llmRequest.mRequestId);
+            return false;
+        }
+
+        mCancelledRequests.insert(llmRequest.mRequestId);
+        return true;
+    }
+
+    /// @brief Send the ready flag for a request over every connection of its transfer session.
+    /// @param requestId Key into mRequestToSession; a missing entry fails the TLLM_CHECK below.
+    /// @param isReady Flag value sent to the peer on each connection.
+    void sendReadySignal(LlmRequest::RequestIdType requestId, bool isReady)
+    {
+        auto it = mRequestToSession.find(requestId);
+        TLLM_CHECK(it != mRequestToSession.end());
+        auto& session = it->second;
+        // Bind by const reference so the connection list is not copied when the getter returns a reference.
+        auto const& connections = session.getConnections();
+
+        // mManager is not changed inside the loop, so its dynamic type is checked once rather than per connection.
+        bool const useAgentConnection
+            = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager) != nullptr;
+
+        for (auto* connection : connections)
+        {
+            if (useAgentConnection)
+            {
+                // With an agent connection manager every connection is required to be an AgentConnection.
+                auto* agentConnection = dynamic_cast<executor::kv_cache::AgentConnection const*>(connection);
+                TLLM_CHECK(agentConnection != nullptr);
+                agentConnection->sendReadySignal(
+                    executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG}, isReady);
+            }
+            else
+            {
+                // Other connection types send the flag as raw bytes tagged with kREADY_SIGNAL_TAG.
+                connection->send(
+                    executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG}, &isReady, sizeof(isReady));
+            }
+        }
+    }
+
     ~Impl()
     {
         terminate();
@@ -391,20 +433,54 @@ class CacheSender::Impl
         {
             mRemainSendCount.erase(reqId);
 
-            // TODO(zhengd): pass the hashes directly instead of update llmRequest
-            auto llmRequest = it->second.mRequest;
-            llmRequest->setRequestedBlockHashes(std::move(blockHashes));
+            // Check if the request is cancelled
+            bool isReady = true;
+            {
+                std::unique_lock lk(mSenderMutex);
+                if (mCancelledRequests.find(reqId) != mCancelledRequests.end())
+                {
+                    isReady = false;
+                }
+            }
+            sendReadySignal(reqId, isReady);
 
-            if (common::getEnvParallelCacheSend())
+            if (isReady)
             {
-                // TODO: Use a thread pool and check for thread safety.
-                std::thread(&CacheSender::Impl::sendAndRemoveResponse, this, it->first, std::move(it->second)).detach();
+                // TODO(zhengd): pass the hashes directly instead of update llmRequest
+                auto llmRequest = it->second.mRequest;
+                llmRequest->setRequestedBlockHashes(std::move(blockHashes));
+
+                if (common::getEnvParallelCacheSend())
+                {
+                    // TODO: Use a thread pool and check for thread safety.
+                    std::thread(&CacheSender::Impl::sendAndRemoveResponse, this, it->first, std::move(it->second))
+                        .detach();
+                }
+                else
+                {
+                    CacheSender::Impl::sendAndRemoveResponse(it->first, std::move(it->second));
+                }
+                removeResponse(it);
             }
             else
             {
-                CacheSender::Impl::sendAndRemoveResponse(it->first, std::move(it->second));
+                // TODO: if the generation does not require the kv cache, the request will
+                // not be removed from mCancelledRequests. This should be handled by timeout.
+                auto it = mReadyResponses.find(mCurrentRequest.value());
+                {
+                    std::unique_lock lkResp(mSenderMutex);
+                    mReadyResponses.erase(it);
+                    mCancelledRequests.erase(mCurrentRequest.value());
+                    mRemainSendCount.erase(mCurrentRequest.value());
+                }
+                mCurrentRequest = std::nullopt;
+
+                if (mReadyResponses.empty())
+                {
+                    std::unique_lock lk(mCondMutex);
+                    mAnyReady = false;
+                }
             }
-            removeResponse(it);
         }
         mCurrentRequest = std::nullopt;
     }
@@ -433,7 +509,11 @@ class CacheSender::Impl
                     auto reqId = requestInfo.getRequestId();
                     blockHashes = requestInfo.getBlockHashes();
 
-                    mCurrentRequest = reqId;
+                    {
+                        std::unique_lock lk(mSenderMutex);
+                        mCurrentRequest = reqId;
+                    }
+
                     if (mRemainSendCount.find(reqId) == mRemainSendCount.end())
                     {
                         mRemainSendCount[reqId] = getCounterpartsCount(reqId);
@@ -513,6 +593,7 @@ class CacheSender::Impl
 
 private:
     std::optional<RequestIdType> mCurrentRequest;
+    std::set<LlmRequest::RequestIdType> mCancelledRequests;
     std::map<RequestIdType, Response> mReadyResponses;
     std::mutex mSenderMutex, mCondMutex;
     std::atomic<bool> mAnyReady{false}, mTerminate{false};
@@ -685,6 +766,62 @@ class CacheReceiver::Impl
         connection->send(executor::kv_cache::DataContext{TransceiverTag::kINFO_TAG}, serializedInfo.data(), infoSize);
     }
 
+    /// @brief Remove a request that is still in the pending queue (mRequestsQueue) of its async resource.
+    /// @details The async resource is looked up under the key "default", or under the string form of the
+    /// request's comm state when getEnvRequestKVCacheConcurrent() is enabled. A key with no async resource is
+    /// reported as "cannot cancel" instead of throwing std::out_of_range.
+    /// NOTE(review): erasing the queue entry destroys its RequestAndPromise; confirm the holder of the
+    /// matching future tolerates that.
+    /// @param llmRequest The request to cancel; matched against queued requests by mRequestId.
+    /// @return true if the request was found and removed, false (after logging a warning) otherwise.
+    bool cancelRequest(LlmRequest const& llmRequest)
+    {
+        std::string processInfo = "default";
+        if (common::getEnvRequestKVCacheConcurrent())
+        {
+            processInfo = llmRequest.getDataTransceiverState().getCommState()->toString();
+        }
+
+        // Use find() rather than at(): a request with no async resource for its key is simply not cancellable.
+        auto const resourceIt = mInstanceToAsyncResource.find(processInfo);
+        if (resourceIt == mInstanceToAsyncResource.end())
+        {
+            TLLM_LOG_WARNING("Cannot cancel request %zu", llmRequest.mRequestId);
+            return false;
+        }
+
+        auto& asyncResource = resourceIt->second;
+        std::unique_lock<std::mutex> lck(asyncResource->mMtxForQueue);
+        auto& pendingRequests = asyncResource->mRequestsQueue;
+        auto it = std::find_if(pendingRequests.begin(), pendingRequests.end(),
+            [&llmRequest](RequestAndPromise const& requestAndPromise)
+            { return requestAndPromise.mRequest->mRequestId == llmRequest.mRequestId; });
+        if (it == pendingRequests.end())
+        {
+            TLLM_LOG_WARNING("Cannot cancel request %zu", llmRequest.mRequestId);
+            return false;
+        }
+
+        pendingRequests.erase(it);
+        return true;
+    }
+
+    /// @brief Receive the ready flag from every connection of the session and combine the results.
+    /// @param session Transfer session whose connections are read.
+    /// @return true only if every connection reported ready (also true when the session has no connections).
+    bool receiveReadySignal(TransferSession& session)
+    {
+        bool isReadyFinal = true;
+        bool isReady = false;
+        // Bind by const reference so the connection list is not copied when the getter returns a reference.
+        auto const& connections = session.getConnections();
+
+        // mManager is not changed inside the loop, so its dynamic type is checked once rather than per connection.
+        bool const useAgentConnection
+            = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager) != nullptr;
+
+        // There is deliberately no early exit: every connection is read even after one reports "not ready"
+        // (presumably so that no ready message is left unread on a connection; confirm with the sender side).
+        for (auto* connection : connections)
+        {
+            if (useAgentConnection)
+            {
+                // With an agent connection manager every connection is required to be an AgentConnection.
+                auto* agentConnection = dynamic_cast<executor::kv_cache::AgentConnection const*>(connection);
+                TLLM_CHECK(agentConnection != nullptr);
+                isReady = agentConnection->recvReadySignal(
+                    executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG});
+            }
+            else
+            {
+                // Other connection types receive the flag as raw bytes tagged with kREADY_SIGNAL_TAG.
+                connection->recv(
+                    executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG}, &isReady, sizeof(isReady));
+            }
+            isReadyFinal &= isReady;
+        }
+
+        return isReadyFinal;
+    }
+
     ~Impl()
     {
         for (auto&& [processInfo, asyncResource] : mInstanceToAsyncResource)
@@ -707,6 +844,14 @@ class CacheReceiver::Impl
         llmRequest.setKvCacheTransferStart(std::chrono::steady_clock::now());
         TLLM_CUDA_CHECK(cudaSetDevice(mDeviceId));
         auto session = sendRequestInfo(llmRequest);
+        bool isReady = receiveReadySignal(session);
+        if (!isReady)
+        {
+            // Reuse the error state for the cancelled request.
+            llmRequest.setState(LlmRequestState::kDISAGG_TRANS_ERROR);
+            llmRequest.setKvCacheTransferEnd(std::chrono::steady_clock::now());
+            return;
+        }
         receiveSync(session);
         llmRequest.setKvCacheTransferEnd(std::chrono::steady_clock::now());
 
@@ -876,6 +1021,16 @@ RequestInfo CacheSender::recvRequestInfo()
     return mImpl->recvRequestInfo();
 }
 
+bool CacheSender::cancelRequest(LlmRequest const& llmRequest)
+{
+    return mImpl->cancelRequest(llmRequest); // Forwards to CacheSender::Impl::cancelRequest.
+}
+
+void CacheSender::sendReadySignal(LlmRequest::RequestIdType requestId, bool isReady)
+{
+    mImpl->sendReadySignal(requestId, isReady); // Forwards to CacheSender::Impl::sendReadySignal.
+}
+
 CacheReceiver::CacheReceiver(executor::kv_cache::ConnectionManager* manager,
     executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter)
     : mImpl{std::unique_ptr<Impl, ImplDeleter>(new Impl(manager, selfCacheState, selfIndex, std::move(formatter)))}
@@ -899,4 +1054,14 @@ void CacheReceiver::receiveSync(TransferSession& session)
     mImpl->receiveSync(session);
 }
 
+bool CacheReceiver::cancelRequest(LlmRequest const& llmRequest)
+{
+    return mImpl->cancelRequest(llmRequest); // Forwards to CacheReceiver::Impl::cancelRequest.
+}
+
+bool CacheReceiver::receiveReadySignal(TransferSession& session)
+{
+    return mImpl->receiveReadySignal(session); // Forwards to CacheReceiver::Impl::receiveReadySignal.
+}
+
 } // namespace tensorrt_llm::batch_manager
@@ -122,6 +122,7 @@ struct TransceiverTag
     static constexpr int32_t kID_TAG{19};
     static constexpr int32_t kINFO_SIZE_TAG{22};
     static constexpr int32_t kINFO_TAG{32};
+    static constexpr int32_t kREADY_SIGNAL_TAG{42};
 };
 
 // Used to store the information that needs to be sent to the context executor to ensure the generation
@@ -207,6 +208,16 @@ class CacheSender
     /// @param llmRequest The request object to which the data belongs.
     virtual RequestInfo recvRequestInfo();
 
+    /// @brief Cancel a request whose response is queued and is not the one currently being sent.
+    /// @param llmRequest The request to cancel; it is identified by its mRequestId.
+    /// @return Whether the request is cancelled; false (with a logged warning) when it cannot be cancelled.
+    virtual bool cancelRequest(LlmRequest const& llmRequest);
+
+    /// @brief Send the ready signal for a request over every connection of its transfer session.
+    /// @param requestId The ID used in the context phase of the current request.
+    /// @param isReady Whether the request is ready to be received; the sender passes false for a cancelled request.
+    virtual void sendReadySignal(LlmRequest::RequestIdType requestId, bool isReady);
+
     /// @brief Destructor.
     virtual ~CacheSender();
 
@@ -239,6 +250,17 @@ class CacheReceiver
     virtual TransferSession sendRequestInfo(LlmRequest const& llmRequest);
 
     virtual void receiveSync(TransferSession& session);
+
+    /// @brief Cancel a request by removing it from the queue of pending requests, if it is still there.
+    /// @param llmRequest Request object; it is matched against queued requests by its mRequestId.
+    /// @return Whether the request is cancelled; false (with a logged warning) when it is not in the queue.
+    virtual bool cancelRequest(LlmRequest const& llmRequest);
+
+    /// @brief Receive the ready signal from every connection of the session.
+    /// @param session The session object whose connections are read.
+    /// @return Whether the request is ready to be received, i.e. true only if every connection reported ready.
+    virtual bool receiveReadySignal(TransferSession& session);
+
     /// @brief Destructor.
     virtual ~CacheReceiver();