@@ -238,6 +238,14 @@ void CacheTransceiver::setContextState(LlmRequest* llmRequest)
238238void CacheTransceiver::respondAndSendAsync (LlmRequest* llmRequest)
239239{
240240 TLLM_CHECK (llmRequest && llmRequest->isContextOnlyRequest ());
241+ // TEST HOOK: Skip creating responder future for a specific request ID to validate HOL blocking theory
242+ if (llmRequest->mRequestId == 2049 )
243+ {
244+ llmRequest->setState (LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS );
245+ TLLM_LOG_WARNING (" TEST: Skipping responder future for context request 2049" );
246+ setContextState (llmRequest);
247+ return ;
248+ }
241249 llmRequest->setState (LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS );
242250 // If context phase params is already set, it means that the KV cache
243251 // transfer is already in progress.
@@ -252,6 +260,8 @@ void CacheTransceiver::respondAndSendAsync(LlmRequest* llmRequest)
252260 setContextState (llmRequest);
253261 auto future = mDataResponder ->respondAndSendAsync (*llmRequest);
254262 mResponderFutures .emplace_back (llmRequest, std::move (future));
263+ TLLM_LOG_DEBUG (" respondAndSendAsync: enqueued context request %ld, mResponderFutures.size()=%zu" ,
264+ llmRequest->mRequestId , mResponderFutures .size ());
255265}
256266
257267void CacheTransceiver::respondAndSendLayerWise (
@@ -301,20 +311,34 @@ void CacheTransceiver::requestAndReceiveAsync(LlmRequest* llmRequest)
301311std::vector<LlmRequest::RequestIdType> gatherRequestIds (
302312 mpi::MpiComm const & mpiComm, std::vector<LlmRequest::RequestIdType> const & requestIds)
303313{
314+ TLLM_LOG_DEBUG (" gatherRequestIds: Entry - rank %d, localSize=%zu, worldSize=%d" , mpiComm.getRank (),
315+ requestIds.size (), mpiComm.getSize ());
316+
304317 int localSize = static_cast <int >(requestIds.size ());
305318 std::vector<int > sizes (mpiComm.getSize ());
319+
320+ TLLM_LOG_DEBUG (" gatherRequestIds: Starting allgather for sizes" );
306321 mpiComm.allgather (&localSize, sizes.data (), 1 , mpi::MpiType::kINT32 );
322+ TLLM_LOG_DEBUG (" gatherRequestIds: allgather for sizes completed" );
323+
307324 // std::vector<LlmRequest::RequestIdType> all_data(total_size);
308325 std::vector<int > displs (mpiComm.getSize ());
309326 int totalSize = 0 ;
310327 for (int i = 0 ; i < mpiComm.getSize (); i++)
311328 {
312329 displs[i] = totalSize;
313330 totalSize += sizes[i];
331+ TLLM_LOG_DEBUG (" gatherRequestIds: Rank %d has %d request IDs" , i, sizes[i]);
314332 }
333+
334+ TLLM_LOG_DEBUG (" gatherRequestIds: Total size across all ranks: %d" , totalSize);
315335 std::vector<LlmRequest::RequestIdType> retData (totalSize);
336+
337+ TLLM_LOG_DEBUG (" gatherRequestIds: Starting allgatherv for request IDs" );
316338 mpiComm.allgatherv (requestIds.data (), static_cast <int >(requestIds.size ()), mpi::MpiType::kUINT64 , retData.data (),
317339 sizes, displs, mpi::MpiType::kUINT64 );
340+ TLLM_LOG_DEBUG (" gatherRequestIds: allgatherv for request IDs completed, returning %zu total IDs" , retData.size ());
341+
318342 return retData;
319343}
320344
@@ -370,72 +394,153 @@ void updateKVCacheTransferBW(mpi::MpiComm const& mpiComm, LlmRequest* request)
370394
371395void CacheTransceiver::checkContextTransferStatus (std::optional<int > const & atLeastRequestNum)
372396{
397+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Entry with atLeastRequestNum=%s, mResponderFutures.size()=%zu" ,
398+ atLeastRequestNum.has_value () ? std::to_string (atLeastRequestNum.value ()).c_str () : " nullopt" ,
399+ mResponderFutures .size ());
400+
401+ // Dump current responder future queue order for diagnostics
402+ {
403+ std::ostringstream oss;
404+ oss << " [" ;
405+ bool first = true ;
406+ for (auto const & pair : mResponderFutures )
407+ {
408+ if (!first)
409+ {
410+ oss << " , " ;
411+ }
412+ first = false ;
413+ oss << pair.first ->mRequestId ;
414+ }
415+ oss << " ]" ;
416+ TLLM_LOG_DEBUG (" checkContextTransferStatus: mResponderFutures order: %s" , oss.str ().c_str ());
417+ }
418+
373419 bool blockAll = !atLeastRequestNum.has_value ();
374420 auto syncComm = mCacheState ->getParallelConfig ().mEnableAttentionDP ? mMpiGroupTPInDPComm : mMpiGroupTensorParaComm ;
421+
422+ TLLM_LOG_DEBUG (" checkContextTransferStatus: blockAll=%s, syncComm=%s, syncComm.size=%d" ,
423+ blockAll ? " true" : " false" , syncComm ? " valid" : " null" , syncComm ? syncComm->getSize () : 0 );
424+
375425 std::vector<LlmRequest::RequestIdType> contextCompleteRequestIds;
426+ TLLM_LOG_DEBUG (
427+ " checkContextTransferStatus: Checking %zu responder futures for completion" , mResponderFutures .size ());
428+
376429 for (auto && [request, future] : mResponderFutures )
377430 {
431+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Checking request %ld future status" , request->mRequestId );
378432 if (future.wait_for (std::chrono::milliseconds (0 )) == std::future_status::ready)
379433 {
434+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Request %ld is ready" , request->mRequestId );
380435 contextCompleteRequestIds.push_back (request->mRequestId );
381436 }
437+ else
438+ {
439+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Request %ld is not ready" , request->mRequestId );
440+ }
382441 }
383442
443+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Found %zu ready requests" , contextCompleteRequestIds.size ());
444+
384445 std::unordered_map<LlmRequest::RequestIdType, int > frequencyMap;
385446 if ((syncComm) && syncComm->getSize () > 1 )
386447 {
448+ TLLM_LOG_DEBUG (
449+ " checkContextTransferStatus: Gathering request IDs across %d ranks via MPI" , syncComm->getSize ());
387450 auto gatherRequestIdVec = gatherRequestIds (*syncComm, contextCompleteRequestIds);
451+ TLLM_LOG_DEBUG (" checkContextTransferStatus: MPI gather completed, received %zu total request IDs" ,
452+ gatherRequestIdVec.size ());
453+
388454 for (auto && requestId : gatherRequestIdVec)
389455 {
390456 frequencyMap[requestId]++;
391457 }
458+ TLLM_LOG_DEBUG (
459+ " checkContextTransferStatus: Built frequency map with %zu unique request IDs" , frequencyMap.size ());
392460 }
393461 else
394462 {
463+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Single rank mode, building frequency map locally" );
395464 for (auto && requestId : contextCompleteRequestIds)
396465 {
397466 frequencyMap[requestId]++;
398467 }
468+ TLLM_LOG_DEBUG (
469+ " checkContextTransferStatus: Local frequency map built with %zu unique request IDs" , frequencyMap.size ());
399470 }
400471 std::vector<std::pair<LlmRequest::RequestIdType, int >> freqVec (frequencyMap.begin (), frequencyMap.end ());
401472
402473 std::sort (freqVec.begin (), freqVec.end (),
403474 [](std::pair<LlmRequest::RequestIdType, int > const & left,
404475 std::pair<LlmRequest::RequestIdType, int > const & right) { return left.second > right.second ; });
476+
477+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Sorted frequency vector, processing %zu entries" , freqVec.size ());
478+
405479 std::unordered_set<LlmRequest::RequestIdType> toCompleteIdSet;
480+ int expectedFreq = (syncComm) ? syncComm->getSize () : 1 ;
481+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Expected frequency for completion: %d" , expectedFreq);
482+
406483 for (auto && [requestId, freq] : freqVec)
407484 {
408- if (freq == ((syncComm) ? syncComm->getSize () : 1 ))
485+ TLLM_LOG_DEBUG (
486+ " checkContextTransferStatus: Request %ld has frequency %d (expected %d)" , requestId, freq, expectedFreq);
487+ if (freq == expectedFreq)
409488 {
410489 toCompleteIdSet.insert (requestId);
490+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Request %ld added to completion set (freq match)" , requestId);
411491 }
412492 }
413493
494+ TLLM_LOG_DEBUG (" checkContextTransferStatus: toCompleteIdSet.size()=%zu, atLeastRequestNum=%d" ,
495+ toCompleteIdSet.size (), atLeastRequestNum.value_or (0 ));
496+
414497 // Make sure there are at least atLeastRequestNum requests in toCompleteIdSet.
415498 // This will preserve the order of insertion for KVCache transfer requests.
416499 for (auto it = mResponderFutures .begin ();
417500 atLeastRequestNum.value_or (0 ) > static_cast <int >(toCompleteIdSet.size ()) && it != mResponderFutures .end ();
418501 ++it)
419502 {
420503 auto & [request, future] = *it;
504+ TLLM_LOG_DEBUG (
505+ " checkContextTransferStatus: Adding request %ld to completion set (min requirement)" , request->mRequestId );
421506 toCompleteIdSet.insert (request->mRequestId );
422507 }
423508
509+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Final toCompleteIdSet.size()=%zu" , toCompleteIdSet.size ());
510+
424511 // Complete all the requests in toCompleteIdSet
512+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Starting completion phase, blockAll=%s" , blockAll ? " true" : " false" );
513+
514+ size_t completedCount = 0 ;
425515 for (auto it = mResponderFutures .begin (); it != mResponderFutures .end ();)
426516 {
427517 auto & [request, future] = *it;
428- if (blockAll || (toCompleteIdSet.find (request->mRequestId ) != toCompleteIdSet.end ()))
518+ bool shouldComplete = blockAll || (toCompleteIdSet.find (request->mRequestId ) != toCompleteIdSet.end ());
519+
520+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Request %ld shouldComplete=%s" , request->mRequestId ,
521+ shouldComplete ? " true" : " false" );
522+
523+ if (shouldComplete)
429524 {
525+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Blocking on future.get() for request %ld" , request->mRequestId );
430526 future.get ();
527+ TLLM_LOG_DEBUG (" checkContextTransferStatus: future.get() completed for request %ld" , request->mRequestId );
528+
431529 request->setState (LlmRequestState::kDISAGG_CONTEXT_COMPLETE );
432530 it = mResponderFutures .erase (it);
531+ completedCount++;
532+
533+ TLLM_LOG_DEBUG (
534+ " checkContextTransferStatus: Request %ld completed and removed from futures" , request->mRequestId );
433535 }
434536 else
435537 {
436538 ++it;
437539 }
438540 }
541+
542+ TLLM_LOG_DEBUG (" checkContextTransferStatus: Exit - completed %zu requests, remaining futures: %zu" , completedCount,
543+ mResponderFutures .size ());
439544}
440545
441546void CacheTransceiver::checkGenTransferStatus (std::optional<int > const & atLeastRequestNum)
0 commit comments