From 89dea77e2ac5bd0fa7cbcb1b0ad683a81f5e8c3e Mon Sep 17 00:00:00 2001
From: isaki001 <36317038+isaki001@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:32:01 -0500
Subject: [PATCH] P2p batching hang-fix (#2011)

* prevent batching when send/recv bytes dont match, restore bit reversal for channel to part mapping, prevent batching beyond 32-nodes

* correct computation for channel to part mapping

* update changelog

* disabling p2p-batching by default

(cherry picked from commit 641c0eb51ce033d6f0e8414215e7dcfd03ec785e)
---
 CHANGELOG.md         |  3 +++
 src/enqueue.cc       | 12 ++++++++++--
 src/include/device.h | 14 ++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1e87ce8da..fd44785a5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
 
 ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1
 
+### Changed
+* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.
+
 ### Resolved Issues
 
 * Fixed a single node data corruption issue in MSCCL on the Instinct MI350X and MI355X for the LL protocol. This previously affected about 2% of the runs for single node AllReduce with inputs smaller than 512 KiB.
diff --git a/src/enqueue.cc b/src/enqueue.cc
index ad5b2aa7e..afcdeecdb 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -190,7 +190,7 @@ static void addWorkBatchToPlan(
     // batch further down.
     newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize;
     if (workType == ncclDevWorkTypeP2p) {
-      newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
+      newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
       for (int i=0; i < chan->wipBatch.nP2ps; i++) {
         newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
       }
@@ -952,7 +952,15 @@ static ncclResult_t addP2pToPlan(
   bool proxySameProcess[2] = {true, true};
   void** handles[2] = {NULL, NULL};
   auto batchP2PEnableEnv = rcclParamP2pBatchEnable();
-  bool batchP2P =  batchP2PEnableEnv && ((sendBytes == -1)? recvBytes <= rcclParamP2pBatchThreshold() : sendBytes <= rcclParamP2pBatchThreshold());
+  auto p2pBatchThreshold = rcclParamP2pBatchThreshold();
+  bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold);
+  bool batchP2P =  batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold;
+
+  //ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1)
+  //but batching is only applicable if msg size is below threshold which is not checked below
+  //this causes perf. dips in some cases but also boosts in other cases even when no batching happens because msg size is above threshold
+  //replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine
+  //channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below
   uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv);
   if (comm->p2pNet) {
     for (int dir = 0; dir <= 1; dir++) {
diff --git a/src/include/device.h b/src/include/device.h
index 3e0cb12f3..5e864dfe1 100644
--- a/src/include/device.h
+++ b/src/include/device.h
@@ -317,10 +317,24 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2
 // ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code
 // uses ncclP2pChannelToPart to determine which part "this" channel is responsible for.
 inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part, int nParts, int nNodes) {
+  if (nNodes > 2) {
+    // Only works because nP2pChannels is pow2
+    int nChannelsLog2 = countOneBits(nP2pChannels-1);
+    int delta = reverseBits(part, nChannelsLog2);
+    return (base + delta) & (nP2pChannels-1);
+  } else {
     return (base * nParts + part) & (nP2pChannels-1);
+  }
 }
 inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel, int nParts, int nNodes) {
+  if (nNodes > 2) {
+    // Only works because nP2pChannels is pow2
+    int nChannelsLog2 = countOneBits(nP2pChannels-1);
+    int delta = (channel-base) & (nP2pChannels-1);
+    return reverseBits(delta, nChannelsLog2);
+  } else {
     return (channel - base * nParts) & (nP2pChannels-1);
+  }
 }
 
 struct alignas(16) ncclDevWorkColl {