Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:

## Unreleased - RCCL 2.27.7 for ROCm 7.1.1

### Changed
* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.

### Resolved Issues

* Fixed a single node data corruption issue in MSCCL on the Instinct MI350X and MI355X for the LL protocol. This previously affected about 2% of the runs for single node AllReduce with inputs smaller than 512 KiB.
Expand Down
12 changes: 10 additions & 2 deletions src/enqueue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ static void addWorkBatchToPlan(
// batch further down.
newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize;
if (workType == ncclDevWorkTypeP2p) {
newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
for (int i=0; i < chan->wipBatch.nP2ps; i++) {
newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
}
Expand Down Expand Up @@ -952,7 +952,15 @@ static ncclResult_t addP2pToPlan(
bool proxySameProcess[2] = {true, true};
void** handles[2] = {NULL, NULL};
auto batchP2PEnableEnv = rcclParamP2pBatchEnable();
bool batchP2P = batchP2PEnableEnv && ((sendBytes == -1)? recvBytes <= rcclParamP2pBatchThreshold() : sendBytes <= rcclParamP2pBatchThreshold());
auto p2pBatchThreshold = rcclParamP2pBatchThreshold();
bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold);
bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold;

//ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1)
//but batching is only applicable if msg size is below threshold which is not checked below
//this causes perf. dips in some cases but also boosts in other cases even when no batching happens because msg size is above threshold
//replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine
//channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below
uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv);
if (comm->p2pNet) {
for (int dir = 0; dir <= 1; dir++) {
Expand Down
14 changes: 14 additions & 0 deletions src/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,24 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2
// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code
// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for.
inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part, int nParts, int nNodes) {
if (nNodes > 2) {
// Only works because nP2pChannels is pow2
int nChannelsLog2 = countOneBits(nP2pChannels-1);
int delta = reverseBits(part, nChannelsLog2);
return (base + delta) & (nP2pChannels-1);
} else {
return (base * nParts + part) & (nP2pChannels-1);
}
}
inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel, int nParts, int nNodes) {
if (nNodes > 2) {
// Only works because nP2pChannels is pow2
int nChannelsLog2 = countOneBits(nP2pChannels-1);
int delta = (channel-base) & (nP2pChannels-1);
return reverseBits(delta, nChannelsLog2);
} else {
return (channel - base * nParts) & (nP2pChannels-1);
}
}

struct alignas(16) ncclDevWorkColl {
Expand Down