Skip to content

Commit 48bb7fe

Browse files
committed
2.20.5-1
Fix UDS connection failure when using ncclCommSplit. Issue #1185
1 parent b647562 commit 48bb7fe

File tree

6 files changed

+19
-10
lines changed

6 files changed

+19
-10
lines changed

makefiles/version.mk

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
##### version
22
NCCL_MAJOR := 2
33
NCCL_MINOR := 20
4-
NCCL_PATCH := 3
4+
NCCL_PATCH := 5
55
NCCL_SUFFIX :=
66
PKG_REVISION := 1

src/bootstrap.cc

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -305,7 +305,10 @@ ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm*
305305
NCCLCHECK(ncclSocketGetAddr(proxySocket, state->peerProxyAddresses+rank));
306306
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
307307
// cuMem UDS support
308-
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
308+
// Make sure we create a unique UDS socket name
309+
uint64_t randId;
310+
NCCLCHECK(getRandomData(&randId, sizeof(randId)));
311+
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
309312
NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)));
310313
NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS));
311314

@@ -371,7 +374,10 @@ ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm*
371374
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)), ret, fail);
372375
// cuMem UDS support
373376
NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail);
374-
state->peerProxyAddressesUDS[rank] = getPidHash()+comm->commHash;
377+
// Make sure we create a unique UDS socket name
378+
uint64_t randId;
379+
NCCLCHECKGOTO(getRandomData(&randId, sizeof(randId)), ret, fail);
380+
state->peerProxyAddressesUDS[rank] = getPidHash()+randId;
375381
NCCLCHECKGOTO(bootstrapAllGather(state, state->peerProxyAddressesUDS, sizeof(*state->peerProxyAddressesUDS)), ret, fail);
376382
NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail);
377383
}

src/graph/search.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1077,7 +1077,7 @@ ncclResult_t getNvlsNetDev(struct ncclComm* comm, struct ncclTopoGraph* graph, i
10771077
int localRanks = comm->topo->nodes[GPU].count;
10781078
int netNum = 0;
10791079
int net[MAXCHANNELS];
1080-
1080+
10811081
for (int c = 0; c < graph->nChannels; c++) {
10821082
if (graph->intra[c * localRanks] == comm->rank) {
10831083
net[netNum++] = graph->inter[c * 2];

src/init.cc

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -826,6 +826,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
826826
ncclResult_t ret = ncclSuccess;
827827
int rank = comm->rank;
828828
int nranks = comm->nRanks;
829+
int nNodes = 1;
829830
cpu_set_t affinitySave;
830831
struct ncclTopoGraph ringGraph;
831832
struct ncclTopoGraph treeGraph;
@@ -865,6 +866,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
865866
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail);
866867

867868
for (int i = 0; i < nranks; i++) {
869+
if (comm->peerInfo[i].hostHash != comm->peerInfo[rank].hostHash) nNodes++;
868870
if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
869871
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
870872
ret = ncclInvalidUsage;
@@ -879,7 +881,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
879881
#include "cudawrap.h"
880882

881883
// MNNVL support
882-
{
884+
if (nNodes > 1) {
883885
int cliqueSize = 0;
884886
comm->MNNVL = 0;
885887
// Determine the size of the MNNVL domain/clique
@@ -1485,15 +1487,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
14851487
if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
14861488
snprintf((char*)&job->commId, sizeof(job->commId), "%016lx-%d", job->parent->commHash, job->color);
14871489
NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail);
1488-
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
14891490
NCCLCHECKGOTO(bootstrapSplit((struct ncclBootstrapHandle*)&job->commId, comm, job->parent, job->color, job->key, parentRanks), res, fail);
14901491
} else {
14911492
NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail);
1492-
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
14931493
NCCLCHECKGOTO(bootstrapInit((struct ncclBootstrapHandle*)&job->commId, comm), res, fail);
14941494
}
14951495

14961496
comm->cudaArch = cudaArch;
1497+
comm->commHash = getHash(job->commId.internal, NCCL_UNIQUE_ID_BYTES);
14971498

14981499
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, (unsigned long long)hashUniqueId(job->commId));
14991500

src/proxy.cc

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1660,6 +1660,9 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union
16601660
comm->proxyState->listenSock = sock;
16611661
comm->proxyState->peerAddresses = peerAddresses;
16621662
comm->proxyState->peerAddressesUDS = peerAddressesUDS;
1663+
1664+
// UDS support
1665+
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag));
16631666
// Seed the random number generator for UDS filename generation
16641667
struct timeval time;
16651668
gettimeofday(&time,NULL);
@@ -1693,8 +1696,7 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
16931696
ncclSetThreadName(comm->proxyState->thread, "NCCL Service %2d", comm->cudaDev);
16941697

16951698
// UDS support
1696-
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d pidHash %lx", comm, comm->rank, comm->peerInfo[comm->rank].pidHash);
1697-
NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, comm->peerInfo[comm->rank].pidHash, comm->abortFlag));
1699+
INFO(NCCL_PROXY, "UDS: Creating service thread comm %p rank %d", comm, comm->rank);
16981700
pthread_create(&comm->proxyState->threadUDS, NULL, ncclProxyServiceUDS, comm->proxyState);
16991701
ncclSetThreadName(comm->proxyState->threadUDS, "NCCL UDS Service %2d", comm->cudaDev);
17001702
}

src/transport/net_ib.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -853,7 +853,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet
853853
// Print just the QPs for this dev
854854
if (comm->base.qps[q].devIndex == i)
855855
INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x",
856-
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
856+
comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev,
857857
commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, ncclParamIbGidIndex(),
858858
devInfo->spn, devInfo->iid, devInfo->fifoRkey, commDev->fifoMr->lkey);
859859
}

0 commit comments

Comments
 (0)