@@ -826,6 +826,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
826826 ncclResult_t ret = ncclSuccess;
827827 int rank = comm->rank ;
828828 int nranks = comm->nRanks ;
829+ int nNodes = 1 ;
829830 cpu_set_t affinitySave;
830831 struct ncclTopoGraph ringGraph;
831832 struct ncclTopoGraph treeGraph;
@@ -865,6 +866,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
865866 NCCLCHECKGOTO (bootstrapAllGather (comm->bootstrap , comm->peerInfo , sizeof (struct ncclPeerInfo )), ret, fail);
866867
867868 for (int i = 0 ; i < nranks; i++) {
869+ if (comm->peerInfo [i].hostHash != comm->peerInfo [rank].hostHash ) nNodes++;
868870 if ((i != rank) && (comm->peerInfo [i].hostHash == comm->peerInfo [rank].hostHash ) && (comm->peerInfo [i].busId == comm->peerInfo [rank].busId )) {
869871 WARN (" Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx" , rank, i, comm->peerInfo [rank].busId );
870872 ret = ncclInvalidUsage;
@@ -879,7 +881,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
879881#include " cudawrap.h"
880882
881883 // MNNVL support
882- {
884+ if (nNodes > 1 ) {
883885 int cliqueSize = 0 ;
884886 comm->MNNVL = 0 ;
885887 // Determine the size of the MNNVL domain/clique
@@ -1485,15 +1487,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
14851487 if (job->color == NCCL_SPLIT_NOCOLOR) goto exit;
14861488 snprintf ((char *)&job->commId , sizeof (job->commId ), " %016lx-%d" , job->parent ->commHash , job->color );
14871489 NCCLCHECKGOTO (commAlloc (comm, job->parent , job->nranks , job->myrank ), res, fail);
1488- comm->commHash = getHash (job->commId .internal , NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
14891490 NCCLCHECKGOTO (bootstrapSplit ((struct ncclBootstrapHandle *)&job->commId , comm, job->parent , job->color , job->key , parentRanks), res, fail);
14901491 } else {
14911492 NCCLCHECKGOTO (commAlloc (comm, NULL , job->nranks , job->myrank ), res, fail);
1492- comm->commHash = getHash (job->commId .internal , NCCL_UNIQUE_ID_BYTES); // Needed for UDS support
14931493 NCCLCHECKGOTO (bootstrapInit ((struct ncclBootstrapHandle *)&job->commId , comm), res, fail);
14941494 }
14951495
14961496 comm->cudaArch = cudaArch;
1497+ comm->commHash = getHash (job->commId .internal , NCCL_UNIQUE_ID_BYTES);
14971498
14981499 INFO (NCCL_INIT," comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START" , comm, comm->rank , comm->nRanks , comm->cudaDev , comm->nvmlDev , comm->busId , (unsigned long long )hashUniqueId (job->commId ));
14991500
0 commit comments