Commit 80f6bda

NCCL 2.25.1-1
Add Blackwell/SM100 support
 * Add compilation for sm100
 * Add graph search speeds for Blackwell
 * Optimize graph search to converge on large NVLink domains
 * Limit NVLS heads to 32
 * Increase various limits to fit large NVLink domains
 * Add extra checks for IMEX setup, needed for MNNVL
 * Increase MAXCHANNELS to 64

Extend NVTX instrumentation to track NCCL communicators
 * Add communicator ID to NVTX traces to allow for correlation between ranks.

RAS fixes
1 parent 1672c85 commit 80f6bda

30 files changed (+603, -330 lines)

makefiles/common.mk

Lines changed: 7 additions & 1 deletion
@@ -39,14 +39,20 @@ endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
+CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
+                 -gencode=arch=compute_120,code=sm_120
 
 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
+CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
 
 
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
 # Include Hopper support if we're using CUDA11.8 or above
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
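Note (not part of the commit): with a CUDA 12.8 or newer toolkit, the new default NVCC_GENCODE above expands to SASS for sm_70, sm_80, sm_90, sm_100 and sm_120, plus compute_120 PTX for forward compatibility. $(CUDA8_GENCODE) is left symbolic here because its definition falls outside this hunk:

    $(CUDA8_GENCODE) \
    -gencode=arch=compute_70,code=sm_70 \
    -gencode=arch=compute_80,code=sm_80 \
    -gencode=arch=compute_90,code=sm_90 \
    -gencode=arch=compute_100,code=sm_100 \
    -gencode=arch=compute_120,code=sm_120 \
    -gencode=arch=compute_120,code=compute_120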

makefiles/version.mk

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 24
-NCCL_PATCH := 3
+NCCL_MINOR := 25
+NCCL_PATCH := 1
 NCCL_SUFFIX :=
 PKG_REVISION := 1

src/Makefile

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 INCEXPORTS := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \

src/collectives.cc

Lines changed: 15 additions & 64 deletions
@@ -8,6 +8,7 @@
 #include "collectives.h"
 #include "enqueue.h"
 #include "nccl.h"
+#include "nvtx_payload_schemas.h"
 
 const char* ncclFuncToString(ncclFunc_t fn) {
   switch (fn) {
@@ -78,11 +79,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
   // Just pass the size of one message and not the total bytes sent/received.
-  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
-  };
-  size_t msgsize = sendcount * ncclTypeSize(datatype);
-  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
+  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype)));
 
   struct ncclInfo info = { ncclFuncAllGather, "AllGather",
     sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
@@ -94,18 +92,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsAllReduce {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  // Just pass the size of one message and not the total bytes sent/received.
-  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsAllReduce, op)}
-  };
-  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op));
 
   struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
     sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
@@ -117,16 +105,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsBroadcast {
-    size_t bytes;
-    int root;
-  };
-  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
-  };
-  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
-  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
 
   struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
     sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
@@ -145,19 +125,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsReduce {
-    size_t bytes;
-    int root;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduce, op)}
-  };
-  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
-  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op));
 
   struct ncclInfo info = { ncclFuncReduce, "Reduce",
     sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
@@ -169,39 +138,21 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsReduceScatter {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduceScatter, op)}
-  };
-  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op));
 
   struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
     sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
     REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
   return ncclEnqueueCheck(&info);
 }
 
-struct NvtxParamsSendRecv {
-  size_t bytes;
-  int peer;
-};
-constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
-  {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-  {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
-};
-
 NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
 
   struct ncclInfo info = { ncclFuncSend, "Send",
     NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
@@ -213,8 +164,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
 
   struct ncclInfo info = { ncclFuncRecv, "Recv",
     NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
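The per-collective NVTX schema structs that used to be declared inline are replaced by shared definitions from the new nvtx_payload_schemas.h header, and every payload now carries the communicator hash so profiling tools can correlate the same operation across ranks. The header itself is not part of this excerpt; the following is only a sketch of the pattern for the send/recv case, modeled on the removed code with commHash prepended (field order, entry types, and the exact macro wiring in the real header may differ):

    // Hypothetical sketch, not the actual contents of nvtx_payload_schemas.h.
    struct NcclNvtxParamsSendRecv {
      uint64_t commHash;   // communicator ID, used to correlate ranges across ranks
      size_t bytes;        // size of this message, not the total bytes moved
      int peer;            // peer rank
    };
    constexpr nvtxPayloadSchemaEntry_t NcclNvtxParamsSendRecvSchema[] = {
      {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "Communicator ID", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, commHash)},
      {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, bytes)},
      {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, peer)}
    };

At each call site, NVTX3_PAYLOAD(...) then packs the arguments into that layout, which is why ncclSend/ncclRecv no longer build a payload struct by hand.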

src/device/Makefile

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 #
 
 SHELL := /usr/bin/env bash
-MAKEFALGS += -r
+MAKEFLAGS += -r
 .SUFFIXES:
 .SECONDARY:
 
src/device/all_reduce.h

Lines changed: 1 addition & 1 deletion
@@ -436,7 +436,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
         int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
         prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
-    } else if (tid < tidEndReduce) {
+    } else if (tid < tidEndReduce && nvls->headRank != -1) {
       // Reduce, broadcast through NVLS
       using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
       Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>

src/enqueue.cc

Lines changed: 17 additions & 7 deletions
@@ -21,19 +21,21 @@
 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
 // Returns maximum kernel stack size of all CUDA kernels
-ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
+ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
   ncclResult_t result = ncclSuccess;
+  int print = 0;
 
   if (maxStackSize) *maxStackSize = 0;
   int carveout = ncclParamL1SharedMemoryCarveout();
+  int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
 
   for (int k=0; k < ncclDevKernelCount; k++) {
     void* fn = ncclDevKernelList[k];
+    cudaFuncAttributes attr = {0};
     if (fn == nullptr) continue;
 
+    CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
     if (maxStackSize) {
-      cudaFuncAttributes attr = {0};
-      CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
       if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
     ignore0:;
     }
@@ -43,9 +45,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
         result, ignore1);
     ignore1:;
     }
-    if (ncclShmemDynamicSize(cudaArch) != 0) {
+    if (ncclMaxSharedMem != 0) {
+      int sharedMemSize = ncclMaxSharedMem;
+      if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
+        if (print++ == 0)
+          INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
+               sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
+        // Reduce requested MaxDynamicSharedMemorySize attribute
+        sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
+      }
       CUDACHECKGOTO(cudaFuncSetAttribute(fn,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
+        cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
         result, next_kernel);
     }
   next_kernel:;
@@ -1445,7 +1455,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
   if (driverVersion >= 11080) {
     int compCap = comm->compCap;
-    unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
+    unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
 
     CUlaunchConfig launchConfig = {0};
     CUlaunchAttribute launchAttrs[3];
@@ -1597,7 +1607,7 @@ static ncclResult_t updateCollCostTable(
     if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
     if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
     /* now we only support single-node NVLS allgather and reducescatter */
-    if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
+    if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue;
     /* Tree reduceScatter doesn't support scaling yet */
     if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
         && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
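The first two hunks make ncclInitKernelsForDevice clamp its dynamic shared memory request: if the preferred size from ncclShmemDynamicSize(cudaArch) exceeds what the device and kernel can accept (maxSharedMem minus the kernel's static shared memory), the request is reduced and a one-time INFO message is logged. The last two hunks relax "exactly sm_90" checks to "sm_90 and newer" for CGA cluster launch and cap single-node NVLS AllGather/ReduceScatter at NCCL_MAX_NVLS_ARITY ranks. Below is a standalone sketch of the same clamping pattern against the plain CUDA runtime API (illustrative only, with a made-up kernel; this is not NCCL code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void demoKernel(float* out) {
      extern __shared__ float smem[];            // dynamic shared memory
      smem[threadIdx.x] = (float)threadIdx.x;
      __syncthreads();
      out[threadIdx.x] = smem[threadIdx.x];
    }

    int main() {
      // Device-wide opt-in limit for dynamic shared memory per block.
      int dev = 0, maxOptin = 0;
      cudaDeviceGetAttribute(&maxOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

      // Static shared memory already used by the kernel reduces what can be requested.
      cudaFuncAttributes attr{};
      cudaFuncGetAttributes(&attr, demoKernel);

      int requested = 160 * 1024;                            // desired dynamic bytes
      int available = maxOptin - (int)attr.sharedSizeBytes;  // what actually fits
      int granted = requested > available ? available : requested;
      if (granted < requested)
        printf("clamping dynamic shared memory request: %d -> %d bytes\n", requested, granted);

      // Error checking omitted for brevity.
      cudaFuncSetAttribute(demoKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, granted);
      return 0;
    }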

src/graph/connect.cc

Lines changed: 2 additions & 5 deletions
@@ -19,7 +19,6 @@
 ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
   int rank = comm->rank;
   int localRanks = comm->topo->nodes[GPU].count;
-  int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
   int nChannels = comm->nChannels;
 
   topoRanks->nvlsHeadNum = 0;
@@ -74,7 +73,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
   // Get nvls heads and the number of heads. Duplicate head is not allowed.
   for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
     bool addHead = true;
-    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
+    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
 
     for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
       if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
@@ -259,8 +258,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
     channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
     channel->nvls.headRank = headRank;
     channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
-    channel->nvls.node = comm->node;
-    channel->nvls.nNodes = comm->nNodes;
     if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
   }
   if (comm->nNodes == 1) return ncclSuccess;
@@ -466,7 +463,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   }
 
   // Use 4 compute channels per search channel to reach peak BW on <8 PPN
-  if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
+  if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
   }

src/graph/paths.cc

Lines changed: 29 additions & 6 deletions
@@ -828,14 +828,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
   return ncclSuccess;
 }
 
-int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
-  int minPath = PATH_DIS;
+ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) {
+  int minPath = PATH_SYS;
   for (int i=0; i<system->nodes[GPU].count; i++) {
-    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
-    for (int j=0; j<system->nodes[GPU].count; j++) {
-      if (i == j) continue;
+    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
+    if (paths == NULL) continue;
+    for (int j=0; j<system->nodes[type].count; j++) {
+      if (type == GPU && i == j) continue;
       minPath = std::min(minPath, paths[j].type);
     }
   }
-  return minPath >= PATH_PIX ? 0 : 1;
+  *min = minPath;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) {
+  int maxPath = PATH_LOC;
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
+    if (paths == NULL) continue;
+    for (int j=0; j<system->nodes[type].count; j++) {
+      if (type == GPU && i == j) continue;
+      maxPath = std::max(maxPath, paths[j].type);
+    }
+  }
+  *max = maxPath;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) {
+  int maxPath;
+  NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath));
+  *allNvLink = maxPath >= PATH_PIX ? 0 : 1;
+  return ncclSuccess;
 }
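ncclTopoPathAllNVLink now returns ncclResult_t and reports through an out parameter instead of returning the flag directly, and the new ncclTopoGetGpuMinPath/ncclTopoGetGpuMaxPath helpers generalize the scan to any node type (for example GPU-to-NIC paths), which the extra MNNVL/IMEX checks mentioned in the commit message presumably rely on. Call sites elsewhere in the tree (not shown in this excerpt) have to switch to the new signature; a hypothetical caller:

    int allNvLink = 0;
    NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &allNvLink));
    if (allNvLink) {
      // every GPU<->GPU path is NVLink (strictly better than PATH_PIX)
    }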
