Commit 80f6bda

NCCL 2.25.1-1
Add Blackwell/SM100 support
 * Add compilation for sm100
 * Add graph search speeds for Blackwell
 * Optimize graph search to converge on large NVLink domains
 * Limit NVLS heads to 32
 * Increase various limits to fit large NVLink domains
 * Add extra checks for IMEX setup, needed for MNNVL
 * Increase MAXCHANNELS to 64

Extend NVTX instrumentation to track NCCL communicators
 * Add communicator ID to NVTX traces to allow for correlation between ranks.

RAS fixes
1 parent 1672c85 commit 80f6bda

30 files changed (+603, -330 lines)

makefiles/common.mk

Lines changed: 7 additions & 1 deletion
@@ -39,14 +39,20 @@ endif
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
+CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \
+                 -gencode=arch=compute_120,code=sm_120
 
 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
+CUDA13_PTX = -gencode=arch=compute_120,code=compute_120
 
 
-ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0)
+# Include Blackwell support if we're using CUDA12.8 or above
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
 # Include Hopper support if we're using CUDA11.8 or above
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
 else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
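Note (not part of the commit): with a CUDA 12.8 or newer toolkit, the new default NVCC_GENCODE above expands to SASS for sm_70, sm_80, sm_90, sm_100 and sm_120, plus compute_120 PTX for forward compatibility. $(CUDA8_GENCODE) is left symbolic here because its definition falls outside this hunk:

    $(CUDA8_GENCODE) \
    -gencode=arch=compute_70,code=sm_70 \
    -gencode=arch=compute_80,code=sm_80 \
    -gencode=arch=compute_90,code=sm_90 \
    -gencode=arch=compute_100,code=sm_100 \
    -gencode=arch=compute_120,code=sm_120 \
    -gencode=arch=compute_120,code=compute_120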

makefiles/version.mk

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 24
-NCCL_PATCH := 3
+NCCL_MINOR := 25
+NCCL_PATCH := 1
 NCCL_SUFFIX :=
 PKG_REVISION := 1

src/Makefile

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ include ../makefiles/version.mk
 INCEXPORTS := nccl.h
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
-	init.cc init_nvtx.cc net.cc proxy.cc transport.cc \
+	init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \

src/collectives.cc

Lines changed: 15 additions & 64 deletions
@@ -8,6 +8,7 @@
 #include "collectives.h"
 #include "enqueue.h"
 #include "nccl.h"
+#include "nvtx_payload_schemas.h"
 
 const char* ncclFuncToString(ncclFunc_t fn) {
   switch (fn) {
@@ -78,11 +79,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
   // Just pass the size of one message and not the total bytes sent/received.
-  constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
-  };
-  size_t msgsize = sendcount * ncclTypeSize(datatype);
-  NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
+  NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype)));
 
   struct ncclInfo info = { ncclFuncAllGather, "AllGather",
     sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
@@ -94,18 +92,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsAllReduce {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  // Just pass the size of one message and not the total bytes sent/received.
-  static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsAllReduce, op)}
-  };
-  NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op));
 
   struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
     sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
@@ -117,16 +105,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsBroadcast {
-    size_t bytes;
-    int root;
-  };
-  constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
-  };
-  NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
-  NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root));
 
   struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
     sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
@@ -145,19 +125,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  struct NvtxParamsReduce {
-    size_t bytes;
-    int root;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduce, op)}
-  };
-  NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
-  NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op));
 
   struct ncclInfo info = { ncclFuncReduce, "Reduce",
     sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
@@ -169,39 +138,21 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
-  struct NvtxParamsReduceScatter {
-    size_t bytes;
-    ncclRedOp_t op;
-  };
-  constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
-    {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
-    {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
-      offsetof(NvtxParamsReduceScatter, op)}
-  };
-  NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
-  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op));
 
   struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
     sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
     REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
   return ncclEnqueueCheck(&info);
 }
 
-struct NvtxParamsSendRecv {
-  size_t bytes;
-  int peer;
-};
-constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
-  {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
-  {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
-};
-
 NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
 
   struct ncclInfo info = { ncclFuncSend, "Send",
     NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
@@ -213,8 +164,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, cudaStream_t stream) {
-  NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
-  NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
+  NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv,
+    NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer));
 
   struct ncclInfo info = { ncclFuncRecv, "Recv",
     NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
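The per-collective NVTX schema structs that used to be declared inline are replaced by shared definitions from the new nvtx_payload_schemas.h header, and every payload now carries the communicator hash so profiling tools can correlate the same operation across ranks. The header itself is not part of this excerpt; the following is only a sketch of the pattern for the send/recv case, modeled on the removed code with commHash prepended (field order, entry types, and the exact macro wiring in the real header may differ):

    // Hypothetical sketch, not the actual contents of nvtx_payload_schemas.h.
    struct NcclNvtxParamsSendRecv {
      uint64_t commHash;   // communicator ID, used to correlate ranges across ranks
      size_t bytes;        // size of this message, not the total bytes moved
      int peer;            // peer rank
    };
    constexpr nvtxPayloadSchemaEntry_t NcclNvtxParamsSendRecvSchema[] = {
      {0, NVTX_PAYLOAD_ENTRY_TYPE_UINT64, "Communicator ID", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, commHash)},
      {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, bytes)},
      {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0,
        offsetof(NcclNvtxParamsSendRecv, peer)}
    };

At each call site, NVTX3_PAYLOAD(...) then packs the arguments into that layout, which is why ncclSend/ncclRecv no longer build a payload struct by hand.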

src/device/Makefile

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 #
 
 SHELL := /usr/bin/env bash
-MAKEFALGS += -r
+MAKEFLAGS += -r
 .SUFFIXES:
 .SECONDARY:
 
src/device/all_reduce.h

Lines changed: 1 addition & 1 deletion
@@ -436,7 +436,7 @@ struct RunWorkColl<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPL
         int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset);
         prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
       }
-    } else if (tid < tidEndReduce) {
+    } else if (tid < tidEndReduce && nvls->headRank != -1) {
       // Reduce, broadcast through NVLS
       using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
       Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>

src/enqueue.cc

Lines changed: 17 additions & 7 deletions
@@ -21,19 +21,21 @@
 NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0);
 
 // Returns maximum kernel stack size of all CUDA kernels
-ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
+ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) {
   ncclResult_t result = ncclSuccess;
+  int print = 0;
 
   if (maxStackSize) *maxStackSize = 0;
   int carveout = ncclParamL1SharedMemoryCarveout();
+  int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch);
 
   for (int k=0; k < ncclDevKernelCount; k++) {
     void* fn = ncclDevKernelList[k];
+    cudaFuncAttributes attr = {0};
     if (fn == nullptr) continue;
 
+    CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
     if (maxStackSize) {
-      cudaFuncAttributes attr = {0};
-      CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0);
       if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
     ignore0:;
     }
@@ -43,9 +45,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) {
         result, ignore1);
     ignore1:;
     }
-    if (ncclShmemDynamicSize(cudaArch) != 0) {
+    if (ncclMaxSharedMem != 0) {
+      int sharedMemSize = ncclMaxSharedMem;
+      if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) {
+        if (print++ == 0)
+          INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu",
+               sharedMemSize, maxSharedMem-attr.sharedSizeBytes);
+        // Reduce requested MaxDynamicSharedMemorySize attribute
+        sharedMemSize = maxSharedMem - attr.sharedSizeBytes;
+      }
       CUDACHECKGOTO(cudaFuncSetAttribute(fn,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)),
+        cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize),
         result, next_kernel);
     }
   next_kernel:;
@@ -1445,7 +1455,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan
   NCCLCHECK(ncclCudaDriverVersion(&driverVersion));
   if (driverVersion >= 11080) {
     int compCap = comm->compCap;
-    unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0;
+    unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0;
 
     CUlaunchConfig launchConfig = {0};
     CUlaunchAttribute launchAttrs[3];
@@ -1597,7 +1607,7 @@ static ncclResult_t updateCollCostTable(
     if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue;
     if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue;
     /* now we only support single-node NVLS allgather and reducescatter */
-    if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue;
+    if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue;
     /* Tree reduceScatter doesn't support scaling yet */
     if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter
         && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue;
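The first two hunks make ncclInitKernelsForDevice clamp its dynamic shared memory request: if the preferred size from ncclShmemDynamicSize(cudaArch) exceeds what the device and kernel can accept (maxSharedMem minus the kernel's static shared memory), the request is reduced and a one-time INFO message is logged. The last two hunks relax "exactly sm_90" checks to "sm_90 and newer" for CGA cluster launch and cap single-node NVLS AllGather/ReduceScatter at NCCL_MAX_NVLS_ARITY ranks. Below is a standalone sketch of the same clamping pattern against the plain CUDA runtime API (illustrative only, with a made-up kernel; this is not NCCL code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void demoKernel(float* out) {
      extern __shared__ float smem[];            // dynamic shared memory
      smem[threadIdx.x] = (float)threadIdx.x;
      __syncthreads();
      out[threadIdx.x] = smem[threadIdx.x];
    }

    int main() {
      // Device-wide opt-in limit for dynamic shared memory per block.
      int dev = 0, maxOptin = 0;
      cudaDeviceGetAttribute(&maxOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

      // Static shared memory already used by the kernel reduces what can be requested.
      cudaFuncAttributes attr{};
      cudaFuncGetAttributes(&attr, demoKernel);

      int requested = 160 * 1024;                            // desired dynamic bytes
      int available = maxOptin - (int)attr.sharedSizeBytes;  // what actually fits
      int granted = requested > available ? available : requested;
      if (granted < requested)
        printf("clamping dynamic shared memory request: %d -> %d bytes\n", requested, granted);

      // Error checking omitted for brevity.
      cudaFuncSetAttribute(demoKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, granted);
      return 0;
    }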

src/graph/connect.cc

Lines changed: 2 additions & 5 deletions
@@ -19,7 +19,6 @@
 ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
   int rank = comm->rank;
   int localRanks = comm->topo->nodes[GPU].count;
-  int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks;
   int nChannels = comm->nChannels;
 
   topoRanks->nvlsHeadNum = 0;
@@ -74,7 +73,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs
   // Get nvls heads and the number of heads. Duplicate head is not allowed.
   for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) {
     bool addHead = true;
-    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks;
+    int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks;
 
     for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) {
       if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) {
@@ -259,8 +258,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead
     channel->nvls.out = -1; // NVLS+SHARP not yet implemented.
     channel->nvls.headRank = headRank;
     channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1;
-    channel->nvls.node = comm->node;
-    channel->nvls.nNodes = comm->nNodes;
     if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks;
   }
   if (comm->nNodes == 1) return ncclSuccess;
@@ -466,7 +463,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
   }
 
   // Use 4 compute channels per search channel to reach peak BW on <8 PPN
-  if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
+  if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
     nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
   }

src/graph/paths.cc

Lines changed: 29 additions & 6 deletions
@@ -828,14 +828,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr
   return ncclSuccess;
 }
 
-int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) {
-  int minPath = PATH_DIS;
+ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) {
+  int minPath = PATH_SYS;
   for (int i=0; i<system->nodes[GPU].count; i++) {
-    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU];
-    for (int j=0; j<system->nodes[GPU].count; j++) {
-      if (i == j) continue;
+    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
+    if (paths == NULL) continue;
+    for (int j=0; j<system->nodes[type].count; j++) {
+      if (type == GPU && i == j) continue;
       minPath = std::min(minPath, paths[j].type);
     }
   }
-  return minPath >= PATH_PIX ? 0 : 1;
+  *min = minPath;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) {
+  int maxPath = PATH_LOC;
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type];
+    if (paths == NULL) continue;
+    for (int j=0; j<system->nodes[type].count; j++) {
+      if (type == GPU && i == j) continue;
+      maxPath = std::max(maxPath, paths[j].type);
+    }
+  }
+  *max = maxPath;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) {
+  int maxPath;
+  NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath));
+  *allNvLink = maxPath >= PATH_PIX ? 0 : 1;
+  return ncclSuccess;
 }
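ncclTopoPathAllNVLink now returns ncclResult_t and reports through an out parameter instead of returning the flag directly, and the new ncclTopoGetGpuMinPath/ncclTopoGetGpuMaxPath helpers generalize the scan to any node type (for example GPU-to-NIC paths), which the extra MNNVL/IMEX checks mentioned in the commit message presumably rely on. Call sites elsewhere in the tree (not shown in this excerpt) have to switch to the new signature; a hypothetical caller:

    int allNvLink = 0;
    NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &allNvLink));
    if (allNvLink) {
      // every GPU<->GPU path is NVLink (strictly better than PATH_PIX)
    }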
