Commit b327cf0

Merge branch 'develop' into mberenjk/nccl-sync-2.28
2 parents: b7a2872 + c8da880

31 files changed: +3054 −58 lines

src/device/op128.h

Lines changed: 0 additions & 2 deletions

@@ -154,7 +154,6 @@ union alignas(16) BytePack<16> {
   uint32_t u32[4];
   uint64_t u64[2];
   ulong2 ul2[1], native;
-#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx942__) || defined(__gfx950__)
   inline __device__ BytePack<16>() = default;
   inline __device__ BytePack<16>(const BytePack<16>& other) {
     *this = other;
@@ -164,7 +163,6 @@ union alignas(16) BytePack<16> {
     u64[1] = other.u64[1];
     return *this;
   }
-#endif
 };
 template<int Size>
 union BytePack {
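
This commit removes the #if guard, so the explicit default constructor and the
word-wise copy path are now compiled unconditionally rather than only for
indirect-function-call builds or gfx942/gfx950. For context, a host-side sketch
of the same pattern (Pack16 is a hypothetical stand-in, not the RCCL type):

#include <cstdint>
#include <cstdio>

// 16 bytes aliased as 4x u32 or 2x u64; copy assignment is done as two
// 64-bit word moves, mirroring BytePack<16> above.
union alignas(16) Pack16 {
  uint32_t u32[4];
  uint64_t u64[2];
  Pack16() = default;
  Pack16(const Pack16& other) { *this = other; }
  Pack16& operator=(const Pack16& other) {
    u64[0] = other.u64[0];
    u64[1] = other.u64[1];
    return *this;
  }
};

int main() {
  Pack16 a;
  a.u64[0] = 0x1122334455667788ull;
  a.u64[1] = 0x99aabbccddeeff00ull;
  Pack16 b = a;  // exercises the explicit word-wise copy
  printf("%016llx %016llx\n", (unsigned long long)b.u64[1],
         (unsigned long long)b.u64[0]);
  return 0;
}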

src/enqueue.cc

Lines changed: 21 additions & 7 deletions

@@ -392,7 +392,9 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
   devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
   devWork.oneNode = (comm->nNodes == 1);
   devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
-
+  //[Added-comment] opCount is missing for collDevWork, adding here
+  devWork.opCount = task->opCount;
+
   devWork.isOneRPN = comm->isOneRPN;
   devWork.netRegUsed = devWork.regUsed = 0;
   devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
@@ -488,6 +490,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
     return ncclInvalidUsage;
   }
+
+  if (!rcclIsArchSupportedForFunc(&agg, comm->archName)) {
+    WARN("%s: unsupported architecture (%s) for collective %s(%s, %s, %s, %s, Acc=%d, Pipeline=%d).",
+      __func__, comm->archName,
+      ncclFuncToString(task->func), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol),
+      ncclDevRedOpToString(task->opDev.op), ncclDatatypeToString(task->datatype), (agg.acc != nullptr), agg.pipeline);
+    return ncclInvalidUsage;
+  }

   int isCollnet=0, isNvls=0;
   switch (agg.algorithm) {
@@ -898,7 +908,7 @@ static ncclResult_t scheduleCollTasksToPlan(
   return ncclSuccess;
 }

-NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
+NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 8192);
 RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
 NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);

@@ -1597,7 +1607,7 @@ static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturin
   if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
   *mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
 #else
-  *mode = ncclImplicitOrderNone;
+  *mode = ncclImplicitOrderSerial;
 #endif
   return ncclSuccess;
 }
@@ -1886,6 +1896,12 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
     // hostStreamPlanTask directly
     NCCLCHECK(hostStreamPlanTask(comm, plan));
   }
+
+  // Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty
+  // opCount was not incremented because ncclProxyStart wasn't called in hostStreamPlanTask
+  if (!plan->persistent && ncclIntruQueueHead(&plan->proxyOpQueue) == nullptr) {
+    comm->opCount++;
+  }
   return ncclSuccess;
 }

@@ -1913,10 +1929,10 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
   ncclIntruQueueConstruct(&planner->planQueue);

   bool capturing = ncclCudaGraphValid(planner->capturingGraph);
-  //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
+  cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
   cudaStream_t deviceStream, launchOrder;
-
   cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
+  CUDACHECK(cudaEventRecord(finishedEvent, launchStream));

   if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
     comm->workFifoProducedLastRecorded = comm->workFifoProduced;
@@ -1931,8 +1947,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
   }

   if (capturing || planner->numStreams != 1) {
-    // CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
-
     // deviceStream waits on userStream[0]
     NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
src/include/rccl_common.h

Lines changed: 1 addition & 0 deletions

@@ -117,4 +117,5 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
 bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
 int parseFirmwareVersion();
+bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
 #endif

src/include/recorder.h

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 #include <vector>
 #include <mutex>
 #include <chrono>
-#include "debug.h"

 namespace rccl
 {

src/include/rocmwrap.h

Lines changed: 12 additions & 0 deletions

@@ -9,6 +9,7 @@
 #define NCCL_ROCMWRAP_H_

 #include <hsa/hsa.h>
+#include "checks.h"

 typedef hsa_status_t (*PFN_hsa_init)();
 typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
@@ -85,6 +86,17 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;

 ncclResult_t rocmLibraryInit(void);

+extern int ncclCudaDriverVersionCache;
 extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()

+inline ncclResult_t ncclCudaDriverVersion(int* driver) {
+  int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
+  if (version == -1) {
+    CUDACHECK(cudaDriverGetVersion(&version));
+    __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
+  }
+  *driver = version;
+  return ncclSuccess;
+}
+
 #endif
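
The new inline helper caches the driver version in a relaxed atomic so repeated
callers skip the cudaDriverGetVersion() query. A self-contained sketch of the
same caching pattern, with expensiveQuery() as a stand-in for the real driver
call:

#include <atomic>
#include <cstdio>

static std::atomic<int> g_versionCache{-1};  // -1 means "not queried yet"

static int expensiveQuery() { return 60443; }  // hypothetical driver version

int getDriverVersion() {
  int v = g_versionCache.load(std::memory_order_relaxed);
  if (v == -1) {
    // Racing threads may both run the query, but they store the same value,
    // so relaxed ordering is sufficient for this idempotent cache.
    v = expensiveQuery();
    g_versionCache.store(v, std::memory_order_relaxed);
  }
  return v;
}

int main() {
  printf("driver version: %d\n", getDriverVersion());  // performs the query
  printf("driver version: %d\n", getDriverVersion());  // served from cache
  return 0;
}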

src/init.cc

Lines changed: 45 additions & 11 deletions

@@ -199,6 +199,22 @@ ncclResult_t checkHsaEnvSetting() {
   }
   return ncclSuccess;
 }
+
+// Fail the job if build flag HIP_HOST_UNCACHED_MEMORY is not set on mi350x
+ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
+#if defined(HIP_HOST_UNCACHED_MEMORY)
+  return ncclSuccess;
+#else
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
+    ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x");
+    return ncclSystemError;
+  }
+  else {
+    return ncclSuccess;
+  }
+#endif
+}
+
 static void initOnceFunc() {
   NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit);
   initEnv();
@@ -1508,8 +1524,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       allGather3Data[rank].nc = 4;
     }
   }
+  // For single-node communicators that do not use the full xGMI links per GPU (i.e., nranks < 8),
+  // inflate nChannels a bit to achieve higher bandwidth.
   if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
-    allGather3Data[rank].nc = 4;
+    if (nranks == 2 && nNodes == 1) {
+      allGather3Data[rank].nc = 16;
+    } else if (nranks == 4 && nNodes == 1) {
+      allGather3Data[rank].nc = 8;
+    } else {
+      allGather3Data[rank].nc = 4;
+    }
   }

   allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
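
The new branch is a small lookup keyed on communicator size. A standalone
sketch of the heuristic (the function name is hypothetical):

#include <cstdio>

// gfx950 channel-count heuristic from the hunk above: single-node
// communicators with fewer than 8 ranks get extra channels to soak up
// otherwise idle xGMI bandwidth.
int gfx950ChannelCount(int nRanks, int nNodes) {
  if (nNodes == 1 && nRanks == 2) return 16;
  if (nNodes == 1 && nRanks == 4) return 8;
  return 4;
}

int main() {
  for (int r : {2, 4, 8})
    printf("nranks=%d, single node -> nc=%d\n", r, gfx950ChannelCount(r, 1));
  return 0;
}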
@@ -1873,8 +1897,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
   NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);

-  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
-
+  INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
   if (comm->intraRank == 0) { // Load ncclParamLaunchMode
     const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
     enum ncclLaunchMode mode, modeOld;
@@ -2053,9 +2077,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   double sum_timers = 0;
   uint64_t timers[TIMERS_INIT_COUNT] = {0};
   unsigned long long commIdHash;
+  char* archName;
+  int cuCount;
+  hipDeviceProp_t devProp;
+
 #ifdef USE_INDIRECT_FUNCTION_CALL
   int64_t stackSize;
-  hipDeviceProp_t devProp;
 #endif

   timers[TIMER_INIT_TOTAL] = clockNano();
@@ -2065,16 +2092,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
   cudaArch = 100*archMajor + 10*archMinor;

+  CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
+  cuCount = devProp.multiProcessorCount;
+  archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
+  strcpy(archName, devProp.gcnArchName);
+
   timers[TIMER_INIT_KERNELS] = clockNano();
   NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
-  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx942") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
+  if (ncclParamSetStackSize() == 1 && !IsArchMatch(archName,"gfx942") && !IsArchMatch(archName,"gfx950")) {
     stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
     if (stackSize == 0) {
-      if (IsArchMatch(devProp.gcnArchName,"gfx906"))
+      if (IsArchMatch(archName,"gfx906"))
         stackSize = 1024;
       else
         stackSize = 512;
@@ -2127,9 +2158,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
   }
   comm->cudaArch = cudaArch;
+  comm->archName = archName;
+  comm->cuCount = cuCount;

   NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
-
+
+  // Check if using host uncached mem correctly
+  NCCLCHECK(checkHostUncacheMemSetting(comm));
+
   // RCCL: determine and set unroll factor for comm
   NCCLCHECK(commSetUnrollFactor(comm));

@@ -2151,9 +2187,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   if (rcclParamMscclppEnabled()) {
 #ifdef ENABLE_MSCCLPP
     if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
-      hipDeviceProp_t devProp;
-      CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
-      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950");
+      comm->mscclppCompatible = IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950");
       if (comm->mscclppCompatible) {
         bool mapContainsId = (mscclpp_uniqueIdMap.count(*job->commId) > 0);
         auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];

src/misc/recorder.cc

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@
 #include <string>
 #include <iomanip>
 #include <sys/syscall.h>
+#include "debug.h"

 using namespace std::chrono;
src/misc/rocmwrap.cc

Lines changed: 2 additions & 0 deletions

@@ -28,6 +28,8 @@ DECLARE_ROCM_PFN(hsa_status_string);

 static void *hsaLib;
 static uint16_t version_major, version_minor;
+
+int ncclCudaDriverVersionCache = -1;
 bool ncclCudaLaunchBlocking = false;

 static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;

src/rccl_wrap.cc

Lines changed: 22 additions & 4 deletions

@@ -451,15 +451,13 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
 }

 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
-  hipDeviceProp_t devProp;
-  CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx950")) {
+  if(IsArchMatch(comm->archName, "gfx950")) {
     if(comm->nNodes == 1)
       comm->unroll = NCCL_UNROLL_1;
     else
       comm->unroll = NCCL_UNROLL_2;
   }
-  else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80)))
+  else if(IsArchMatch(comm->archName, "gfx908") || ((IsArchMatch(comm->archName, "gfx942") && comm->cuCount > 80)))
     comm->unroll = NCCL_UNROLL_2;
   else
     comm->unroll = NCCL_UNROLL_4;
@@ -535,3 +533,23 @@ bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion,
   }
   return true;
 }
+
+// Should match get_arch_guard() in generate.py
+bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, const char* archName) {
+  bool supported = true;
+
+  if (info->protocol == NCCL_PROTO_LL128) {
+#if defined(ENABLE_LL128)
+    if (info->acc)
+      supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950"));
+    else
+      supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950") || IsArchMatch(archName, "gfx90a"));
+#else
+    supported = false;
+#endif
+  } else if (info->acc) {
+    supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950"));
+  }
+
+  return supported;
+}
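
commSetUnrollFactor() now reads the cached comm->archName and comm->cuCount
instead of re-querying device properties on every call. Its decision table, as
a standalone sketch (assuming NCCL_UNROLL_1/2/4 map to unroll factors 1, 2,
and 4, with archMatch() as a simplified stand-in for IsArchMatch):

#include <cstdio>
#include <cstring>

// Simplified prefix match; RCCL's IsArchMatch is the real check.
static bool archMatch(const char* archName, const char* prefix) {
  return strncmp(archName, prefix, strlen(prefix)) == 0;
}

// Unroll-factor table mirroring commSetUnrollFactor().
int unrollFactor(const char* archName, int cuCount, int nNodes) {
  if (archMatch(archName, "gfx950")) return (nNodes == 1) ? 1 : 2;
  if (archMatch(archName, "gfx908")) return 2;
  if (archMatch(archName, "gfx942") && cuCount > 80) return 2;
  return 4;
}

int main() {
  printf("gfx950, 1 node : %d\n", unrollFactor("gfx950", 256, 1));
  printf("gfx942, 304 CUs: %d\n", unrollFactor("gfx942", 304, 1));
  printf("gfx90a         : %d\n", unrollFactor("gfx90a", 110, 2));
  return 0;
}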

test/ext-plugins/.gitignore

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+# Ignore Python cache and virtual environment folders
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Ignore pytest cache
+.pytest_cache/
+.cache/
+
+# Ignore log folders
+logs/
+log/
+*.log
+
+# Ignore virtual environment folders
+venv/
+
+# Ignore build artifacts
+build/
