@@ -199,6 +199,22 @@ ncclResult_t checkHsaEnvSetting() {
   }
   return ncclSuccess;
 }
+
+// Fail the job if the build flag HIP_HOST_UNCACHED_MEMORY is not set on MI350X (gfx950)
+ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
+#if defined(HIP_HOST_UNCACHED_MEMORY)
+  return ncclSuccess;
+#else
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
+    ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on MI350X");
+    return ncclSystemError;
+  }
+  else {
+    return ncclSuccess;
+  }
+#endif
+}
+
 static void initOnceFunc() {
   NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit);
   initEnv();
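The new gate pairs a compile-time `#if defined(...)` check with a runtime arch match, so builds without the flag still pass on non-gfx950 hardware. A minimal standalone sketch of the same pattern, with `IsArchMatch` approximated as a prefix match and the arch string hard-coded for illustration (the real value comes from `comm->topo`):

```cpp
// Sketch only. Build with the flag:  hipcc -DHIP_HOST_UNCACHED_MEMORY gate.cpp
//          or without it:            hipcc gate.cpp
#include <cstdio>
#include <cstring>

// Stand-in for RCCL's IsArchMatch(): true when archName starts with target.
static bool IsArchMatch(const char* archName, const char* target) {
  return strncmp(archName, target, strlen(target)) == 0;
}

int main() {
  const char* gcn = "gfx950";  // in RCCL this comes from the topology
#if defined(HIP_HOST_UNCACHED_MEMORY)
  printf("ok: build flag set\n");
#else
  if (IsArchMatch(gcn, "gfx950")) {
    printf("error: HIP_HOST_UNCACHED_MEMORY required on gfx950\n");
    return 1;
  }
#endif
  return 0;
}
```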
@@ -1508,8 +1524,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       allGather3Data[rank].nc = 4;
     }
   }
+  // For single-node communicators that do not use the full XGMI links per GPU (i.e., nranks < 8),
+  // inflate nChannels a bit to achieve higher bandwidth.
   if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
-    allGather3Data[rank].nc = 4;
+    if (nranks == 2 && nNodes == 1) {
+      allGather3Data[rank].nc = 16;
+    } else if (nranks == 4 && nNodes == 1) {
+      allGather3Data[rank].nc = 8;
+    } else {
+      allGather3Data[rank].nc = 4;
+    }
   }
 
   allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
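The selection above reads as a pure function of communicator shape. A hedged sketch with the values taken from this hunk (the helper name is hypothetical, not part of the patch):

```cpp
// Hypothetical helper mirroring the hunk above: single-node gfx950
// communicators with fewer than 8 ranks leave XGMI links idle, so the
// per-rank channel count is inflated to recover bandwidth.
static int gfx950NumChannels(int nRanks, int nNodes) {
  if (nNodes == 1 && nRanks == 2) return 16;  // 2 GPUs: strongest inflation
  if (nNodes == 1 && nRanks == 4) return 8;   // 4 GPUs: moderate inflation
  return 4;                                   // full node or multi-node default
}
```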
@@ -1873,8 +1897,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
   NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);
 
-  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
-
+  INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d, collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
   if (comm->intraRank == 0) { // Load ncclParamLaunchMode
     const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
     enum ncclLaunchMode mode, modeOld;
@@ -2053,9 +2077,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   double sum_timers = 0;
   uint64_t timers[TIMERS_INIT_COUNT] = {0};
   unsigned long long commIdHash;
+  char* archName;
+  int cuCount;
+  hipDeviceProp_t devProp;
+
 #ifdef USE_INDIRECT_FUNCTION_CALL
   int64_t stackSize;
-  hipDeviceProp_t devProp;
 #endif
 
   timers[TIMER_INIT_TOTAL] = clockNano();
@@ -2065,16 +2092,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
   cudaArch = 100*archMajor + 10*archMinor;
 
+  CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
+  cuCount = devProp.multiProcessorCount;
+  archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
+  strcpy(archName, devProp.gcnArchName);
+
   timers[TIMER_INIT_KERNELS] = clockNano();
   NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
-  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName, "gfx942") && !IsArchMatch(devProp.gcnArchName, "gfx950")) {
+  if (ncclParamSetStackSize() == 1 && !IsArchMatch(archName, "gfx942") && !IsArchMatch(archName, "gfx950")) {
     stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
     if (stackSize == 0) {
-      if (IsArchMatch(devProp.gcnArchName, "gfx906"))
+      if (IsArchMatch(archName, "gfx906"))
        stackSize = 1024;
       else
        stackSize = 512;
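With this change the device properties are queried once per init and cached, instead of being re-fetched inside each `#ifdef` branch (and the old call hard-coded device 0 rather than `cudaDev`). A standalone sketch of that one-time query, assuming device 0; `gcnArchName` and `multiProcessorCount` are standard `hipDeviceProp_t` fields:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  hipDeviceProp_t devProp;
  if (hipGetDeviceProperties(&devProp, /*deviceId=*/0) != hipSuccess) return 1;
  // Cache the arch string once so later checks avoid repeated device queries.
  char* archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
  strcpy(archName, devProp.gcnArchName);
  printf("arch=%s CUs=%d\n", archName, devProp.multiProcessorCount);
  free(archName);
  return 0;
}
```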
@@ -2127,9 +2158,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
   }
   comm->cudaArch = cudaArch;
+  comm->archName = archName;
+  comm->cuCount = cuCount;
 
   NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
-
+
+  // Check that host uncached memory is configured correctly
+  NCCLCHECK(checkHostUncacheMemSetting(comm));
+
   // RCCL: determine and set unroll factor for comm
   NCCLCHECK(commSetUnrollFactor(comm));
 
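Note that `checkHostUncacheMemSetting()` runs only after `initTransportsRank()`, since it reads `comm->topo`, and it is wrapped in `NCCLCHECK` so a failure propagates out of init. A simplified stand-in for that error-propagation pattern (the real enum and macro live in the NCCL headers, and the real `NCCLCHECK` also logs the failure):

```cpp
#include <cstdio>

// Simplified stand-ins for illustration only.
typedef enum { ncclSuccess = 0, ncclSystemError = 2 } ncclResult_t;

// Evaluate a call returning ncclResult_t; propagate any failure to the caller.
#define NCCLCHECK(call) do {              \
  ncclResult_t res_ = (call);             \
  if (res_ != ncclSuccess) return res_;   \
} while (0)

static ncclResult_t mightFail(bool fail) {
  return fail ? ncclSystemError : ncclSuccess;
}

static ncclResult_t initStep() {
  NCCLCHECK(mightFail(false));  // passes, execution continues
  NCCLCHECK(mightFail(true));   // fails, returns ncclSystemError here
  return ncclSuccess;
}

int main() { return initStep() == ncclSuccess ? 0 : 1; }
```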
@@ -2151,9 +2187,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   if (rcclParamMscclppEnabled()) {
 #ifdef ENABLE_MSCCLPP
     if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
-      hipDeviceProp_t devProp;
-      CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
-      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950");
+      comm->mscclppCompatible = IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950");
       if (comm->mscclppCompatible) {
         bool mapContainsId = (mscclpp_uniqueIdMap.count(*job->commId) > 0);
         auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];