Commit b327cf0

Merge branch 'develop' into mberenjk/nccl-sync-2.28
2 parents: b7a2872 + c8da880

31 files changed: +3054 −58 lines

src/device/op128.h

Lines changed: 0 additions & 2 deletions

@@ -154,7 +154,6 @@ union alignas(16) BytePack<16> {
   uint32_t u32[4];
   uint64_t u64[2];
   ulong2 ul2[1], native;
-#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx942__) || defined(__gfx950__)
   inline __device__ BytePack<16>() = default;
   inline __device__ BytePack<16>(const BytePack<16>& other) {
     *this = other;
@@ -164,7 +163,6 @@ union alignas(16) BytePack<16> {
     u64[1] = other.u64[1];
     return *this;
   }
-#endif
 };
 template<int Size>
 union BytePack {
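
This commit removes the #if guard, so the explicit default constructor and the
word-wise copy path are now compiled unconditionally rather than only for
indirect-function-call builds or gfx942/gfx950. For context, a host-side sketch
of the same pattern (Pack16 is a hypothetical stand-in, not the RCCL type):

#include <cstdint>
#include <cstdio>

// 16 bytes aliased as 4x u32 or 2x u64; copy assignment is done as two
// 64-bit word moves, mirroring BytePack<16> above.
union alignas(16) Pack16 {
  uint32_t u32[4];
  uint64_t u64[2];
  Pack16() = default;
  Pack16(const Pack16& other) { *this = other; }
  Pack16& operator=(const Pack16& other) {
    u64[0] = other.u64[0];
    u64[1] = other.u64[1];
    return *this;
  }
};

int main() {
  Pack16 a;
  a.u64[0] = 0x1122334455667788ull;
  a.u64[1] = 0x99aabbccddeeff00ull;
  Pack16 b = a;  // exercises the explicit word-wise copy
  printf("%016llx %016llx\n", (unsigned long long)b.u64[1],
         (unsigned long long)b.u64[0]);
  return 0;
}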

src/enqueue.cc

Lines changed: 21 additions & 7 deletions

@@ -392,7 +392,9 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) {
   devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr;
   devWork.oneNode = (comm->nNodes == 1);
   devWork.rcclUseOneSlice = comm->rcclUseOneSlice;
-
+  //[Added-comment] opCount is missing for collDevWork, adding here
+  devWork.opCount = task->opCount;
+
   devWork.isOneRPN = comm->isOneRPN;
   devWork.netRegUsed = devWork.regUsed = 0;
   devWork.gfx9CheapFenceOff = gfx9CheapFenceOff(devWork, comm->gfx9CheapFenceOff);
@@ -488,6 +490,14 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool
     WARN("%s: unsupported collective. Please ensure the collective has been enabled in build.", __func__);
     return ncclInvalidUsage;
   }
+
+  if (!rcclIsArchSupportedForFunc(&agg, comm->archName)) {
+    WARN("%s: unsupported architecture (%s) for collective %s(%s, %s, %s, %s, Acc=%d, Pipeline=%d).",
+      __func__, comm->archName,
+      ncclFuncToString(task->func), ncclAlgoToString(task->algorithm), ncclProtoToString(task->protocol),
+      ncclDevRedOpToString(task->opDev.op), ncclDatatypeToString(task->datatype), (agg.acc != nullptr), agg.pipeline);
+    return ncclInvalidUsage;
+  }

   int isCollnet=0, isNvls=0;
   switch (agg.algorithm) {
@@ -898,7 +908,7 @@ static ncclResult_t scheduleCollTasksToPlan(
   return ncclSuccess;
 }

-NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 16384);
+NCCL_PARAM(P2pLLThreshold, "P2P_LL_THRESHOLD", 8192);
 RCCL_PARAM(P2pNetThreshold, "P2P_NET_THRESHOLD", 131072);
 NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);

@@ -1597,7 +1607,7 @@ static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturin
   if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; }
   *mode = 12030 <= std::min<int>(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial;
 #else
-  *mode = ncclImplicitOrderNone;
+  *mode = ncclImplicitOrderSerial;
 #endif
   return ncclSuccess;
 }
@@ -1886,6 +1896,12 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern
     // hostStreamPlanTask directly
     NCCLCHECK(hostStreamPlanTask(comm, plan));
   }
+
+  // Increment the opCount for intranode comms as well. Previously if proxyOpQueue was empty
+  // opCount was not incremented because ncclProxyStart wasn't called in hostStreamPlanTask
+  if (!plan->persistent && ncclIntruQueueHead(&plan->proxyOpQueue) == nullptr) {
+    comm->opCount++;
+  }
   return ncclSuccess;
 }

@@ -1913,10 +1929,10 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
   ncclIntruQueueConstruct(&planner->planQueue);

   bool capturing = ncclCudaGraphValid(planner->capturingGraph);
-  //cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch // unused variable - compiler warning
+  cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch
   cudaStream_t deviceStream, launchOrder;
-
   cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent;
+  CUDACHECK(cudaEventRecord(finishedEvent, launchStream));

   if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) {
     comm->workFifoProducedLastRecorded = comm->workFifoProduced;
@@ -1931,8 +1947,6 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) {
   }

   if (capturing || planner->numStreams != 1) {
-    // CUDACHECK(cudaEventRecord(finishedEvent, launchStream));
-
     // deviceStream waits on userStream[0]
     NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream));
src/include/rccl_common.h

Lines changed: 1 addition & 0 deletions

@@ -117,4 +117,5 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
 ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
 bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion, int firmwareVersion, const char* archName);
 int parseFirmwareVersion();
+bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, char const* archName);
 #endif

src/include/recorder.h

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 #include <vector>
 #include <mutex>
 #include <chrono>
-#include "debug.h"

 namespace rccl
 {

src/include/rocmwrap.h

Lines changed: 12 additions & 0 deletions

@@ -9,6 +9,7 @@
 #define NCCL_ROCMWRAP_H_

 #include <hsa/hsa.h>
+#include "checks.h"

 typedef hsa_status_t (*PFN_hsa_init)();
 typedef hsa_status_t (*PFN_hsa_system_get_info)(hsa_system_info_t attribute, void* value);
@@ -85,6 +86,17 @@ extern CUmemAllocationHandleType ncclCuMemHandleType;

 ncclResult_t rocmLibraryInit(void);

+extern int ncclCudaDriverVersionCache;
 extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()

+inline ncclResult_t ncclCudaDriverVersion(int* driver) {
+  int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
+  if (version == -1) {
+    CUDACHECK(cudaDriverGetVersion(&version));
+    __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
+  }
+  *driver = version;
+  return ncclSuccess;
+}
+
 #endif
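
The new inline helper caches the driver version in a relaxed atomic so repeated
callers skip the cudaDriverGetVersion() query. A self-contained sketch of the
same caching pattern, with expensiveQuery() as a stand-in for the real driver
call:

#include <atomic>
#include <cstdio>

static std::atomic<int> g_versionCache{-1};  // -1 means "not queried yet"

static int expensiveQuery() { return 60443; }  // hypothetical driver version

int getDriverVersion() {
  int v = g_versionCache.load(std::memory_order_relaxed);
  if (v == -1) {
    // Racing threads may both run the query, but they store the same value,
    // so relaxed ordering is sufficient for this idempotent cache.
    v = expensiveQuery();
    g_versionCache.store(v, std::memory_order_relaxed);
  }
  return v;
}

int main() {
  printf("driver version: %d\n", getDriverVersion());  // performs the query
  printf("driver version: %d\n", getDriverVersion());  // served from cache
  return 0;
}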

src/init.cc

Lines changed: 45 additions & 11 deletions

@@ -199,6 +199,22 @@ ncclResult_t checkHsaEnvSetting() {
   }
   return ncclSuccess;
 }
+
+// Fail the job if build flag HIP_HOST_UNCACHED_MEMORY is not set on mi350x
+ncclResult_t checkHostUncacheMemSetting(struct ncclComm* comm) {
+#if defined(HIP_HOST_UNCACHED_MEMORY)
+  return ncclSuccess;
+#else
+  if (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) {
+    ERROR("Build flag HIP_HOST_UNCACHED_MEMORY must be set to avoid memory corruption on mi350x");
+    return ncclSystemError;
+  }
+  else {
+    return ncclSuccess;
+  }
+#endif
+}
+
 static void initOnceFunc() {
   NCCLCHECKGOTO(checkHsaEnvSetting(), initResult, exit);
   initEnv();
@@ -1508,8 +1524,16 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
       allGather3Data[rank].nc = 4;
     }
   }
+  // For single-node communicators that do not use the full xGMI links per GPU (i.e., nranks < 8),
+  // inflate nChannels a bit to achieve higher bandwidth.
   if (IsArchMatch(comm->topo->nodes[GPU].nodes[idx].gpu.gcn, "gfx950")) {
-    allGather3Data[rank].nc = 4;
+    if (nranks == 2 && nNodes == 1) {
+      allGather3Data[rank].nc = 16;
+    } else if (nranks == 4 && nNodes == 1) {
+      allGather3Data[rank].nc = 8;
+    } else {
+      allGather3Data[rank].nc = 4;
+    }
   }

   allGather3Data[rank].pivotA2AEnabled = comm->topo->pivotA2AEnabled && rcclParamPivotAlltoallEnable();
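
The new branch is a small lookup keyed on communicator size. A standalone
sketch of the heuristic (the function name is hypothetical):

#include <cstdio>

// gfx950 channel-count heuristic from the hunk above: single-node
// communicators with fewer than 8 ranks get extra channels to soak up
// otherwise idle xGMI bandwidth.
int gfx950ChannelCount(int nRanks, int nNodes) {
  if (nNodes == 1 && nRanks == 2) return 16;
  if (nNodes == 1 && nRanks == 4) return 8;
  return 4;
}

int main() {
  for (int r : {2, 4, 8})
    printf("nranks=%d, single node -> nc=%d\n", r, gfx950ChannelCount(r, 1));
  return 0;
}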
@@ -1873,8 +1897,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   }
   NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail);

-  INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
-
+  INFO(NCCL_INIT, "comm:%p, nRanks:%d, nNodes:%d, coll channels:%d collnet channels:%d, nvls channels:%d, p2p channels:%d, p2p channels per peer:%d", comm, comm->nRanks, comm->nNodes, comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+
   if (comm->intraRank == 0) { // Load ncclParamLaunchMode
     const char* str = ncclGetEnv("NCCL_LAUNCH_MODE");
     enum ncclLaunchMode mode, modeOld;
@@ -2053,9 +2077,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   double sum_timers = 0;
   uint64_t timers[TIMERS_INIT_COUNT] = {0};
   unsigned long long commIdHash;
+  char* archName;
+  int cuCount;
+  hipDeviceProp_t devProp;
+
 #ifdef USE_INDIRECT_FUNCTION_CALL
   int64_t stackSize;
-  hipDeviceProp_t devProp;
 #endif

   timers[TIMER_INIT_TOTAL] = clockNano();
@@ -2065,16 +2092,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail);
   cudaArch = 100*archMajor + 10*archMinor;

+  CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
+  cuCount = devProp.multiProcessorCount;
+  archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
+  strcpy(archName, devProp.gcnArchName);
+
   timers[TIMER_INIT_KERNELS] = clockNano();
   NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes));
   // Set the maximum kernel stack size of all kernels to avoid
   // a CUDA memory reconfig on load (c.f. NVSHMEM issue)
 #ifdef USE_INDIRECT_FUNCTION_CALL
-  CUDACHECK(hipGetDeviceProperties(&devProp, 0));
-  if (ncclParamSetStackSize() == 1 && !IsArchMatch(devProp.gcnArchName,"gfx942") && !IsArchMatch(devProp.gcnArchName,"gfx950")) {
+  if (ncclParamSetStackSize() == 1 && !IsArchMatch(archName,"gfx942") && !IsArchMatch(archName,"gfx950")) {
     stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes;
     if (stackSize == 0) {
-      if (IsArchMatch(devProp.gcnArchName,"gfx906"))
+      if (IsArchMatch(archName,"gfx906"))
         stackSize = 1024;
       else
         stackSize = 512;
@@ -2127,9 +2158,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
     timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP];
   }
   comm->cudaArch = cudaArch;
+  comm->archName = archName;
+  comm->cuCount = cuCount;

   NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail);
-
+
+  // Check if using host uncached mem correctly
+  NCCLCHECK(checkHostUncacheMemSetting(comm));
+
   // RCCL: determine and set unroll factor for comm
   NCCLCHECK(commSetUnrollFactor(comm));

@@ -2151,9 +2187,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
   if (rcclParamMscclppEnabled()) {
 #ifdef ENABLE_MSCCLPP
     if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
-      hipDeviceProp_t devProp;
-      CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
-      comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx942") || IsArchMatch(devProp.gcnArchName, "gfx950");
+      comm->mscclppCompatible = IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950");
       if (comm->mscclppCompatible) {
         bool mapContainsId = (mscclpp_uniqueIdMap.count(*job->commId) > 0);
         auto& mscclppUniqueId = mscclpp_uniqueIdMap[*job->commId];

src/misc/recorder.cc

Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@
 #include <string>
 #include <iomanip>
 #include <sys/syscall.h>
+#include "debug.h"

 using namespace std::chrono;
src/misc/rocmwrap.cc

Lines changed: 2 additions & 0 deletions

@@ -28,6 +28,8 @@ DECLARE_ROCM_PFN(hsa_status_string);

 static void *hsaLib;
 static uint16_t version_major, version_minor;
+
+int ncclCudaDriverVersionCache = -1;
 bool ncclCudaLaunchBlocking = false;

 static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT;

src/rccl_wrap.cc

Lines changed: 22 additions & 4 deletions

@@ -451,15 +451,13 @@ ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count,
 }

 ncclResult_t commSetUnrollFactor(struct ncclComm* comm) {
-  hipDeviceProp_t devProp;
-  CUDACHECK(hipGetDeviceProperties(&devProp, comm->cudaDev));
-  if(IsArchMatch(devProp.gcnArchName, "gfx950")) {
+  if(IsArchMatch(comm->archName, "gfx950")) {
     if(comm->nNodes == 1)
       comm->unroll = NCCL_UNROLL_1;
     else
       comm->unroll = NCCL_UNROLL_2;
   }
-  else if(IsArchMatch(devProp.gcnArchName, "gfx908") || ((IsArchMatch(devProp.gcnArchName, "gfx942") && devProp.multiProcessorCount > 80)))
+  else if(IsArchMatch(comm->archName, "gfx908") || ((IsArchMatch(comm->archName, "gfx942") && comm->cuCount > 80)))
     comm->unroll = NCCL_UNROLL_2;
   else
     comm->unroll = NCCL_UNROLL_4;
@@ -535,3 +533,23 @@ bool validHsaScratchEnvSetting(const char*hsaScratchEnv, int hipRuntimeVersion,
   }
   return true;
 }
+
+// Should match get_arch_guard() in generate.py
+bool rcclIsArchSupportedForFunc(struct ncclTaskColl* info, const char* archName) {
+  bool supported = true;
+
+  if (info->protocol == NCCL_PROTO_LL128) {
+#if defined(ENABLE_LL128)
+    if (info->acc)
+      supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950"));
+    else
+      supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950") || IsArchMatch(archName, "gfx90a"));
+#else
+    supported = false;
+#endif
+  } else if (info->acc) {
+    supported = (IsArchMatch(archName, "gfx942") || IsArchMatch(archName, "gfx950"));
+  }
+
+  return supported;
+}
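
commSetUnrollFactor() now reads the cached comm->archName and comm->cuCount
instead of re-querying device properties on every call. Its decision table, as
a standalone sketch (assuming NCCL_UNROLL_1/2/4 map to unroll factors 1, 2,
and 4, with archMatch() as a simplified stand-in for IsArchMatch):

#include <cstdio>
#include <cstring>

// Simplified prefix match; RCCL's IsArchMatch is the real check.
static bool archMatch(const char* archName, const char* prefix) {
  return strncmp(archName, prefix, strlen(prefix)) == 0;
}

// Unroll-factor table mirroring commSetUnrollFactor().
int unrollFactor(const char* archName, int cuCount, int nNodes) {
  if (archMatch(archName, "gfx950")) return (nNodes == 1) ? 1 : 2;
  if (archMatch(archName, "gfx908")) return 2;
  if (archMatch(archName, "gfx942") && cuCount > 80) return 2;
  return 4;
}

int main() {
  printf("gfx950, 1 node : %d\n", unrollFactor("gfx950", 256, 1));
  printf("gfx942, 304 CUs: %d\n", unrollFactor("gfx942", 304, 1));
  printf("gfx90a         : %d\n", unrollFactor("gfx90a", 110, 2));
  return 0;
}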

test/ext-plugins/.gitignore

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+# Ignore Python cache and virtual environment folders
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Ignore pytest cache
+.pytest_cache/
+.cache/
+
+# Ignore log folders
+logs/
+log/
+*.log
+
+# Ignore virtual environment folders
+venv/
+
+# Ignore build artifacts
+build/
