diff --git a/CMakeLists.txt b/CMakeLists.txt index 892968cce3..efcf6b411b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP" option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF) option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF) option(ENABLE_NPKIT "Enable NPKit" OFF) -option(ENABLE_IFC "Enable indirect function call" OFF) +option(ENABLE_IFC "Enable indirect function call" ON) option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(REPORT_KERNEL_RESOURCE_USE "Append -Rpass-analysis=kernel to CXX flags" OFF) diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 2dc1f0417e..3e66e9dedd 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -11,11 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif #if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; int npKitCtxIdx = bid; // unused variable - compiler warning diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 45b06be119..0ccbe97f11 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -15,11 +15,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const int nranks = ncclShmem.comm.nRanks; @@ -211,11 +207,7 @@ namespace { } template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif #if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; int npKitCtxIdx = bid; // unused variable - compiler warning @@ -359,11 +351,7 @@ namespace { } template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif #if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning #endif diff --git a/src/device/alltoall_pivot.h b/src/device/alltoall_pivot.h index 9e988605e9..3d4371521f 100644 --- a/src/device/alltoall_pivot.h +++ b/src/device/alltoall_pivot.h @@ -10,11 +10,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif const int bid = ncclShmem.channelId - work->channelLo; const int nranks = ncclShmem.comm.nRanks; size_t count, partOffset, partCount, chunkCount; diff --git a/src/device/broadcast.h b/src/device/broadcast.h index 364e87ee2b..b5f88cd8ea 100644 --- a/src/device/broadcast.h +++ b/src/device/broadcast.h @@ -10,11 +10,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif #if defined(ENABLE_NPKIT) const int bid = ncclShmem.channelId - work->channelLo; int npKitCtxIdx = bid; // unused variable - compiler warning diff --git a/src/device/common.cu b/src/device/common.cu index 36d396fbb8..99746f7869 100644 --- a/src/device/common.cu +++ b/src/device/common.cu @@ -38,8 +38,4 @@ __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Gener } #endif -#ifdef USE_INDIRECT_FUNCTION_CALL __device__ void ncclDevFunc_Nop(); -#else -__device__ __attribute__((noinline)) void ncclDevFunc_Nop(); -#endif diff --git a/src/device/common.h b/src/device/common.h index ada2eba98f..30a99d6e84 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -601,7 +601,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { SpecializedRunWorkBatch().run(); } else { -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx950__) if (COLL_UNROLL == 1) ncclDevFuncTable_1[ncclShmem.funcId](); else if (COLL_UNROLL == 2) @@ -667,7 +667,7 @@ __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONST } #else #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \ - __device__ __attribute__((noinline)) void ncclDevFunc_##suffix() { \ + __device__ void ncclDevFunc_##suffix() { \ RunWorkBatch, algo, proto, acc, unroll, pipeline>().run(); \ } #endif diff --git a/src/device/generate.py b/src/device/generate.py index ef2ecf91b2..37a86bd04c 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -170,11 +170,7 @@ class Fn: def __iter__(self): return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll)) -def calc_unroll_and_pipeline_for_local_arch(): - - if not is_local_arch_only: - return (all_unrolls, all_pipelines) - +def get_gfx_targets(): rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo" res = subprocess.run([rocminfo_path], stdout=subprocess.PIPE, universal_newlines=True) @@ -194,10 +190,22 @@ def calc_unroll_and_pipeline_for_local_arch(): cu_count = int(line.split(':')[-1].strip()) gfx_targets[(curr_name, cu_count)] = None curr_name = None + return list(gfx_targets.keys()) + +def compiling_for_arch(arch): + gfx_targets = get_gfx_targets() + for gfx_name, cu_count in gfx_targets: + if gfx_name == arch: + return True + return False + +def calc_unroll_and_pipeline_for_local_arch(): + if not is_local_arch_only: + return (all_unrolls, all_pipelines) # We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts # Use (gfx_name, cu_count) as key for dictionary and convert it to list here - gfx_targets = list(gfx_targets.keys()) + gfx_targets = get_gfx_targets() # Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950) if len(gfx_targets) == 1: @@ -370,8 +378,7 @@ def get_arch_guard(fn): print("-- Generating %s" % os.path.join(gensrc, "device_table.h")) out = f.write - if is_ifc: func_declaration = "__device__ void" - else: func_declaration = "__device__ __attribute__((noinline)) void" + func_declaration = "__device__ void" for fn in primary_funcs: sym = paste("_", "ncclDevFunc", *fn) @@ -399,7 +406,7 @@ def get_arch_guard(fn): out("nullptr};\n") out("\n") - if not is_ifc: + if not is_ifc or compiling_for_arch("gfx950"): for unroll in all_unrolls: out(f"template\n" f"struct Caller{unroll} {{\n" diff --git a/src/device/reduce.h b/src/device/reduce.h index 4ca3fb28cb..11dd63359a 100644 --- a/src/device/reduce.h +++ b/src/device/reduce.h @@ -11,11 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif ncclRing *ring = &ncclShmem.channel.ring; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index e5c6896143..04e711f6df 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -11,11 +11,7 @@ namespace { template -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#else - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { -#endif ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index 5d2445dec1..ba6f498945 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -134,11 +134,7 @@ struct RunWorkBatch