Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ option(ENABLE_MSCCLPP_CLIP "Enable MSCCL++ CLIP"
option(ENABLE_MSCCLPP_EXECUTOR "Enable MSCCL++ Executor" OFF)
option(ENABLE_MSCCLPP_FORMAT_CHECKS "Enable formatting checks in MSCCL++" OFF)
option(ENABLE_NPKIT "Enable NPKit" OFF)
option(ENABLE_IFC "Enable indirect function call" OFF)
option(ENABLE_IFC "Enable indirect function call" ON)
option(GENERATE_SYM_KERNELS "Generate symmetric memory kernels" OFF)
option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
option(REPORT_KERNEL_RESOURCE_USE "Append -Rpass-analysis=kernel to CXX flags" OFF)
Expand Down
4 changes: 0 additions & 4 deletions src/device/all_gather.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto, bool isNetOffload = false>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
Expand Down
12 changes: 0 additions & 12 deletions src/device/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@

namespace {
template<typename T, typename RedOp, typename Proto, int RCCLMetadata>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const int nranks = ncclShmem.comm.nRanks;
Expand Down Expand Up @@ -211,11 +207,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
Expand Down Expand Up @@ -359,11 +351,7 @@ namespace {
}

template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
#endif
Expand Down
4 changes: 0 additions & 4 deletions src/device/alltoall_pivot.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
const int bid = ncclShmem.channelId - work->channelLo;
const int nranks = ncclShmem.comm.nRanks;
size_t count, partOffset, partCount, chunkCount;
Expand Down
4 changes: 0 additions & 4 deletions src/device/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
#if defined(ENABLE_NPKIT)
const int bid = ncclShmem.channelId - work->channelLo;
int npKitCtxIdx = bid; // unused variable - compiler warning
Expand Down
4 changes: 0 additions & 4 deletions src/device/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,4 @@ __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Gener
}
#endif

#ifdef USE_INDIRECT_FUNCTION_CALL
__device__ void ncclDevFunc_Nop();
#else
__device__ __attribute__((noinline)) void ncclDevFunc_Nop();
#endif
2 changes: 1 addition & 1 deletion src/device/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONST
}
#else
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
__device__ __attribute__((noinline)) void ncclDevFunc_##suffix() { \
__device__ void ncclDevFunc_##suffix() { \
RunWorkBatch<coll, ty, redop<ty>, algo, proto, acc, unroll, pipeline>().run(); \
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/device/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ def get_arch_guard(fn):
out = f.write

if is_ifc: func_declaration = "__device__ void"
else: func_declaration = "__device__ __attribute__((noinline)) void"
else: func_declaration = "__device__ void"

for fn in primary_funcs:
sym = paste("_", "ncclDevFunc", *fn)
Expand Down
4 changes: 0 additions & 4 deletions src/device/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
ncclRing *ring = &ncclShmem.channel.ring;
const int nranks = ncclShmem.comm.nRanks;
const int rank = ncclShmem.comm.rank;
Expand Down
4 changes: 0 additions & 4 deletions src/device/reduce_scatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@

namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#else
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
#endif
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->userRanks;
const int nranks = ncclShmem.comm.nRanks;
Expand Down
6 changes: 1 addition & 5 deletions src/device/sendrecv.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
#endif
}

#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
__device__ void run() {
#else
__device__ __attribute__((noinline)) void run() {
#endif
__device__ void run() {
const int tid = threadIdx.x;
const int tn = blockDim.x;
const int wid = tid/WARP_SIZE;
Expand Down
Loading