Skip to content

Commit a130722

Browse files
committed
Remove noinline attributes
1 parent c8da880 commit a130722

File tree

9 files changed

+11
-47
lines changed

9 files changed

+11
-47
lines changed

src/device/all_gather.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111

1212
namespace {
1313
template<typename T, typename RedOp, typename Proto, bool isNetOffload = false>
14-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
15-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
16-
#else
17-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
18-
#endif
14+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
1915
#if defined(ENABLE_NPKIT)
2016
const int bid = ncclShmem.channelId - work->channelLo;
2117
int npKitCtxIdx = bid; // unused variable - compiler warning

src/device/all_reduce.h

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,7 @@
1515

1616
namespace {
1717
template<typename T, typename RedOp, typename Proto, int RCCLMetadata>
18-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
19-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
20-
#else
21-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
22-
#endif
18+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
2319
ncclRing *ring = &ncclShmem.channel.ring;
2420
int ringIx = ring->index;
2521
const int nranks = ncclShmem.comm.nRanks;
@@ -211,11 +207,7 @@ namespace {
211207
}
212208

213209
template<typename T, typename RedOp, typename Proto>
214-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
215-
__device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
216-
#else
217-
__device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
218-
#endif
210+
__device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
219211
#if defined(ENABLE_NPKIT)
220212
const int bid = ncclShmem.channelId - work->channelLo;
221213
int npKitCtxIdx = bid; // unused variable - compiler warning
@@ -359,11 +351,7 @@ namespace {
359351
}
360352

361353
template<typename T, typename RedOp, typename Proto>
362-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
363-
__device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
364-
#else
365-
__device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
366-
#endif
354+
__device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
367355
#if defined(ENABLE_NPKIT)
368356
const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
369357
#endif

src/device/alltoall_pivot.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@
1010

1111
namespace {
1212
template<typename T, typename RedOp, typename Proto>
13-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
14-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
15-
#else
16-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
17-
#endif
13+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
1814
const int bid = ncclShmem.channelId - work->channelLo;
1915
const int nranks = ncclShmem.comm.nRanks;
2016
size_t count, partOffset, partCount, chunkCount;

src/device/broadcast.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,7 @@
1010

1111
namespace {
1212
template<typename T, typename RedOp, typename Proto>
13-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
14-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
15-
#else
16-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
17-
#endif
13+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
1814
#if defined(ENABLE_NPKIT)
1915
const int bid = ncclShmem.channelId - work->channelLo;
2016
int npKitCtxIdx = bid; // unused variable - compiler warning

src/device/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,7 @@ __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONST
667667
}
668668
#else
669669
#define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
670-
__device__ __attribute__((noinline)) void ncclDevFunc_##suffix() { \
670+
__device__ __forceinline__ void ncclDevFunc_##suffix() { \
671671
RunWorkBatch<coll, ty, redop<ty>, algo, proto, acc, unroll, pipeline>().run(); \
672672
}
673673
#endif

src/device/generate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ def get_arch_guard(fn):
369369
out = f.write
370370

371371
if is_ifc: func_declaration = "__device__ void"
372-
else: func_declaration = "__device__ __attribute__((noinline)) void"
372+
else: func_declaration = "__device__ __forceinline__ void"
373373

374374
for fn in primary_funcs:
375375
sym = paste("_", "ncclDevFunc", *fn)

src/device/reduce.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111

1212
namespace {
1313
template<typename T, typename RedOp, typename Proto>
14-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
15-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
16-
#else
17-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
18-
#endif
14+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
1915
ncclRing *ring = &ncclShmem.channel.ring;
2016
const int nranks = ncclShmem.comm.nRanks;
2117
const int rank = ncclShmem.comm.rank;

src/device/reduce_scatter.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111

1212
namespace {
1313
template<typename T, typename RedOp, typename Proto>
14-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
15-
__device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
16-
#else
17-
__device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
18-
#endif
14+
__device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
1915
ncclRing *ring = &ncclShmem.channel.ring;
2016
int const *ringRanks = ring->userRanks;
2117
const int nranks = ncclShmem.comm.nRanks;

src/device/sendrecv.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
134134
#endif
135135
}
136136

137-
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
138-
__device__ void run() {
139-
#else
140-
__device__ __attribute__((noinline)) void run() {
141-
#endif
137+
__device__ __forceinline__ void run() {
142138
const int tid = threadIdx.x;
143139
const int tn = blockDim.x;
144140
const int wid = tid/WARP_SIZE;

0 commit comments

Comments
 (0)