|
15 | 15 |
|
16 | 16 | namespace { |
17 | 17 | template<typename T, typename RedOp, typename Proto, int RCCLMetadata> |
18 | | -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) |
19 | | - __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { |
20 | | -#else |
21 | | - __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { |
22 | | -#endif |
| 18 | + __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { |
23 | 19 | ncclRing *ring = &ncclShmem.channel.ring; |
24 | 20 | int ringIx = ring->index; |
25 | 21 | const int nranks = ncclShmem.comm.nRanks; |
@@ -211,11 +207,7 @@ namespace { |
211 | 207 | } |
212 | 208 |
|
213 | 209 | template<typename T, typename RedOp, typename Proto> |
214 | | -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) |
215 | | - __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { |
216 | | -#else |
217 | | - __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { |
218 | | -#endif |
| 210 | + __device__ __forceinline__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) { |
219 | 211 | #if defined(ENABLE_NPKIT) |
220 | 212 | const int bid = ncclShmem.channelId - work->channelLo; |
221 | 213 | int npKitCtxIdx = bid; // unused variable - compiler warning |
@@ -359,11 +351,7 @@ namespace { |
359 | 351 | } |
360 | 352 |
|
361 | 353 | template<typename T, typename RedOp, typename Proto> |
362 | | -#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__) |
363 | | - __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { |
364 | | -#else |
365 | | - __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { |
366 | | -#endif |
| 354 | + __device__ __forceinline__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) { |
367 | 355 | #if defined(ENABLE_NPKIT) |
368 | 356 | const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning |
369 | 357 | #endif |
|
0 commit comments