diff --git a/CMakeLists.txt b/CMakeLists.txt
index 892968cce3..efcf6b411b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ option(ENABLE_MSCCLPP_CLIP                     "Enable MSCCL++ CLIP"
 option(ENABLE_MSCCLPP_EXECUTOR                 "Enable MSCCL++ Executor"                       OFF)
 option(ENABLE_MSCCLPP_FORMAT_CHECKS            "Enable formatting checks in MSCCL++"           OFF)
 option(ENABLE_NPKIT                            "Enable NPKit"                                  OFF)
-option(ENABLE_IFC                              "Enable indirect function call"                 OFF)
+option(ENABLE_IFC                              "Enable indirect function call"                 ON)
 option(GENERATE_SYM_KERNELS                    "Generate symmetric memory kernels"             OFF)
 option(INSTALL_DEPENDENCIES                    "Force install dependencies"                    OFF)
 option(REPORT_KERNEL_RESOURCE_USE              "Append -Rpass-analysis=kernel to CXX flags"    OFF)
diff --git a/src/device/all_gather.h b/src/device/all_gather.h
index 2dc1f0417e..3e66e9dedd 100644
--- a/src/device/all_gather.h
+++ b/src/device/all_gather.h
@@ -11,11 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto, bool isNetOffload = false>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
 #if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
     int npKitCtxIdx = bid; // unused variable - compiler warning
diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h
index 45b06be119..0ccbe97f11 100644
--- a/src/device/all_reduce.h
+++ b/src/device/all_reduce.h
@@ -15,11 +15,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto, int RCCLMetadata>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
     ncclRing *ring = &ncclShmem.channel.ring;
     int ringIx = ring->index;
     const int nranks = ncclShmem.comm.nRanks;
@@ -211,11 +207,7 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runTreeUpDown(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
 #if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
     int npKitCtxIdx = bid; // unused variable - compiler warning
@@ -359,11 +351,7 @@ namespace {
   }
 
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runTreeSplit(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
 #if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo; // unused variable - compiler warning
 #endif
diff --git a/src/device/alltoall_pivot.h b/src/device/alltoall_pivot.h
index 9e988605e9..3d4371521f 100644
--- a/src/device/alltoall_pivot.h
+++ b/src/device/alltoall_pivot.h
@@ -10,11 +10,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
     const int bid = ncclShmem.channelId - work->channelLo;
     const int nranks = ncclShmem.comm.nRanks;
     size_t count, partOffset, partCount, chunkCount;
diff --git a/src/device/broadcast.h b/src/device/broadcast.h
index 364e87ee2b..b5f88cd8ea 100644
--- a/src/device/broadcast.h
+++ b/src/device/broadcast.h
@@ -10,11 +10,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
 #if defined(ENABLE_NPKIT)
     const int bid = ncclShmem.channelId - work->channelLo;
     int npKitCtxIdx = bid; // unused variable - compiler warning
diff --git a/src/device/common.cu b/src/device/common.cu
index 36d396fbb8..99746f7869 100644
--- a/src/device/common.cu
+++ b/src/device/common.cu
@@ -38,8 +38,4 @@ __launch_bounds__(NCCL_MAX_NTHREADS, 1) __global__ void ncclDevKernelDebug_Gener
 }
 #endif
 
-#ifdef USE_INDIRECT_FUNCTION_CALL
 __device__ void ncclDevFunc_Nop();
-#else
-__device__ __attribute__((noinline)) void ncclDevFunc_Nop();
-#endif
diff --git a/src/device/common.h b/src/device/common.h
index ada2eba98f..30a99d6e84 100644
--- a/src/device/common.h
+++ b/src/device/common.h
@@ -601,7 +601,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a
     if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) {
       SpecializedRunWorkBatch().run();
     } else {
-#ifdef USE_INDIRECT_FUNCTION_CALL
+#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx950__)
       if (COLL_UNROLL == 1)
         ncclDevFuncTable_1[ncclShmem.funcId]();
       else if (COLL_UNROLL == 2)
@@ -667,7 +667,7 @@ __global__ void ncclDevKernelDebug_Generic_4(ncclDevKernelArgs4K NCCL_GRID_CONST
   }
 #else
 #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto, acc, pipeline, unroll) \
-  __device__ __attribute__((noinline)) void ncclDevFunc_##suffix() { \
+  __device__ void ncclDevFunc_##suffix() { \
     RunWorkBatch<coll, ty, redop<ty>, algo, proto, acc, unroll, pipeline>().run(); \
   }
 #endif
diff --git a/src/device/generate.py b/src/device/generate.py
index ef2ecf91b2..37a86bd04c 100755
--- a/src/device/generate.py
+++ b/src/device/generate.py
@@ -170,11 +170,7 @@ class Fn:
   def __iter__(self):
     return iter((self.coll, self.algo, self.proto, self.redop, self.ty, self.acc, self.pipeline, self.unroll))
 
-def calc_unroll_and_pipeline_for_local_arch():
-
-  if not is_local_arch_only:
-    return (all_unrolls, all_pipelines)
-
+def get_gfx_targets():
   rocminfo_path = os.environ.get('ROCM_PATH') + "/bin/rocminfo"
 
   res = subprocess.run([rocminfo_path], stdout=subprocess.PIPE, universal_newlines=True)
@@ -194,10 +190,22 @@ def calc_unroll_and_pipeline_for_local_arch():
       cu_count = int(line.split(':')[-1].strip())
       gfx_targets[(curr_name, cu_count)] = None
       curr_name = None
+  return list(gfx_targets.keys())
+
+def compiling_for_arch(arch):
+  gfx_targets = get_gfx_targets()
+  for gfx_name, cu_count in gfx_targets:
+    if gfx_name == arch:
+      return True
+  return False
+
+def calc_unroll_and_pipeline_for_local_arch():
+  if not is_local_arch_only:
+    return (all_unrolls, all_pipelines)
 
   # We want to remove duplicates but cannot use a dictionary since same gfx name can have different cu counts
   # Use (gfx_name, cu_count) as key for dictionary and convert it to list here
-  gfx_targets = list(gfx_targets.keys())
+  gfx_targets = get_gfx_targets()
   
   # Homogeneous system is required to build for only 1 variant of unroll factor (except for gfx950)
   if len(gfx_targets) == 1:
@@ -370,8 +378,7 @@ def get_arch_guard(fn):
   print("-- Generating %s" % os.path.join(gensrc, "device_table.h"))
   out = f.write
 
-  if is_ifc: func_declaration = "__device__ void"
-  else: func_declaration = "__device__ __attribute__((noinline)) void"
+  func_declaration = "__device__ void"
 
   for fn in primary_funcs:
     sym = paste("_", "ncclDevFunc", *fn)
@@ -399,7 +406,7 @@ def get_arch_guard(fn):
     out("nullptr};\n")
     out("\n")
 
-  if not is_ifc:
+  if not is_ifc or compiling_for_arch("gfx950"):
     for unroll in all_unrolls:
       out(f"template<unsigned short f, unsigned short l>\n"
           f"struct Caller{unroll} {{\n"
diff --git a/src/device/reduce.h b/src/device/reduce.h
index 4ca3fb28cb..11dd63359a 100644
--- a/src/device/reduce.h
+++ b/src/device/reduce.h
@@ -11,11 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
     ncclRing *ring = &ncclShmem.channel.ring;
     const int nranks = ncclShmem.comm.nRanks;
     const int rank = ncclShmem.comm.rank;
diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h
index e5c6896143..04e711f6df 100644
--- a/src/device/reduce_scatter.h
+++ b/src/device/reduce_scatter.h
@@ -11,11 +11,7 @@
 
 namespace {
   template<typename T, typename RedOp, typename Proto>
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
   __device__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#else
-  __device__ __attribute__((noinline)) void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) {
-#endif
     ncclRing *ring = &ncclShmem.channel.ring;
     int const *ringRanks = ring->userRanks;
     const int nranks = ncclShmem.comm.nRanks;
diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h
index 5d2445dec1..ba6f498945 100644
--- a/src/device/sendrecv.h
+++ b/src/device/sendrecv.h
@@ -134,11 +134,7 @@ struct RunWorkBatch<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPL
 #endif
   }
 
-#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx942__) && !defined(__gfx950__)
-  __device__  void run() {
-#else
-  __device__  __attribute__((noinline)) void run() {
-#endif
+  __device__ void run() {
     const int tid = threadIdx.x;
     const int tn = blockDim.x;
     const int wid = tid/WARP_SIZE;