Fix inlining behaviour at the NVVM IR level (#247)

gmarkall · web-flow · commit 879b1d4ef824 · 2025-05-09T15:55:33.000+01:00
PR #181 aimed to align the behaviour of the `inline` kwarg with that of upstream Numba, so that it now forces inlining at the Numba IR level. It turns out that this kwarg in Numba-CUDA already had the effect of enabling inlining at the NVVM IR level. Because the default value of `inline` is the string `"never"`, the `compile_cuda()` function interpreted it as a truthy value, and every device function got marked with the `alwaysinline` function attribute. This is a minor problem in that it probably forces a lot of inlining that we don't want, but also a major problem in that it triggers an NVVM bug, only resolved in CUDA 12.3, that causes a hang in `nvvmCompileProgram()`.

To rectify these issues, we add the `forceinline` kwarg to the `@cuda.jit` decorator and the `cuda.compile[_*]()` functions. Now, `compile_cuda()` will only enable inlining at the NVVM IR level for `forceinline` and not for `inline`. This is aligned with the behaviour of upstream Numba (see numba/numba#10068).

We now document the `inline` and `forceinline` kwargs to clarify the intent and behaviour for users.

For clarity, the behaviour is now:

- The `inline` kwarg enables inlining only at the Numba IR level.
- The `forceinline` kwarg enables inlining only at the NVVM IR level.
diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
@@ -278,7 +278,7 @@ def compile_cuda(
     args,
     debug=False,
     lineinfo=False,
-    inline=False,
+    forceinline=False,
     fastmath=False,
     nvvm_options=None,
     cc=None,
@@ -316,7 +316,7 @@ def compile_cuda(
     else:
         flags.error_model = "numpy"
 
-    if inline:
+    if forceinline:
         flags.forceinline = True
     if fastmath:
         flags.fastmath = True
@@ -574,6 +574,7 @@ def compile(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given set of argument
     types.
@@ -614,6 +615,11 @@ def compile(
     :type abi_info: dict
     :param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
     :type output: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+                        ``True``. This is accomplished by adding the
+                        ``alwaysinline`` function attribute to the function
+                        definition. This is only valid when the output is
+                        ``"ltoir"``.
     :return: (code, resty): The compiled code and inferred return type
     :rtype: tuple
     """
@@ -626,6 +632,12 @@ def compile(
     if output not in ("ptx", "ltoir"):
         raise NotImplementedError(f"Unsupported output type: {output}")
 
+    if forceinline and not device:
+        raise ValueError("Cannot force-inline kernels")
+
+    if forceinline and output != "ltoir":
+        raise ValueError("Can only designate forced inlining in LTO-IR")
+
     debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
     opt = (config.OPT != 0) if opt is None else opt
 
@@ -660,6 +672,7 @@ def compile(
         fastmath=fastmath,
         nvvm_options=nvvm_options,
         cc=cc,
+        forceinline=forceinline,
     )
     resty = cres.signature.return_type
 
@@ -699,6 +712,7 @@ def compile_for_current_device(
     abi="c",
     abi_info=None,
     output="ptx",
+    forceinline=False,
 ):
     """Compile a Python function to PTX or LTO-IR for a given signature for the
     current device's compute capabilility. This calls :func:`compile` with an
@@ -716,6 +730,7 @@ def compile_for_current_device(
         abi=abi,
         abi_info=abi_info,
         output=output,
+        forceinline=forceinline,
     )
 
 
@@ -730,6 +745,7 @@ def compile_ptx(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature. See
     :func:`compile`. The defaults for this function are to compile a kernel
@@ -747,6 +763,7 @@ def compile_ptx(
         abi=abi,
         abi_info=abi_info,
         output="ptx",
+        forceinline=forceinline,
     )
 
 
@@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
     opt=None,
     abi="numba",
     abi_info=None,
+    forceinline=False,
 ):
     """Compile a Python function to PTX for a given signature for the current
     device's compute capabilility. See :func:`compile_ptx`."""
@@ -775,6 +793,7 @@ def compile_ptx_for_current_device(
         opt=opt,
         abi=abi,
         abi_info=abi_info,
+        forceinline=forceinline,
     )
 
 
diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
@@ -17,6 +17,7 @@ def jit(
     func_or_sig=None,
     device=False,
     inline="never",
+    forceinline=False,
     link=[],
     debug=None,
     opt=None,
@@ -39,6 +40,14 @@ def jit(
        .. note:: A kernel cannot have any return value.
     :param device: Indicates whether this is a device function.
     :type device: bool
+    :param inline: Enables inlining at the Numba IR level when set to
+       ``"always"``. See `Notes on Inlining
+       <https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
+    :type inline: str
+    :param forceinline: Enables inlining at the NVVM IR level when set to
+       ``True``. This is accomplished by adding the ``alwaysinline`` function
+       attribute to the function definition.
+    :type forceinline: bool
     :param link: A list of files containing PTX or CUDA C/C++ source to link
        with the function
     :type link: list
@@ -85,7 +94,9 @@ def jit(
         DeprecationWarning(
             "Passing bool to inline argument is deprecated, please refer to "
             "Numba's documentation on inlining: "
-            "https://numba.readthedocs.io/en/stable/developer/inlining.html"
+            "https://numba.readthedocs.io/en/stable/developer/inlining.html. "
+            "You may have wanted the forceinline argument instead, to force "
+            "inlining at the NVVM IR level."
         )
 
         inline = "always" if inline else "never"
@@ -140,6 +151,7 @@ def _jit(func):
             targetoptions["fastmath"] = fastmath
             targetoptions["device"] = device
             targetoptions["inline"] = inline
+            targetoptions["forceinline"] = forceinline
             targetoptions["extensions"] = extensions
 
             disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -182,6 +194,7 @@ def autojitwrapper(func):
                         func,
                         device=device,
                         inline=inline,
+                        forceinline=forceinline,
                         debug=debug,
                         opt=opt,
                         lineinfo=lineinfo,
@@ -206,6 +219,7 @@ def autojitwrapper(func):
                 targetoptions["fastmath"] = fastmath
                 targetoptions["device"] = device
                 targetoptions["inline"] = inline
+                targetoptions["forceinline"] = forceinline
                 targetoptions["extensions"] = extensions
                 disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
 
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
@@ -137,6 +137,7 @@ def __init__(
         debug=False,
         lineinfo=False,
         inline=False,
+        forceinline=False,
         fastmath=False,
         extensions=None,
         max_registers=None,
@@ -182,7 +183,7 @@ def __init__(
             self.argtypes,
             debug=self.debug,
             lineinfo=lineinfo,
-            inline=inline,
+            forceinline=forceinline,
             fastmath=fastmath,
             nvvm_options=nvvm_options,
             cc=cc,
@@ -1073,7 +1074,7 @@ def compile_device(self, args, return_type=None):
             with self._compiling_counter:
                 debug = self.targetoptions.get("debug")
                 lineinfo = self.targetoptions.get("lineinfo")
-                inline = self.targetoptions.get("inline")
+                forceinline = self.targetoptions.get("forceinline")
                 fastmath = self.targetoptions.get("fastmath")
 
                 nvvm_options = {
@@ -1091,7 +1092,7 @@ def compile_device(self, args, return_type=None):
                     args,
                     debug=debug,
                     lineinfo=lineinfo,
-                    inline=inline,
+                    forceinline=forceinline,
                     fastmath=fastmath,
                     nvvm_options=nvvm_options,
                     cc=cc,
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_inline.py b/numba_cuda/numba/cuda/tests/cudapy/test_inline.py
@@ -8,8 +8,8 @@
 )
 
 
+@skip_on_cudasim("Cudasim does not support inline and forceinline")
 class TestCudaInline(CUDATestCase):
-    @skip_on_cudasim("Cudasim does not support inline")
     def _test_call_inline(self, inline):
         """Test @cuda.jit(inline=...)"""
         a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ def call_set_zero(a):
             # check that call was not inlined
             self.assertIsNotNone(match, msg=llvm_ir)
 
+        # alwaysinline should not be in the IR when the inline kwarg is used
+        self.assertNotIn("alwaysinline", llvm_ir)
+
     def test_call_inline_always(self):
         self._test_call_inline("always")
 
@@ -54,6 +57,100 @@ def test_call_inline_true(self):
     def test_call_inline_false(self):
         self._test_call_inline(False)
 
+    def _test_call_forceinline(self, forceinline):
+        """Test @cuda.jit(forceinline=...)"""
+        a = np.ones(2, dtype=np.int32)
+
+        sig = (types.int32[::1],)
+
+        @cuda.jit(forceinline=forceinline)
+        def set_zero(a):
+            a[0] = 0
+
+        @cuda.jit(sig)
+        def call_set_zero(a):
+            set_zero(a)
+
+        call_set_zero[1, 2](a)
+
+        expected = np.arange(2, dtype=np.int32)
+        self.assertTrue(np.all(a == expected))
+
+        llvm_ir = call_set_zero.inspect_llvm(sig)
+        pat = r"call [a-zA-Z0-9]* @"
+        match = re.compile(pat).search(llvm_ir)
+
+        # Check that call was not inlined at the Numba IR level - the call
+        # should still be present in the IR
+        self.assertIsNotNone(match)
+
+        # Check the definition of set_zero - it is a definition where the
+        # name does not include an underscore just before "set_zero", because
+        # that would match the "call_set_zero" definition
+        pat = r"define.*[^_]set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        if forceinline:
+            self.assertIn("alwaysinline", match.group())
+        else:
+            self.assertNotIn("alwaysinline", match.group())
+
+        # The kernel, "call_set_zero", should never have "alwaysinline" set
+        pat = r"define.*call_set_zero.*"
+        match = re.compile(pat).search(llvm_ir)
+        self.assertIsNotNone(match)
+        self.assertNotIn("alwaysinline", match.group())
+
+    def test_call_forceinline_true(self):
+        self._test_call_forceinline(True)
+
+    def test_call_forceinline_false(self):
+        self._test_call_forceinline(False)
+
+    def test_compile_forceinline_ltoir_only(self):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        msg = r"Can only designate forced inlining in LTO-IR"
+        with self.assertRaisesRegex(ValueError, msg):
+            cuda.compile(
+                set_zero,
+                args,
+                device=True,
+                forceinline=True,
+            )
+
+    def _compile_set_zero(self, forceinline):
+        def set_zero(a):
+            a[0] = 0
+
+        args = (types.float32[::1],)
+        ltoir, resty = cuda.compile(
+            set_zero,
+            args,
+            device=True,
+            output="ltoir",
+            forceinline=forceinline,
+        )
+
+        # Sanity check
+        self.assertEqual(resty, types.none)
+
+        return ltoir
+
+    def test_compile_forceinline(self):
+        ltoir_noinline = self._compile_set_zero(False)
+        ltoir_forceinline = self._compile_set_zero(True)
+
+        # As LTO-IR is opaque, the best we can do is check that changing the
+        # flag resulted in a change in the generated LTO-IR in some way.
+        self.assertNotEqual(
+            ltoir_noinline,
+            ltoir_forceinline,
+            "forceinline flag appeared to have no effect on LTO-IR",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()