Skip to content

Commit 879b1d4

Browse files
authored
Fix inlining behaviour at the NVVM IR level (#247)
PR #181 aimed to align the behaviour of the `inline` kwarg with that of upstream Numba, in that it now forces inlining at the Numba IR level. It turns out that this kwarg in Numba-CUDA already had the prior effect of enabling inlining at the NVVM IR level. Because the default value of `inline` is `"never"` (a non-empty string), the `compile_cuda()` function interpreted it as a truthy value, and every device function got marked with the `alwaysinline` function attribute. This is a minor problem in that it probably forces a lot of inlining that we don't want, but also a major problem in that it triggers an NVVM bug, only resolved in CUDA 12.3, that causes a hang in `nvvmCompileProgram()`.

To rectify these issues, we add the `forceinline` kwarg to the `@cuda.jit` decorator and the `cuda.compile[_*]()` functions. Now, `compile_cuda()` will only enable inlining at the NVVM IR level for `forceinline` and not `inline`. This is aligned with the behaviour of upstream Numba (see numba/numba#10068). We now document the `inline` and `forceinline` kwargs to clarify the intent and behaviour for users.

For clarity, the behaviour is now:

- The `inline` kwarg enables inlining only at the Numba IR level.
- The `forceinline` kwarg enables inlining only at the NVVM IR level.
1 parent 68e86a5 commit 879b1d4

File tree

4 files changed

+138
-7
lines changed

4 files changed

+138
-7
lines changed

numba_cuda/numba/cuda/compiler.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def compile_cuda(
278278
args,
279279
debug=False,
280280
lineinfo=False,
281-
inline=False,
281+
forceinline=False,
282282
fastmath=False,
283283
nvvm_options=None,
284284
cc=None,
@@ -316,7 +316,7 @@ def compile_cuda(
316316
else:
317317
flags.error_model = "numpy"
318318

319-
if inline:
319+
if forceinline:
320320
flags.forceinline = True
321321
if fastmath:
322322
flags.fastmath = True
@@ -574,6 +574,7 @@ def compile(
574574
abi="c",
575575
abi_info=None,
576576
output="ptx",
577+
forceinline=False,
577578
):
578579
"""Compile a Python function to PTX or LTO-IR for a given set of argument
579580
types.
@@ -614,6 +615,11 @@ def compile(
614615
:type abi_info: dict
615616
:param output: Type of output to generate, either ``"ptx"`` or ``"ltoir"``.
616617
:type output: str
618+
:param forceinline: Enables inlining at the NVVM IR level when set to
619+
``True``. This is accomplished by adding the
620+
``alwaysinline`` function attribute to the function
621+
definition. This is only valid when the output is
622+
``"ltoir"``.
617623
:return: (code, resty): The compiled code and inferred return type
618624
:rtype: tuple
619625
"""
@@ -626,6 +632,12 @@ def compile(
626632
if output not in ("ptx", "ltoir"):
627633
raise NotImplementedError(f"Unsupported output type: {output}")
628634

635+
if forceinline and not device:
636+
raise ValueError("Cannot force-inline kernels")
637+
638+
if forceinline and output != "ltoir":
639+
raise ValueError("Can only designate forced inlining in LTO-IR")
640+
629641
debug = config.CUDA_DEBUGINFO_DEFAULT if debug is None else debug
630642
opt = (config.OPT != 0) if opt is None else opt
631643

@@ -660,6 +672,7 @@ def compile(
660672
fastmath=fastmath,
661673
nvvm_options=nvvm_options,
662674
cc=cc,
675+
forceinline=forceinline,
663676
)
664677
resty = cres.signature.return_type
665678

@@ -699,6 +712,7 @@ def compile_for_current_device(
699712
abi="c",
700713
abi_info=None,
701714
output="ptx",
715+
forceinline=False,
702716
):
703717
"""Compile a Python function to PTX or LTO-IR for a given signature for the
704718
current device's compute capability. This calls :func:`compile` with an
@@ -716,6 +730,7 @@ def compile_for_current_device(
716730
abi=abi,
717731
abi_info=abi_info,
718732
output=output,
733+
forceinline=forceinline,
719734
)
720735

721736

@@ -730,6 +745,7 @@ def compile_ptx(
730745
opt=None,
731746
abi="numba",
732747
abi_info=None,
748+
forceinline=False,
733749
):
734750
"""Compile a Python function to PTX for a given signature. See
735751
:func:`compile`. The defaults for this function are to compile a kernel
@@ -747,6 +763,7 @@ def compile_ptx(
747763
abi=abi,
748764
abi_info=abi_info,
749765
output="ptx",
766+
forceinline=forceinline,
750767
)
751768

752769

@@ -760,6 +777,7 @@ def compile_ptx_for_current_device(
760777
opt=None,
761778
abi="numba",
762779
abi_info=None,
780+
forceinline=False,
763781
):
764782
"""Compile a Python function to PTX for a given signature for the current
765783
device's compute capability. See :func:`compile_ptx`."""
@@ -775,6 +793,7 @@ def compile_ptx_for_current_device(
775793
opt=opt,
776794
abi=abi,
777795
abi_info=abi_info,
796+
forceinline=forceinline,
778797
)
779798

780799

numba_cuda/numba/cuda/decorators.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def jit(
1717
func_or_sig=None,
1818
device=False,
1919
inline="never",
20+
forceinline=False,
2021
link=[],
2122
debug=None,
2223
opt=None,
@@ -39,6 +40,14 @@ def jit(
3940
.. note:: A kernel cannot have any return value.
4041
:param device: Indicates whether this is a device function.
4142
:type device: bool
43+
:param inline: Enables inlining at the Numba IR level when set to
44+
``"always"``. See `Notes on Inlining
45+
<https://numba.readthedocs.io/en/stable/developer/inlining.html>`_.
46+
:type inline: str
47+
:param forceinline: Enables inlining at the NVVM IR level when set to
48+
``True``. This is accomplished by adding the ``alwaysinline`` function
49+
attribute to the function definition.
50+
:type forceinline: bool
4251
:param link: A list of files containing PTX or CUDA C/C++ source to link
4352
with the function
4453
:type link: list
@@ -85,7 +94,9 @@ def jit(
8594
DeprecationWarning(
8695
"Passing bool to inline argument is deprecated, please refer to "
8796
"Numba's documentation on inlining: "
88-
"https://numba.readthedocs.io/en/stable/developer/inlining.html"
97+
"https://numba.readthedocs.io/en/stable/developer/inlining.html. "
98+
"You may have wanted the forceinline argument instead, to force "
99+
"inlining at the NVVM IR level."
89100
)
90101

91102
inline = "always" if inline else "never"
@@ -140,6 +151,7 @@ def _jit(func):
140151
targetoptions["fastmath"] = fastmath
141152
targetoptions["device"] = device
142153
targetoptions["inline"] = inline
154+
targetoptions["forceinline"] = forceinline
143155
targetoptions["extensions"] = extensions
144156

145157
disp = CUDADispatcher(func, targetoptions=targetoptions)
@@ -182,6 +194,7 @@ def autojitwrapper(func):
182194
func,
183195
device=device,
184196
inline=inline,
197+
forceinline=forceinline,
185198
debug=debug,
186199
opt=opt,
187200
lineinfo=lineinfo,
@@ -206,6 +219,7 @@ def autojitwrapper(func):
206219
targetoptions["fastmath"] = fastmath
207220
targetoptions["device"] = device
208221
targetoptions["inline"] = inline
222+
targetoptions["forceinline"] = forceinline
209223
targetoptions["extensions"] = extensions
210224
disp = CUDADispatcher(func_or_sig, targetoptions=targetoptions)
211225

numba_cuda/numba/cuda/dispatcher.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def __init__(
137137
debug=False,
138138
lineinfo=False,
139139
inline=False,
140+
forceinline=False,
140141
fastmath=False,
141142
extensions=None,
142143
max_registers=None,
@@ -182,7 +183,7 @@ def __init__(
182183
self.argtypes,
183184
debug=self.debug,
184185
lineinfo=lineinfo,
185-
inline=inline,
186+
forceinline=forceinline,
186187
fastmath=fastmath,
187188
nvvm_options=nvvm_options,
188189
cc=cc,
@@ -1073,7 +1074,7 @@ def compile_device(self, args, return_type=None):
10731074
with self._compiling_counter:
10741075
debug = self.targetoptions.get("debug")
10751076
lineinfo = self.targetoptions.get("lineinfo")
1076-
inline = self.targetoptions.get("inline")
1077+
forceinline = self.targetoptions.get("forceinline")
10771078
fastmath = self.targetoptions.get("fastmath")
10781079

10791080
nvvm_options = {
@@ -1091,7 +1092,7 @@ def compile_device(self, args, return_type=None):
10911092
args,
10921093
debug=debug,
10931094
lineinfo=lineinfo,
1094-
inline=inline,
1095+
forceinline=forceinline,
10951096
fastmath=fastmath,
10961097
nvvm_options=nvvm_options,
10971098
cc=cc,

numba_cuda/numba/cuda/tests/cudapy/test_inline.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
)
99

1010

11+
@skip_on_cudasim("Cudasim does not support inline and forceinline")
1112
class TestCudaInline(CUDATestCase):
12-
@skip_on_cudasim("Cudasim does not support inline")
1313
def _test_call_inline(self, inline):
1414
"""Test @cuda.jit(inline=...)"""
1515
a = np.ones(2, dtype=np.int32)
@@ -42,6 +42,9 @@ def call_set_zero(a):
4242
# check that call was not inlined
4343
self.assertIsNotNone(match, msg=llvm_ir)
4444

45+
# alwaysinline should not be in the IR when the inline kwarg is used
46+
self.assertNotIn("alwaysinline", llvm_ir)
47+
4548
def test_call_inline_always(self):
4649
self._test_call_inline("always")
4750

@@ -54,6 +57,100 @@ def test_call_inline_true(self):
5457
def test_call_inline_false(self):
5558
self._test_call_inline(False)
5659

60+
def _test_call_forceinline(self, forceinline):
61+
"""Test @cuda.jit(forceinline=...)"""
62+
a = np.ones(2, dtype=np.int32)
63+
64+
sig = (types.int32[::1],)
65+
66+
@cuda.jit(forceinline=forceinline)
67+
def set_zero(a):
68+
a[0] = 0
69+
70+
@cuda.jit(sig)
71+
def call_set_zero(a):
72+
set_zero(a)
73+
74+
call_set_zero[1, 2](a)
75+
76+
expected = np.arange(2, dtype=np.int32)
77+
self.assertTrue(np.all(a == expected))
78+
79+
llvm_ir = call_set_zero.inspect_llvm(sig)
80+
pat = r"call [a-zA-Z0-9]* @"
81+
match = re.compile(pat).search(llvm_ir)
82+
83+
# Check that call was not inlined at the Numba IR level - the call
84+
# should still be present in the IR
85+
self.assertIsNotNone(match)
86+
87+
# Check the definition of set_zero - it is a definition where the
88+
# name does not include an underscore just before "set_zero", because
89+
# that would match the "call_set_zero" definition
90+
pat = r"define.*[^_]set_zero.*"
91+
match = re.compile(pat).search(llvm_ir)
92+
self.assertIsNotNone(match)
93+
if forceinline:
94+
self.assertIn("alwaysinline", match.group())
95+
else:
96+
self.assertNotIn("alwaysinline", match.group())
97+
98+
# The kernel, "call_set_zero", should never have "alwaysinline" set
99+
pat = r"define.*call_set_zero.*"
100+
match = re.compile(pat).search(llvm_ir)
101+
self.assertIsNotNone(match)
102+
self.assertNotIn("alwaysinline", match.group())
103+
104+
def test_call_forceinline_true(self):
105+
self._test_call_forceinline(True)
106+
107+
def test_call_forceinline_false(self):
108+
self._test_call_forceinline(False)
109+
110+
def test_compile_forceinline_ltoir_only(self):
111+
def set_zero(a):
112+
a[0] = 0
113+
114+
args = (types.float32[::1],)
115+
msg = r"Can only designate forced inlining in LTO-IR"
116+
with self.assertRaisesRegex(ValueError, msg):
117+
cuda.compile(
118+
set_zero,
119+
args,
120+
device=True,
121+
forceinline=True,
122+
)
123+
124+
def _compile_set_zero(self, forceinline):
125+
def set_zero(a):
126+
a[0] = 0
127+
128+
args = (types.float32[::1],)
129+
ltoir, resty = cuda.compile(
130+
set_zero,
131+
args,
132+
device=True,
133+
output="ltoir",
134+
forceinline=forceinline,
135+
)
136+
137+
# Sanity check
138+
self.assertEqual(resty, types.none)
139+
140+
return ltoir
141+
142+
def test_compile_forceinline(self):
143+
ltoir_noinline = self._compile_set_zero(False)
144+
ltoir_forceinline = self._compile_set_zero(True)
145+
146+
# As LTO-IR is opaque, the best we can do is check that changing the
147+
# flag resulted in a change in the generated LTO-IR in some way.
148+
self.assertNotEqual(
149+
ltoir_noinline,
150+
ltoir_forceinline,
151+
"forceinline flag appeared to have no effect on LTO-IR",
152+
)
153+
57154

58155
if __name__ == "__main__":
59156
unittest.main()

0 commit comments

Comments
 (0)