Fix lineinfo generation when compile_internal is used (#271) (#287)

gmarkall · web-flow · commit 2fa741ce6114 · 2025-06-30T22:49:52.000+01:00
Lineinfo generation is broken by function implementations that generate code via `context.compile_internal()`. The root cause is that the implementation eventually reaches upstream Numba's `BaseContext._compile_subroutine_no_cache()` method, which ignores the flags in the context stack and creates new ones. As a result, the debug info kind is lost and the default debug info kind, `"FullDebug"`, is emitted; this in turn enables the PTX `debug` target, which produces deoptimized code. This change works around the issue (pending a fix upstream) by overriding the `_compile_subroutine_no_cache()` implementation to use the flags from the context stack when none are explicitly provided. The upstream fix is expected to be a similar modification of that method. The `CUDAFlags` class is moved to its own module to avoid circular import dependencies between `compiler.py` and `target.py`. Fixes #271.
diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
@@ -14,8 +14,6 @@
     sanitize_compile_result_entries,
     CompilerBase,
     DefaultPassBuilder,
-    Flags,
-    Option,
     CompileResult,
 )
 from numba.core.compiler_lock import global_compiler_lock
@@ -39,45 +37,11 @@
 from numba.cuda.codegen import ExternalCodeLibrary
 from numba.cuda.cudadrv import nvvm
 from numba.cuda.descriptor import cuda_target
+from numba.cuda.flags import CUDAFlags
 from numba.cuda.target import CUDACABICallConv
 from numba.cuda import lowering
 
 
-def _nvvm_options_type(x):
-    if x is None:
-        return None
-
-    else:
-        assert isinstance(x, dict)
-        return x
-
-
-def _optional_int_type(x):
-    if x is None:
-        return None
-
-    else:
-        assert isinstance(x, int)
-        return x
-
-
-class CUDAFlags(Flags):
-    nvvm_options = Option(
-        type=_nvvm_options_type,
-        default=None,
-        doc="NVVM options",
-    )
-    compute_capability = Option(
-        type=tuple,
-        default=None,
-        doc="Compute Capability",
-    )
-    max_registers = Option(
-        type=_optional_int_type, default=None, doc="Max registers"
-    )
-    lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
-
-
 # The CUDACompileResult (CCR) has a specially-defined entry point equal to its
 # id.  This is because the entry point is used as a key into a dict of
 # overloads by the base dispatcher. The id of the CCR is the only small and
diff --git a/numba_cuda/numba/cuda/flags.py b/numba_cuda/numba/cuda/flags.py
@@ -0,0 +1,36 @@
+from numba.core.compiler import Flags, Option
+
+
+def _nvvm_options_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, dict)
+        return x
+
+
+def _optional_int_type(x):
+    if x is None:
+        return None
+
+    else:
+        assert isinstance(x, int)
+        return x
+
+
+class CUDAFlags(Flags):
+    nvvm_options = Option(
+        type=_nvvm_options_type,
+        default=None,
+        doc="NVVM options",
+    )
+    compute_capability = Option(
+        type=tuple,
+        default=None,
+        doc="Compute Capability",
+    )
+    max_registers = Option(
+        type=_optional_int_type, default=None, doc="Max registers"
+    )
+    lto = Option(type=bool, default=False, doc="Enable Link-time Optimization")
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
@@ -2,9 +2,20 @@
 from functools import cached_property
 import llvmlite.binding as ll
 from llvmlite import ir
-
-from numba.core import cgutils, config, itanium_mangler, types, typing
+import warnings
+
+from numba.core import (
+    cgutils,
+    compiler,
+    config,
+    itanium_mangler,
+    targetconfig,
+    types,
+    typing,
+)
+from numba.core.compiler_lock import global_compiler_lock
 from numba.core.dispatcher import Dispatcher
+from numba.core.errors import NumbaWarning
 from numba.core.base import BaseContext
 from numba.core.callconv import BaseCallConv, MinimalCallConv
 from numba.core.typing import cmathdecl
@@ -13,6 +24,7 @@
 from .cudadrv import nvvm
 from numba.cuda import codegen, ufuncs
 from numba.cuda.debuginfo import CUDADIBuilder
+from numba.cuda.flags import CUDAFlags
 from numba.cuda.models import cuda_data_manager
 
 # -----------------------------------------------------------------------------
@@ -288,6 +300,47 @@ def optimize_function(self, func):
     def get_ufunc_info(self, ufunc_key):
         return ufuncs.get_ufunc_info(ufunc_key)
 
+    def _compile_subroutine_no_cache(
+        self, builder, impl, sig, locals=None, flags=None
+    ):
+        # Overrides numba.core.base.BaseContext._compile_subroutine_no_cache().
+        # Modified to use flags from the context stack if they are not provided
+        # (pending a fix in Numba upstream).
+
+        if locals is None:
+            locals = {}
+
+        with global_compiler_lock:
+            codegen = self.codegen()
+            library = codegen.create_library(impl.__name__)
+            if flags is None:
+                cstk = targetconfig.ConfigStack()
+                if cstk:
+                    flags = cstk.top().copy()
+                else:
+                    msg = "There should always be a context stack; none found."
+                    warnings.warn(msg, NumbaWarning)
+                    flags = CUDAFlags()
+
+            flags.no_compile = True
+            flags.no_cpython_wrapper = True
+            flags.no_cfunc_wrapper = True
+
+            cres = compiler.compile_internal(
+                self.typing_context,
+                self,
+                library,
+                impl,
+                sig.args,
+                sig.return_type,
+                flags,
+                locals=locals,
+            )
+
+            # Allow inlining the function inside callers
+            self.active_code_library.add_linking_library(cres.library)
+            return cres
+
 
 class CUDACallConv(MinimalCallConv):
     def decorate_function(self, fn, args, fe_argtypes, noalias=False):
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py b/numba_cuda/numba/cuda/tests/cudapy/test_lineinfo.py
@@ -198,6 +198,24 @@ def f():
             "debug and lineinfo are mutually exclusive", str(w[0].message)
         )
 
+    def test_lineinfo_with_compile_internal(self):
+        # Calling a function implemented using compile_internal should not
+        # enable full debug info generation. See Numba-CUDA Issue #271,
+        # https://github.com/NVIDIA/numba-cuda/issues/271
+
+        @cuda.jit("void(complex128[::1], complex128[::1])", lineinfo=True)
+        def complex_abs_use(r, x):
+            r[0] = abs(x[0])
+
+        cc = cuda.get_current_device().compute_capability
+        ov = complex_abs_use.overloads[complex_abs_use.signatures[0]]
+        ptx = ov.inspect_asm(cc)
+
+        target = ".target sm_%s%s" % cc
+        target_debug = f"{target}, debug"
+        self.assertIn(target, ptx)
+        self.assertNotIn(target_debug, ptx)
+
 
 if __name__ == "__main__":
     unittest.main()