Support linking code for device functions in declaration (#124)

gmarkall · web-flow · commit a58f3fc28cdc · 2025-02-18T16:43:41.000Z
Allows specifying files to link in the `cuda.declare_device()` declaration, so that the user is no longer required to know which files to link. Changes consist of: - Adding the `link` kwarg to the `declare_device` function, and automatically linking in any linkable items when the declared function is used. - Updating the documentation to describe this mechanism, and to reflect that it's the recommended way to specify what to link. - Documenting the `LinkableCode` classes, which were previously undocumented. - Removing some obsolete notices about needing the NVIDIA bindings for linking C/C++ code. - Adding cffi to the test environment, as it's used by one of the new tests (it should have already been present, really). I decided not to tackle #67 in its entirety, which also requests that a callback function can be used to generate the implementation, for a couple of reasons: - I think the existing implementation is of immediate value for Numbast, and all other FFI-calling implementations. - There is some thought needed about how to handle typing when a callback function is used - for example, whether it's necessary to generalize the typing beyond just the single signature that `declare_device()` presently accepts.
diff --git a/ci/test_conda.sh b/ci/test_conda.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc"
+  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc libcurand-dev"
 fi
 
 rapids-logger "Install testing dependencies"
@@ -22,6 +22,7 @@ rapids-mamba-retry create -n test \
     make \
     psutil \
     pytest \
+    cffi \
     python=${RAPIDS_PY_VERSION}
 
 # Temporarily allow unbound variables for conda activation.
diff --git a/ci/test_conda_pynvjitlink.sh b/ci/test_conda_pynvjitlink.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc cuda-cuobjdump"
+  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc cuda-cuobjdump libcurand-dev"
 fi
 
 rapids-logger "Install testing dependencies"
@@ -22,6 +22,7 @@ rapids-mamba-retry create -n test \
     make \
     psutil \
     pytest \
+    cffi \
     python=${RAPIDS_PY_VERSION}
 
 # Temporarily allow unbound variables for conda activation.
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
@@ -7,8 +7,10 @@ rapids-logger "Install testing dependencies"
 # TODO: Replace with rapids-dependency-file-generator
 python -m pip install \
     psutil \
+    cffi \
     cuda-python \
     nvidia-cuda-cccl-cu12 \
+    nvidia-curand-cu12 \
     pytest
 
 rapids-logger "Install wheel"
diff --git a/ci/test_wheel_pynvjitlink.sh b/ci/test_wheel_pynvjitlink.sh
@@ -7,7 +7,9 @@ rapids-logger "Install testing dependencies"
 # TODO: Replace with rapids-dependency-file-generator
 python -m pip install \
     psutil \
+    cffi \
     cuda-python \
+    nvidia-curand-cu12 \
     pytest
 
 rapids-logger "Install pynvjitlink"
diff --git a/docs/source/user/cuda_ffi.rst b/docs/source/user/cuda_ffi.rst
@@ -11,7 +11,7 @@ of a Python kernel call to a foreign device function are:
 
 - The device function implementation in a foreign language (e.g. CUDA C).
 - A declaration of the device function in Python.
-- A kernel that links with and calls the foreign function.
+- A kernel that calls the foreign function.
 
 .. _device-function-abi:
 
@@ -83,7 +83,7 @@ For example, when:
 
 .. code::
 
-   mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)')
+   mul = cuda.declare_device('mul_f32_f32', 'float32(float32, float32)', link="functions.cu")
 
 is declared, calling ``mul(a, b)`` inside a kernel will translate into a call to
 ``mul_f32_f32(a, b)`` in the compiled code.
@@ -134,15 +134,63 @@ where ``result`` and ``array`` are both arrays of ``float32`` data.
 Linking and Calling functions
 -----------------------------
 
-The ``link`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
-decorator accepts a list of file names specified by absolute path or a path
-relative to the current working directory. Files whose name ends in ``.cu``
-will be compiled with the `NVIDIA Runtime Compiler (NVRTC)
-<https://docs.nvidia.com/cuda/nvrtc/index.html>`_ and linked into the kernel as
-PTX; other files will be passed directly to the CUDA Linker.
+The ``link`` keyword argument to the :func:`declare_device
+<numba.cuda.declare_device>` function accepts *Linkable Code* items. Either a
+single Linkable Code item can be passed, or multiple items in a list, tuple, or
+set.
+
+A Linkable Code item is either:
+
+* A string indicating the location of a file in the filesystem, or
+* A :class:`LinkableCode <numba.cuda.LinkableCode>` object, for linking code
+  that exists in memory.
+
+Supported code formats that can be linked are:
+
+* PTX source code (``*.ptx``)
+* CUDA C/C++ source code (``*.cu``)
+* CUDA ELF Fat Binaries (``*.fatbin``)
+* CUDA ELF Cubins (``*.cubin``)
+* CUDA ELF archives (``*.a``)
+* CUDA Object files (``*.o``)
+* CUDA LTOIR files (``*.ltoir``)
+
+CUDA C/C++ source code will be compiled with the `NVIDIA Runtime Compiler
+(NVRTC) <https://docs.nvidia.com/cuda/nvrtc/index.html>`_ and linked into the
+kernel as either PTX or LTOIR, depending on whether LTO is enabled. Other files
+will be passed directly to the CUDA Linker.
+
+:class:`LinkableCode <numba.cuda.LinkableCode>` objects are initialized using
+the parameters of their base class:
 
-For example, the following kernel calls the ``mul()`` function declared above
-with the implementation ``mul_f32_f32()`` in a file called ``functions.cu``:
+.. autoclass:: numba.cuda.LinkableCode
+
+However, one should instantiate the subclass that represents the type of item
+being linked, rather than the base class itself:
+
+.. autoclass:: numba.cuda.PTXSource
+.. autoclass:: numba.cuda.CUSource
+.. autoclass:: numba.cuda.Fatbin
+.. autoclass:: numba.cuda.Cubin
+.. autoclass:: numba.cuda.Archive
+.. autoclass:: numba.cuda.Object
+.. autoclass:: numba.cuda.LTOIR
+
+Legacy ``@cuda.jit`` decorator ``link`` support
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``link`` keyword argument of the :func:`@cuda.jit <numba.cuda.jit>`
+decorator also accepts a list of Linkable Code items, which will then be linked
+into the kernel. This facility is provided for backwards compatibility; it is
+recommended that Linkable Code items are always specified in the
+:func:`declare_device <numba.cuda.declare_device>` call, so that the user of the
+declared API is not burdened with specifying the items to link themselves when
+writing a kernel.
+
+As an example of how this legacy mechanism looked at the point of use: the
+following kernel calls the ``mul()`` function declared above with the
+implementation ``mul_f32_f32()`` as if it were in a file called ``functions.cu``
+that had not been declared as part of the ``link`` argument in the declaration:
 
 .. code::
 
@@ -153,17 +201,13 @@ with the implementation ``mul_f32_f32()`` in a file called ``functions.cu``:
        if i < len(r):
            r[i] = mul(x[i], y[i])
 
-
 C/C++ Support
 -------------
 
 Support for compiling and linking of CUDA C/C++ code is provided through the use
 of NVRTC subject to the following considerations:
 
-- It is only available when using the NVIDIA Bindings. See
-  :envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING`.
-- A suitable version of the NVRTC library for the installed version of the
-  NVIDIA CUDA Bindings must be available.
+- A suitable version of the NVRTC library must be available.
 - The CUDA include path is assumed by default to be ``/usr/local/cuda/include``
   on Linux and ``$env:CUDA_PATH\include`` on Windows. It can be modified using
   the environment variable :envvar:`NUMBA_CUDA_INCLUDE_PATH`.
diff --git a/numba_cuda/numba/cuda/compiler.py b/numba_cuda/numba/cuda/compiler.py
@@ -570,16 +570,16 @@ def compile_ptx_for_current_device(pyfunc, sig, debug=None, lineinfo=False,
                        abi=abi, abi_info=abi_info)
 
 
-def declare_device_function(name, restype, argtypes):
-    return declare_device_function_template(name, restype, argtypes).key
+def declare_device_function(name, restype, argtypes, link):
+    return declare_device_function_template(name, restype, argtypes, link).key
 
 
-def declare_device_function_template(name, restype, argtypes):
+def declare_device_function_template(name, restype, argtypes, link):
     from .descriptor import cuda_target
     typingctx = cuda_target.typing_context
     targetctx = cuda_target.target_context
     sig = typing.signature(restype, *argtypes)
-    extfn = ExternFunction(name, sig)
+    extfn = ExternFunction(name, sig, link)
 
     class device_function_template(ConcreteTemplate):
         key = extfn
@@ -593,7 +593,8 @@ class device_function_template(ConcreteTemplate):
     return device_function_template
 
 
-class ExternFunction(object):
-    def __init__(self, name, sig):
+class ExternFunction:
+    def __init__(self, name, sig, link):
         self.name = name
         self.sig = sig
+        self.link = link
diff --git a/numba_cuda/numba/cuda/cudadecl.py b/numba_cuda/numba/cuda/cudadecl.py
@@ -403,16 +403,20 @@ def _genfp16_binary_operator(op):
 
 
 def _resolve_wrapped_unary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16,))
+                                            (types.float16,),
+                                            link)
     return types.Function(decl)
 
 
 def _resolve_wrapped_binary(fname):
+    link = tuple()
     decl = declare_device_function_template(f'__numba_wrapper_{fname}',
                                             types.float16,
-                                            (types.float16, types.float16,))
+                                            (types.float16, types.float16,),
+                                            link)
     return types.Function(decl)
 
 
diff --git a/numba_cuda/numba/cuda/cudadrv/linkable_code.py b/numba_cuda/numba/cuda/cudadrv/linkable_code.py
@@ -2,8 +2,12 @@
 
 
 class LinkableCode:
-    """An object that can be passed in the `link` list argument to `@cuda.jit`
-    kernels to supply code to be linked from memory."""
+    """An object that holds code to be linked from memory.
+
+    :param data: A buffer containing the data to link.
+    :param name: The name of the file to be referenced in any compilation or
+                 linking errors that may be produced.
+    """
 
     def __init__(self, data, name=None):
         self.data = data
@@ -15,49 +19,49 @@ def name(self):
 
 
 class PTXSource(LinkableCode):
-    """PTX Source code in memory"""
+    """PTX source code in memory."""
 
     kind = FILE_EXTENSION_MAP["ptx"]
     default_name = "<unnamed-ptx>"
 
 
 class CUSource(LinkableCode):
-    """CUDA C/C++ Source code in memory"""
+    """CUDA C/C++ source code in memory."""
 
     kind = "cu"
     default_name = "<unnamed-cu>"
 
 
 class Fatbin(LinkableCode):
-    """A fatbin ELF in memory"""
+    """An ELF Fatbin in memory."""
 
     kind = FILE_EXTENSION_MAP["fatbin"]
     default_name = "<unnamed-fatbin>"
 
 
 class Cubin(LinkableCode):
-    """A cubin ELF in memory"""
+    """An ELF Cubin in memory."""
 
     kind = FILE_EXTENSION_MAP["cubin"]
     default_name = "<unnamed-cubin>"
 
 
 class Archive(LinkableCode):
-    """An archive of objects in memory"""
+    """An archive of objects in memory."""
 
     kind = FILE_EXTENSION_MAP["a"]
     default_name = "<unnamed-archive>"
 
 
 class Object(LinkableCode):
-    """An object file in memory"""
+    """An object file in memory."""
 
     kind = FILE_EXTENSION_MAP["o"]
     default_name = "<unnamed-object>"
 
 
 class LTOIR(LinkableCode):
-    """An LTOIR file in memory"""
+    """An LTOIR file in memory."""
 
     kind = "ltoir"
     default_name = "<unnamed-ltoir>"
diff --git a/numba_cuda/numba/cuda/decorators.py b/numba_cuda/numba/cuda/decorators.py
@@ -173,18 +173,25 @@ def autojitwrapper(func):
                 return disp
 
 
-def declare_device(name, sig):
+def declare_device(name, sig, link=None):
     """
     Declare the signature of a foreign function. Returns a descriptor that can
     be used to call the function from a Python kernel.
 
     :param name: The name of the foreign function.
     :type name: str
     :param sig: The Numba signature of the function.
+    :param link: External code to link when calling the function.
     """
+    if link is None:
+        link = tuple()
+    else:
+        if not isinstance(link, (list, tuple, set)):
+            link = (link,)
+
     argtypes, restype = sigutils.normalize_signature(sig)
     if restype is None:
         msg = 'Return type must be provided for device declarations'
         raise TypeError(msg)
 
-    return declare_device_function(name, restype, argtypes)
+    return declare_device_function(name, restype, argtypes, link)
diff --git a/numba_cuda/numba/cuda/dispatcher.py b/numba_cuda/numba/cuda/dispatcher.py
@@ -11,10 +11,11 @@
 from numba.core.dispatcher import Dispatcher
 from numba.core.errors import NumbaPerformanceWarning
 from numba.core.typing.typeof import Purpose, typeof
-
+from numba.core.types.functions import Function
 from numba.cuda.api import get_current_device
 from numba.cuda.args import wrap_arg
-from numba.cuda.compiler import compile_cuda, CUDACompiler, kernel_fixup
+from numba.cuda.compiler import (compile_cuda, CUDACompiler, kernel_fixup,
+                                 ExternFunction)
 from numba.cuda.cudadrv import driver
 from numba.cuda.cudadrv.devices import get_context
 from numba.cuda.descriptor import cuda_target
@@ -158,6 +159,16 @@ def link_to_library_functions(library_functions, library_path,
 
         self.maybe_link_nrt(link, tgt_ctx, asm)
 
+        for k, v in cres.fndesc.typemap.items():
+            if not isinstance(v, Function):
+                continue
+
+            if not isinstance(v.typing_key, ExternFunction):
+                continue
+
+            for obj in v.typing_key.link:
+                lib.add_linking_file(obj)
+
         for filepath in link:
             lib.add_linking_file(filepath)
 
diff --git a/numba_cuda/numba/cuda/testing.py b/numba_cuda/numba/cuda/testing.py
@@ -115,12 +115,22 @@ def skip_on_arm(reason):
 def skip_if_cuda_includes_missing(fn):
     # Skip when cuda.h is not available - generally this should indicate
     # whether the CUDA includes are available or not
-    cuda_h = os.path.join(config.CUDA_INCLUDE_PATH, 'cuda.h')
+    cuda_include_path = libs.get_cuda_include_dir()
+    cuda_h = os.path.join(cuda_include_path, 'cuda.h')
     cuda_h_file = (os.path.exists(cuda_h) and os.path.isfile(cuda_h))
     reason = 'CUDA include dir not available on this system'
     return unittest.skipUnless(cuda_h_file, reason)(fn)
 
 
+def skip_if_curand_kernel_missing(fn):
+    cuda_include_path = libs.get_cuda_include_dir()
+    curand_kernel_h = os.path.join(cuda_include_path, 'curand_kernel.h')
+    curand_kernel_h_file = (os.path.exists(curand_kernel_h) and
+                            os.path.isfile(curand_kernel_h))
+    reason = 'curand_kernel.h not available on this system'
+    return unittest.skipUnless(curand_kernel_h_file, reason)(fn)
+
+
 def skip_if_mvc_enabled(reason):
     """Skip a test if Minor Version Compatibility is enabled"""
     return unittest.skipIf(config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY,
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py b/numba_cuda/numba/cuda/tests/cudapy/test_device_func.py
diff --git a/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py b/numba_cuda/numba/cuda/tests/doc_examples/test_ffi.py