Commit c04efe0

Authored by brandon-b-miller, gmarkall, and isVoid
Add NRT c++ functions (#17)
This PR adds device-side implementations of some of the NRT C API. It is a first step towards supporting allocations and refcounting / garbage collection on the device, and serves as a foundation to build on rather than feature completeness for any particular piece of functionality. Combined with the change to the `CUDATargetContext` object, this allows launching kernels like this:

```python
from numba import cuda
import numpy as np

@cuda.jit
def f(x):
    return x[:5]

@cuda.jit('void()', link=['nrt.cu'])
def g():
    x = cuda.shared.array(10, dtype=np.int32)
    f(x)

g[1,1]()
```

Notes on the implementation:

- Currently, no memsys is used, and it remains to be discussed how we'd like to expose it, knowing that it may have to be used outside of Numba to free objects that persist after the kernel finishes executing.
- Basic tests are added. `test_nrt.py` mainly enables NRT and tests that a refcounted variable can successfully be passed to and returned from a second function. Another test mocks up a device-side allocation and checks that allocation statistics are collected correctly; it is xfailed until stats are functional.
- NRT functions are linked when any of the NRT-specific functions are found in the jitted PTX.

---------

Co-authored-by: Graham Markall <[email protected]>
Co-authored-by: Michael Yh Wang <[email protected]>
1 parent c349038 commit c04efe0

File tree

13 files changed: +573 additions, −9 deletions

ci/test_conda.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc"
+  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc"
 fi

 rapids-logger "Install testing dependencies"
```

ci/test_wheel.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -8,6 +8,7 @@ rapids-logger "Install testing dependencies"
 python -m pip install \
   psutil \
   cuda-python \
+  nvidia-cuda-cccl-cu12 \
   pytest

 rapids-logger "Install wheel"
```

numba_cuda/numba/cuda/cuda_paths.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -2,9 +2,11 @@
 import re
 import os
 from collections import namedtuple
+import platform

 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config


 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
         'libdevice': _get_libdevice_paths(),
         'cudalib_dir': _get_cudalib_dir(),
         'static_cudalib_dir': _get_static_cudalib_dir(),
+        'include_dir': _get_include_dir(),
     }
     # Cache result
     get_cuda_paths._cached_result = d
@@ -256,3 +259,68 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # A fallback when the target cannot be determined,
+        # though usually it shouldn't happen.
+        include_dir = os.path.join(sys.prefix, 'include')
+
+    if os.path.exists(include_dir):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
```
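
The new `include_dir` entry plugs into the same lookup machinery as the existing entries, so it can be inspected the same way. A minimal sketch (the winning source and the path shown are illustrative and depend on the local environment):

```python
# Hypothetical interactive check of the new 'include_dir' entry added above.
from numba.cuda.cuda_paths import get_cuda_paths

by, info = get_cuda_paths()['include_dir']  # an _env_path_tuple(by, info)
print(by)    # e.g. "Conda environment (NVIDIA package)"
print(info)  # e.g. "/opt/conda/targets/x86_64-linux/include"
```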

numba_cuda/numba/cuda/cudadrv/libs.py

Lines changed: 38 additions & 0 deletions
```diff
@@ -18,6 +18,7 @@
 from numba.cuda.cuda_paths import get_cuda_paths
 from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
 from numba.cuda.cudadrv.error import CudaSupportError
+from numba.core import config


 if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
     return max(candidates) if candidates else namepattern % lib


+def get_cuda_include_dir():
+    """
+    Find the path to cuda include dir based on a list of default locations.
+    Note that this does not list the `CUDA_INCLUDE_PATH` entry in user
+    configuration.
+    """
+
+    return get_cuda_paths()['include_dir'].info
+
+
+def check_cuda_include_dir(path):
+    if path is None or not os.path.exists(path):
+        raise FileNotFoundError(f"{path} not found")
+
+    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
+        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
+
+
 def open_cudalib(lib):
     path = get_cudalib(lib)
     return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
         return get_cuda_paths()['nvvm'].by
     elif lib == 'libdevice':
         return get_cuda_paths()['libdevice'].by
+    elif lib == 'include_dir':
+        return get_cuda_paths()['include_dir'].by
     else:
         dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
         return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
             print('\tERROR: failed to find %s:\n%s' % (lib, e))
             failed = True

+    # Check cuda include paths
+
+    print("Include directory configuration variable:")
+    print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
+
+    where = _get_source_variable('include_dir')
+    print(f'Finding include directory from {where}')
+    include = get_cuda_include_dir()
+    print('\tLocated at', include)
+    try:
+        print('\tChecking include directory', end='...')
+        check_cuda_include_dir(include)
+        print('\tok')
+    except FileNotFoundError as e:
+        print('\tERROR: failed to find cuda include directory:\n%s' % e)
+        failed = True
+
     return not failed
```
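
The extra checks slot into the existing `test()` routine, so the include directory can be verified alongside the CUDA libraries. A minimal sketch of running it (output varies with the local CUDA installation):

```python
# Runs the detection checks, including the new include-directory check.
from numba.cuda.cudadrv import libs

ok = libs.test()
print("all checks passed:", ok)
```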

numba_cuda/numba/cuda/cudadrv/nvrtc.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -1,9 +1,8 @@
 from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
 from enum import IntEnum
-from numba.core import config
 from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                       NvrtcSupportError)
-
+from numba.cuda.cuda_paths import get_cuda_paths
 import functools
 import os
 import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
     # being optimized away.
     major, minor = cc
     arch = f'--gpu-architecture=compute_{major}{minor}'
-    include = f'-I{config.CUDA_INCLUDE_PATH}'
+
+    cuda_include = [
+        f"-I{get_cuda_paths()['include_dir'].info}",
+    ]

     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, include, numba_include, '-rdc', 'true']
+    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    if nvrtc.get_version() < (12, 0):
+        options += ["-std=c++17"]

     # Compile the program
     compile_error = nvrtc.compile_program(program, options)
```
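
To make the option change concrete, here is a rough sketch of the list `compile()` would assemble for compute capability (8, 0) under these changes; both `-I` paths are example values, and `-std=c++17` appears only when the NVRTC version is below 12.0:

```python
# Illustrative only: the first -I entry comes from get_cuda_paths()['include_dir'],
# the second points at the numba_cuda package directory.
options = [
    "--gpu-architecture=compute_80",
    "-I/opt/conda/targets/x86_64-linux/include",  # cuda_include (example path)
    "-I/path/to/numba_cuda/numba/cuda",           # numba_include (example path)
    "-rdc", "true",
    "-std=c++17",                                 # only when NVRTC < 12.0
]
```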

numba_cuda/numba/cuda/dispatcher.py

Lines changed: 41 additions & 3 deletions
```diff
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''

+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
+
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -105,16 +121,20 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         if self.cooperative:
             lib.needs_cudadevrt = True

+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
+
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+               if (f'__numba_wrapper_{fn}' in asm)]

         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)

+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -136,6 +156,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         self.lifted = []
         self.reload_init = []

+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+
+        nrt_in_asm = re.findall(pattern, asm)
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+
+        return link
+
     @property
     def library(self):
         return self._codelibrary
@@ -385,7 +424,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):

         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-
             c_intp = ctypes.c_ssize_t

             meminfo = ctypes.c_void_p(0)
```