Commit c04efe0

Authored by brandon-b-miller, gmarkall, and isVoid
Add NRT c++ functions (#17)
This PR adds device-side implementations of some of the NRT C API. It is a first step towards supporting allocations and refcounting / garbage collection on the device, and serves as a foundation to build on rather than feature completeness for any particular piece of functionality. Combined with the change to the `CUDATargetContext` object, this allows launching kernels like this:

```python
from numba import cuda
import numpy as np

@cuda.jit
def f(x):
    return x[:5]

@cuda.jit('void()', link=['nrt.cu'])
def g():
    x = cuda.shared.array(10, dtype=np.int32)
    f(x)

g[1,1]()
```

Notes on the implementation:

- Currently, no memsys is used, and it remains to be discussed how we'd like to expose it, knowing that it may have to be used outside of Numba to free objects that persist after the kernel finishes executing.
- Basic tests are added. `test_nrt.py` mainly enables NRT and tests that a refcounted variable can successfully be passed to and returned from a second function. Another test mocks up a device-side allocation and checks that allocation statistics are collected correctly; it is xfailed until stats are functional.
- NRT functions are linked when any of the NRT-specific functions are found in the jitted PTX.

---------

Co-authored-by: Graham Markall <[email protected]>
Co-authored-by: Michael Yh Wang <[email protected]>
1 parent c349038 commit c04efe0

File tree

13 files changed: +573 additions, −9 deletions

ci/test_conda.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -8,7 +8,7 @@ set -euo pipefail
 if [ "${CUDA_VER%.*.*}" = "11" ]; then
   CTK_PACKAGES="cudatoolkit"
 else
-  CTK_PACKAGES="cuda-nvcc-impl cuda-nvrtc"
+  CTK_PACKAGES="cuda-cccl cuda-nvcc-impl cuda-nvrtc"
 fi

 rapids-logger "Install testing dependencies"
```

ci/test_wheel.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -8,6 +8,7 @@ rapids-logger "Install testing dependencies"
 python -m pip install \
   psutil \
   cuda-python \
+  nvidia-cuda-cccl-cu12 \
   pytest

 rapids-logger "Install wheel"
```

numba_cuda/numba/cuda/cuda_paths.py

Lines changed: 68 additions & 0 deletions
```diff
@@ -2,9 +2,11 @@
 import re
 import os
 from collections import namedtuple
+import platform

 from numba.core.config import IS_WIN32
 from numba.misc.findlib import find_lib, find_file
+from numba import config


 _env_path_tuple = namedtuple('_env_path_tuple', ['by', 'info'])
@@ -241,6 +243,7 @@ def get_cuda_paths():
         'libdevice': _get_libdevice_paths(),
         'cudalib_dir': _get_cudalib_dir(),
         'static_cudalib_dir': _get_static_cudalib_dir(),
+        'include_dir': _get_include_dir(),
     }
     # Cache result
     get_cuda_paths._cached_result = d
@@ -256,3 +259,68 @@ def get_debian_pkg_libdevice():
     if not os.path.exists(pkg_libdevice_location):
         return None
     return pkg_libdevice_location
+
+
+def get_current_cuda_target_name():
+    """Determine conda's CTK target folder based on system and machine arch.
+
+    CTK's conda package delivers headers based on its architecture type. For example,
+    `x86_64` machine places header under `$CONDA_PREFIX/targets/x86_64-linux`, and
+    `aarch64` places under `$CONDA_PREFIX/targets/sbsa-linux`. Read more about the
+    nuances at cudart's conda feedstock:
+    https://github.com/conda-forge/cuda-cudart-feedstock/blob/main/recipe/meta.yaml#L8-L11  # noqa: E501
+    """
+    system = platform.system()
+    machine = platform.machine()
+
+    if system == "Linux":
+        arch_to_targets = {
+            'x86_64': 'x86_64-linux',
+            'aarch64': 'sbsa-linux'
+        }
+    elif system == "Windows":
+        arch_to_targets = {
+            'AMD64': 'x64',
+        }
+    else:
+        arch_to_targets = {}
+
+    return arch_to_targets.get(machine, None)
+
+
+def get_conda_include_dir():
+    """
+    Return the include directory in the current conda environment, if one
+    is active and it exists.
+    """
+    is_conda_env = os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+    if not is_conda_env:
+        return
+
+    if platform.system() == "Windows":
+        include_dir = os.path.join(
+            sys.prefix, 'Library', 'include'
+        )
+    elif target_name := get_current_cuda_target_name():
+        include_dir = os.path.join(
+            sys.prefix, 'targets', target_name, 'include'
+        )
+    else:
+        # A fallback when the target cannot be determined,
+        # though usually it shouldn't happen.
+        include_dir = os.path.join(sys.prefix, 'include')
+
+    if os.path.exists(include_dir):
+        return include_dir
+    return
+
+
+def _get_include_dir():
+    """Find the root include directory."""
+    options = [
+        ('Conda environment (NVIDIA package)', get_conda_include_dir()),
+        ('CUDA_INCLUDE_PATH Config Entry', config.CUDA_INCLUDE_PATH),
+        # TODO: add others
+    ]
+    by, include_dir = _find_valid_path(options)
+    return _env_path_tuple(by, include_dir)
```
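
The new `include_dir` entry plugs into the same lookup machinery as the existing entries, so it can be inspected the same way. A minimal sketch (the winning source and the path shown are illustrative and depend on the local environment):

```python
# Hypothetical interactive check of the new 'include_dir' entry added above.
from numba.cuda.cuda_paths import get_cuda_paths

by, info = get_cuda_paths()['include_dir']  # an _env_path_tuple(by, info)
print(by)    # e.g. "Conda environment (NVIDIA package)"
print(info)  # e.g. "/opt/conda/targets/x86_64-linux/include"
```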

numba_cuda/numba/cuda/cudadrv/libs.py

Lines changed: 38 additions & 0 deletions
```diff
@@ -18,6 +18,7 @@
 from numba.cuda.cuda_paths import get_cuda_paths
 from numba.cuda.cudadrv.driver import locate_driver_and_loader, load_driver
 from numba.cuda.cudadrv.error import CudaSupportError
+from numba.core import config


 if sys.platform == 'win32':
@@ -60,6 +61,24 @@ def get_cudalib(lib, static=False):
     return max(candidates) if candidates else namepattern % lib


+def get_cuda_include_dir():
+    """
+    Find the path to cuda include dir based on a list of default locations.
+    Note that this does not list the `CUDA_INCLUDE_PATH` entry in user
+    configuration.
+    """
+
+    return get_cuda_paths()['include_dir'].info
+
+
+def check_cuda_include_dir(path):
+    if path is None or not os.path.exists(path):
+        raise FileNotFoundError(f"{path} not found")
+
+    if not os.path.exists(os.path.join(path, "cuda_runtime.h")):
+        raise FileNotFoundError(f"Unable to find cuda_runtime.h from {path}")
+
+
 def open_cudalib(lib):
     path = get_cudalib(lib)
     return ctypes.CDLL(path)
@@ -75,6 +94,8 @@ def _get_source_variable(lib, static=False):
         return get_cuda_paths()['nvvm'].by
     elif lib == 'libdevice':
         return get_cuda_paths()['libdevice'].by
+    elif lib == 'include_dir':
+        return get_cuda_paths()['include_dir'].by
     else:
         dir_type = 'static_cudalib_dir' if static else 'cudalib_dir'
         return get_cuda_paths()[dir_type].by
@@ -173,4 +194,21 @@ def test():
             print('\tERROR: failed to find %s:\n%s' % (lib, e))
             failed = True

+    # Check cuda include paths
+
+    print("Include directory configuration variable:")
+    print(f"\tCUDA_INCLUDE_PATH={config.CUDA_INCLUDE_PATH}")
+
+    where = _get_source_variable('include_dir')
+    print(f'Finding include directory from {where}')
+    include = get_cuda_include_dir()
+    print('\tLocated at', include)
+    try:
+        print('\tChecking include directory', end='...')
+        check_cuda_include_dir(include)
+        print('\tok')
+    except FileNotFoundError as e:
+        print('\tERROR: failed to find cuda include directory:\n%s' % e)
+        failed = True
+
     return not failed
```
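
The extra checks slot into the existing `test()` routine, so the include directory can be verified alongside the CUDA libraries. A minimal sketch of running it (output varies with the local CUDA installation):

```python
# Runs the detection checks, including the new include-directory check.
from numba.cuda.cudadrv import libs

ok = libs.test()
print("all checks passed:", ok)
```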

numba_cuda/numba/cuda/cudadrv/nvrtc.py

Lines changed: 9 additions & 4 deletions
```diff
@@ -1,9 +1,8 @@
 from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
 from enum import IntEnum
-from numba.core import config
 from numba.cuda.cudadrv.error import (NvrtcError, NvrtcCompilationError,
                                       NvrtcSupportError)
-
+from numba.cuda.cuda_paths import get_cuda_paths
 import functools
 import os
 import threading
@@ -233,12 +232,18 @@ def compile(src, name, cc):
     # being optimized away.
     major, minor = cc
     arch = f'--gpu-architecture=compute_{major}{minor}'
-    include = f'-I{config.CUDA_INCLUDE_PATH}'
+
+    cuda_include = [
+        f"-I{get_cuda_paths()['include_dir'].info}",
+    ]

     cudadrv_path = os.path.dirname(os.path.abspath(__file__))
     numba_cuda_path = os.path.dirname(cudadrv_path)
     numba_include = f'-I{numba_cuda_path}'
-    options = [arch, include, numba_include, '-rdc', 'true']
+    options = [arch, *cuda_include, numba_include, '-rdc', 'true']
+
+    if nvrtc.get_version() < (12, 0):
+        options += ["-std=c++17"]

     # Compile the program
     compile_error = nvrtc.compile_program(program, options)
```
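
To make the option change concrete, here is a rough sketch of the list `compile()` would assemble for compute capability (8, 0) under these changes; both `-I` paths are example values, and `-std=c++17` appears only when the NVRTC version is below 12.0:

```python
# Illustrative only: the first -I entry comes from get_cuda_paths()['include_dir'],
# the second points at the numba_cuda package directory.
options = [
    "--gpu-architecture=compute_80",
    "-I/opt/conda/targets/x86_64-linux/include",  # cuda_include (example path)
    "-I/path/to/numba_cuda/numba/cuda",           # numba_include (example path)
    "-rdc", "true",
    "-std=c++17",                                 # only when NVRTC < 12.0
]
```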

numba_cuda/numba/cuda/dispatcher.py

Lines changed: 41 additions & 3 deletions
```diff
@@ -1,5 +1,6 @@
 import numpy as np
 import os
+import re
 import sys
 import ctypes
 import functools
@@ -43,6 +44,21 @@ class _Kernel(serialize.ReduceMixin):
     object launches the kernel on the device.
     '''

+    NRT_functions = [
+        "NRT_Allocate",
+        "NRT_MemInfo_init",
+        "NRT_MemInfo_new",
+        "NRT_Free",
+        "NRT_dealloc",
+        "NRT_MemInfo_destroy",
+        "NRT_MemInfo_call_dtor",
+        "NRT_MemInfo_data_fast",
+        "NRT_MemInfo_alloc_aligned",
+        "NRT_Allocate_External",
+        "NRT_decref",
+        "NRT_incref"
+    ]
+
     @global_compiler_lock
     def __init__(self, py_func, argtypes, link=None, debug=False,
                  lineinfo=False, inline=False, fastmath=False, extensions=None,
@@ -105,16 +121,20 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         if self.cooperative:
             lib.needs_cudadevrt = True

+        basedir = os.path.dirname(os.path.abspath(__file__))
+        asm = lib.get_asm_str()
+
         res = [fn for fn in cuda_fp16_math_funcs
-               if (f'__numba_wrapper_{fn}' in lib.get_asm_str())]
+               if (f'__numba_wrapper_{fn}' in asm)]

         if res:
             # Path to the source containing the foreign function
-            basedir = os.path.dirname(os.path.abspath(__file__))
             functions_cu_path = os.path.join(basedir,
                                              'cpp_function_wrappers.cu')
             link.append(functions_cu_path)

+        link = self.maybe_link_nrt(link, tgt_ctx, asm)
+
         for filepath in link:
             lib.add_linking_file(filepath)

@@ -136,6 +156,25 @@ def __init__(self, py_func, argtypes, link=None, debug=False,
         self.lifted = []
         self.reload_init = []

+    def maybe_link_nrt(self, link, tgt_ctx, asm):
+        if not tgt_ctx.enable_nrt:
+            return link
+
+        all_nrt = "|".join(self.NRT_functions)
+        pattern = (
+            r'\.extern\s+\.func\s+(?:\s*\(.+\)\s*)?('
+            + all_nrt + r')\s*\([^)]*\)\s*;'
+        )
+
+        nrt_in_asm = re.findall(pattern, asm)
+
+        basedir = os.path.dirname(os.path.abspath(__file__))
+        if nrt_in_asm:
+            nrt_path = os.path.join(basedir, 'runtime', 'nrt.cu')
+            link.append(nrt_path)
+
+        return link
+
     @property
     def library(self):
         return self._codelibrary
@@ -385,7 +424,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):

         if isinstance(ty, types.Array):
             devary = wrap_arg(val).to_device(retr, stream)
-
             c_intp = ctypes.c_ssize_t

             meminfo = ctypes.c_void_p(0)
```