Skip to content

Commit 8095d5e

Browse files
committed
Merge 'main' into vk/types
2 parents c4e6c4a + a194f90 commit 8095d5e

17 files changed

+535
-156
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ testing/*.ptx
1919
.pixi/*
2020
!.pixi/config.toml
2121
*.log
22+
.benchmarks

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ cd testing
4040
make -j $(nproc)
4141
export NUMBA_CUDA_TEST_BIN_DIR=`pwd`
4242
# Execute tests
43-
pytest -n auto -v
43+
pytest -n auto -v --dist loadscope
4444
```
4545

4646
Alternatively, you can use [pixi](https://pixi.sh/latest/installation/) to wrap all of that up for you:
4747

4848
```
4949
# run tests against CUDA 13
50-
pixi run -e cu13 test -n auto -v
50+
pixi run -e cu13 test -n auto -v --dist loadscope
5151
```
5252

5353

ci/test_conda.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ DEPENDENCIES=(
3535
"psutil"
3636
"pytest"
3737
"pytest-xdist"
38+
"pytest-benchmark"
3839
"cffi"
3940
"ml_dtypes"
4041
"python=${RAPIDS_PY_VERSION}"

ci/test_simulator.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ DEPENDENCIES=(
1212
"psutil"
1313
"pytest"
1414
"pytest-xdist"
15+
"pytest-benchmark"
1516
"cffi"
1617
"ml_dtypes"
1718
"python=${RAPIDS_PY_VERSION}"

numba_cuda/numba/cuda/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,5 @@
6464
"sys.setdlopenflags() to disable RTLD_GLOBAL "
6565
"if you encounter symbol conflicts."
6666
)
67+
68+
from numba.cuda.np.ufunc import vectorize, guvectorize
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: BSD-2-Clause
3+
4+
from numba.cuda.np.ufunc.decorators import vectorize, guvectorize # noqa: F401
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: BSD-2-Clause
3+
4+
from numba.core.registry import DelayedRegistry
5+
from numba.cuda.vectorizers import CUDAVectorize, CUDAGUFuncVectorize
6+
7+
8+
class _BaseVectorize(object):
9+
@classmethod
10+
def get_identity(cls, kwargs):
11+
return kwargs.pop("identity", None)
12+
13+
@classmethod
14+
def get_cache(cls, kwargs):
15+
return kwargs.pop("cache", False)
16+
17+
@classmethod
18+
def get_writable_args(cls, kwargs):
19+
return kwargs.pop("writable_args", ())
20+
21+
@classmethod
22+
def get_target_implementation(cls, kwargs):
23+
target = kwargs.pop("target", "cpu")
24+
try:
25+
return cls.target_registry[target]
26+
except KeyError:
27+
raise ValueError("Unsupported target: %s" % target)
28+
29+
30+
class Vectorize(_BaseVectorize):
    """Factory that hands construction over to the builder for a target.

    Calling ``Vectorize(func, **kws)`` does not produce a ``Vectorize``
    instance; ``__new__`` returns whatever the selected implementation
    returns.
    """

    target_registry = DelayedRegistry({"cuda": CUDAVectorize})

    def __new__(cls, func, **kws):
        # The helpers pop "identity", "cache" and "target" out of ``kws``;
        # the remainder is passed through as the target options.
        extracted = {
            "identity": cls.get_identity(kws),
            "cache": cls.get_cache(kws),
        }
        builder_cls = cls.get_target_implementation(kws)
        return builder_cls(func, targetoptions=kws, **extracted)
38+
39+
40+
class GUVectorize(_BaseVectorize):
    """Factory that hands gufunc construction over to a target's builder.

    ``GUVectorize(func, signature, **kws)`` returns the object produced by
    the selected implementation rather than a ``GUVectorize`` instance.
    """

    target_registry = DelayedRegistry({"cuda": CUDAGUFuncVectorize})

    def __new__(cls, func, signature, **kws):
        # Pop the recognised options first; what is left in ``kws`` is
        # forwarded as the target options.
        identity_value = cls.get_identity(kws)
        use_cache = cls.get_cache(kws)
        builder_cls = cls.get_target_implementation(kws)
        writable = cls.get_writable_args(kws)

        builder_kwargs = dict(
            identity=identity_value,
            cache=use_cache,
            targetoptions=kws,
            writable_args=writable,
        )
        return builder_cls(func, signature, **builder_kwargs)
56+
57+
58+
def vectorize(ftylist_or_function=(), target="cuda", **kws):
    """vectorize(ftylist_or_function=(), target='cuda', identity=None, **kws)

    Return a decorator that builds a ufunc from a Python function using the
    builder registered for ``target``.

    The decorator creates a ``Vectorize`` builder for the function, calls
    ``add`` once per signature, calls ``disable_compile`` when at least one
    signature was supplied, and returns the result of ``build_ufunc()``.

    Args
    ----
    ftylist_or_function: iterable, str, or None
        An iterable of type signatures, each either a function type object
        or a string describing the function type. A single string is
        accepted and treated as a one-element list (common user mistake).
        ``None`` is treated the same as an empty iterable.

        NOTE(review): passing the function itself (bare ``@vectorize``) is
        not special-cased here, so the name ends up bound to the inner
        decorator instead of a ufunc. Confirm whether signature-less
        (call-time) compilation is meant to be supported for this target
        before adding that path.

    Keyword Args
    ------------
    target: str
        A string for code generation target. Defaults to "cuda".

    identity: int, str, or None
        The identity (or unit) value for the element-wise function
        being implemented. Allowed values are None (the default), 0, 1,
        and "reorderable".

    cache: bool
        Turns on caching.

    Any other keyword arguments are forwarded to the builder.

    Returns
    -------
    A decorator taking the function to vectorize and returning the built
    ufunc object.

    Examples
    --------
        @vectorize(['float32(float32, float32)',
                    'float64(float64, float64)'], identity=0)
        def sum(a, b):
            return a + b

        @vectorize('float32(float32, float32)', identity=1)
        def mul(a, b):
            return a * b

    """
    if isinstance(ftylist_or_function, str):
        # Common user mistake
        ftylist = [ftylist_or_function]
    elif ftylist_or_function is None:
        # Previously left ``ftylist`` unbound, so the decorator failed with
        # a NameError as soon as it was applied.
        ftylist = ()
    else:
        ftylist = ftylist_or_function

    def wrap(func):
        # ``target`` is a named parameter, so it can never also appear in
        # ``kws``; passing it explicitly avoids mutating the shared dict.
        vec = Vectorize(func, target=target, **kws)
        for sig in ftylist:
            vec.add(sig)
        if len(ftylist) > 0:
            vec.disable_compile()
        return vec.build_ufunc()

    return wrap
130+
131+
132+
def guvectorize(*args, **kwargs):
    """guvectorize(ftylist, signature, target='cuda', identity=None, **kws)

    Return a decorator that builds a generalized ufunc from a Python
    function using the builder registered for the chosen target.

    Positional arguments
    --------------------
    Called with two positionals, they are ``(ftylist, signature)``. Called
    with only one, it is taken as ``signature``, the signature list is
    empty, and ``is_dynamic=True`` is added to the keyword options unless
    the caller already set ``is_dynamic``.

    ftylist: iterable
        An iterable of type signatures, which are either
        function type object or a string describing the
        function type. A lone string is wrapped in a list.

    signature: str
        A NumPy generalized-ufunc signature.
        e.g. "(m, n), (n, p)->(m, p)"

    Keyword arguments
    -----------------
    identity: int, str, or None
        The identity (or unit) value for the element-wise function
        being implemented. Allowed values are None (the default), 0, 1,
        and "reorderable".

    cache: bool
        Turns on caching.

    writable_args: tuple
        a tuple of indices of input variables that are writable.

    target: str
        A string for code generation target. Defaults to "cuda".

    Returns
    -------
    A decorator; applied to a function it returns the result of the
    builder's ``build_ufunc()``.

    Raises
    ------
    TypeError
        If the number of positional arguments is not one or two.

    Example
    -------
        @guvectorize(['void(int32[:,:], int32[:,:], int32[:,:])',
                      'void(float32[:,:], float32[:,:], float32[:,:])'],
                      '(x, y),(x, y)->(x, y)')
        def add_2d_array(a, b, c):
            for i in range(c.shape[0]):
                for j in range(c.shape[1]):
                    c[i, j] = a[i, j] + b[i, j]

    """
    positional_count = len(args)
    if positional_count not in (1, 2):
        raise TypeError("guvectorize() takes one or two positional arguments")

    if positional_count == 2:
        ftylist, signature = args
    else:
        # Signature only: no explicit type list, mark the build as dynamic.
        (signature,) = args
        ftylist = []
        kwargs.setdefault("is_dynamic", True)

    # Common user mistake: a single signature string instead of a list.
    ftylist = [ftylist] if isinstance(ftylist, str) else ftylist

    kwargs.setdefault("target", "cuda")

    def wrap(func):
        builder = GUVectorize(func, signature, **kwargs)
        for function_type in ftylist:
            builder.add(function_type)
        if len(ftylist) > 0:
            builder.disable_compile()
        return builder.build_ufunc()

    return wrap

numba_cuda/numba/cuda/tests/benchmarks/__init__.py

Whitespace-only changes.
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: BSD-2-Clause
3+
4+
import string
5+
from numba import cuda
6+
import numpy as np
7+
import pytest
8+
9+
10+
@pytest.fixture
def many_arrs():
    """Build one 10000-element float32 array via ``cuda.device_array`` per
    lowercase ASCII letter."""
    array_count = len(string.ascii_lowercase)
    arrays = []
    for _ in range(array_count):
        arrays.append(cuda.device_array(10000, dtype=np.float32))
    return arrays
16+
17+
18+
@pytest.fixture
def one_arr():
    """Build a single 10000-element float32 array via ``cuda.device_array``."""
    element_count = 10000
    arr = cuda.device_array(element_count, dtype=np.float32)
    return arr
21+
22+
23+
def test_one_arg(benchmark, one_arr):
    """Benchmark calling an empty kernel that takes one array argument."""
    kernel_signature = "void(float32[:])"

    @cuda.jit(kernel_signature)
    def one_arg(arr1):
        return

    # Configure the kernel with [1, 1] once, outside the benchmarked call.
    configured_kernel = one_arg[1, 1]
    benchmark(configured_kernel, one_arr)
29+
30+
31+
def test_many_args(benchmark, many_arrs):
    """Benchmark calling an empty kernel that takes one array argument per
    entry of ``many_arrs`` (26 named parameters, ``a`` through ``z``)."""
    # One "float32[:]" per supplied array, joined into a void signature.
    argument_types = ", ".join("float32[:]" for _ in many_arrs)
    kernel_signature = "void({})".format(argument_types)

    @cuda.jit(kernel_signature)
    def many_args(
        a, b, c, d, e, f, g, h, i, j, k, l, m,
        n, o, p, q, r, s, t, u, v, w, x, y, z,
    ):
        return

    # Configure the kernel with [1, 1] once, outside the benchmarked call.
    configured_kernel = many_args[1, 1]
    benchmark(configured_kernel, *many_arrs)

0 commit comments

Comments
 (0)