NVIDIA
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ci/test_conda.sh‎
Lines changed: 1 addition & 0 deletions b/‎ci/test_conda.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/test_simulator.sh‎
Lines changed: 1 addition & 0 deletions b/‎ci/test_simulator.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎numba_cuda/numba/cuda/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/np/ufunc/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎numba_cuda/numba/cuda/np/ufunc/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/np/ufunc/decorators.py‎
Lines changed: 203 additions & 0 deletions b/‎numba_cuda/numba/cuda/np/ufunc/decorators.py‎
Lines changed: 203 additions & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/tests/benchmarks/__init__.py‎ b/‎numba_cuda/numba/cuda/tests/benchmarks/__init__.py‎
diff --git a/‎numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py‎
Lines changed: 63 additions & 0 deletions b/‎numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py‎
Lines changed: 63 additions & 0 deletions
@@ -19,3 +19,4 @@ testing/*.ptx
 .pixi/*
 !.pixi/config.toml
 *.log
+.benchmarks
@@ -40,14 +40,14 @@ cd testing
 make -j $(nproc)
 export NUMBA_CUDA_TEST_BIN_DIR=`pwd`
 # Execute tests
-pytest -n auto -v
+pytest -n auto -v --dist loadscope
 ```
 
 Alternatively, you can use [pixi](https://pixi.sh/latest/installation/) to wrap all of that up for you:
 
 ```
 # run tests against CUDA 13
-pixi run -e cu13 test -n auto -v
+pixi run -e cu13 test -n auto -v --dist loadscope
 ```
 
 
 
@@ -35,6 +35,7 @@ DEPENDENCIES=(
     "psutil"
     "pytest"
     "pytest-xdist"
+    "pytest-benchmark"
     "cffi"
     "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
 
@@ -12,6 +12,7 @@ DEPENDENCIES=(
     "psutil"
     "pytest"
     "pytest-xdist"
+    "pytest-benchmark"
     "cffi"
     "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
 
@@ -64,3 +64,5 @@
         "sys.setdlopenflags() to disable RTLD_GLOBAL "
         "if you encounter symbol conflicts."
     )
+
+from numba.cuda.np.ufunc import vectorize, guvectorize
@@ -0,0 +1,4 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+from numba.cuda.np.ufunc.decorators import vectorize, guvectorize  # noqa: F401
@@ -0,0 +1,203 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+from numba.core.registry import DelayedRegistry
+from numba.cuda.vectorizers import CUDAVectorize, CUDAGUFuncVectorize
+
+
+class _BaseVectorize(object):
+    """Shared option parsing for the ufunc builder front-ends.
+
+    Every helper pops one recognised option out of the caller's keyword
+    dictionary, so that whatever remains afterwards can be forwarded as
+    target options.  Subclasses are expected to define ``target_registry``,
+    a mapping from target name to the builder implementation.
+    """
+
+    @classmethod
+    def get_identity(cls, kwargs):
+        """Pop and return ``identity`` from *kwargs* (``None`` if absent)."""
+        return kwargs.pop("identity", None)
+
+    @classmethod
+    def get_cache(cls, kwargs):
+        """Pop and return ``cache`` from *kwargs* (``False`` if absent)."""
+        return kwargs.pop("cache", False)
+
+    @classmethod
+    def get_writable_args(cls, kwargs):
+        """Pop and return ``writable_args`` from *kwargs* (``()`` if absent)."""
+        return kwargs.pop("writable_args", ())
+
+    @classmethod
+    def get_target_implementation(cls, kwargs):
+        """Pop ``target`` from *kwargs* and look it up in the registry.
+
+        The fallback target name is ``"cpu"``; it is only usable when the
+        subclass registry actually contains that key.
+
+        Raises
+        ------
+        ValueError
+            If the target name is not a key of ``cls.target_registry``.
+        """
+        target = kwargs.pop("target", "cpu")
+        try:
+            return cls.target_registry[target]
+        except KeyError:
+            # Wrap the value in a 1-tuple: a bare ``%`` would unpack a tuple
+            # target and either raise TypeError or print only its element.
+            raise ValueError("Unsupported target: %s" % (target,))
+
+
+class Vectorize(_BaseVectorize):
+    """Front-end that hands back a target-specific ufunc builder.
+
+    ``__new__`` does not create a ``Vectorize`` instance: it returns whatever
+    the implementation registered for the requested target constructs.
+    """
+
+    target_registry = DelayedRegistry({"cuda": CUDAVectorize})
+
+    def __new__(cls, func, **kws):
+        # Each helper removes its own key, so ``kws`` ends up holding only
+        # the options that are passed through as ``targetoptions``.
+        identity_value = cls.get_identity(kws)
+        cache_flag = cls.get_cache(kws)
+        builder_cls = cls.get_target_implementation(kws)
+        return builder_cls(
+            func,
+            identity=identity_value,
+            cache=cache_flag,
+            targetoptions=kws,
+        )
+
+
+class GUVectorize(_BaseVectorize):
+    """Front-end that hands back a target-specific gufunc builder.
+
+    ``__new__`` does not create a ``GUVectorize`` instance: it returns
+    whatever the implementation registered for the requested target
+    constructs.
+    """
+
+    target_registry = DelayedRegistry({"cuda": CUDAGUFuncVectorize})
+
+    def __new__(cls, func, signature, **kws):
+        # Each helper removes its own key, so ``kws`` ends up holding only
+        # the options that are passed through as ``targetoptions``.
+        identity_value = cls.get_identity(kws)
+        cache_flag = cls.get_cache(kws)
+        builder_cls = cls.get_target_implementation(kws)
+        writable = cls.get_writable_args(kws)
+        return builder_cls(
+            func,
+            signature,
+            identity=identity_value,
+            cache=cache_flag,
+            targetoptions=kws,
+            writable_args=writable,
+        )
+
+
+def vectorize(ftylist_or_function=(), target="cuda", **kws):
+    """vectorize(ftylist_or_function=(), target='cuda', identity=None, **kws)
+
+    A decorator that creates a NumPy ufunc object using Numba compiled
+    code.  When no signatures are given (no arguments, only keyword
+    arguments, or the bare ``@vectorize`` form), no signature is added to
+    the builder and compilation is left enabled, so specialization may
+    occur at call-time.
+
+    Args
+    -----
+    ftylist_or_function: function, iterable, str or None
+
+        When the first argument is a function, it is decorated directly
+        with an empty signature list and signatures are dealt with at
+        call-time.
+
+        When the first argument is an iterable of type signatures,
+        which are either function type object or a string describing
+        the function type, signatures are finalized at decoration
+        time.  A single signature string is treated as a one-element
+        list; ``None`` is treated as an empty list.
+
+    Keyword Args
+    ------------
+
+    target: str
+            A string for code generation target.  Default to "cuda".
+
+    identity: int, str, or None
+        The identity (or unit) value for the element-wise function
+        being implemented.  Allowed values are None (the default), 0, 1,
+        and "reorderable".
+
+    cache: bool
+        Turns on caching.
+
+
+    Returns
+    --------
+
+    A NumPy universal function, or a decorator producing one when the
+    first argument is not a function.
+
+    Examples
+    --------
+        @vectorize(['float32(float32, float32)',
+                    'float64(float64, float64)'], identity=0)
+        def sum(a, b):
+            return a + b
+
+        @vectorize
+        def sum(a, b):
+            return a + b
+
+        @vectorize(identity=1)
+        def mul(a, b):
+            return a * b
+
+    """
+    import inspect
+
+    # Bare ``@vectorize``: the positional argument is the function itself,
+    # not a signature list.
+    decorating_directly = inspect.isfunction(ftylist_or_function)
+
+    if decorating_directly or ftylist_or_function is None:
+        # Always bind ftylist so that ``wrap`` never hits an unbound name.
+        ftylist = ()
+    elif isinstance(ftylist_or_function, str):
+        # Common user mistake
+        ftylist = [ftylist_or_function]
+    else:
+        ftylist = ftylist_or_function
+
+    def wrap(func):
+        # ``target`` is a named parameter, so it can never also be in kws.
+        vec = Vectorize(func, target=target, **kws)
+        for sig in ftylist:
+            vec.add(sig)
+        if len(ftylist) > 0:
+            # Explicit signatures were supplied: freeze the set.
+            vec.disable_compile()
+        return vec.build_ufunc()
+
+    if decorating_directly:
+        return wrap(ftylist_or_function)
+    return wrap
+
+
+def guvectorize(*args, **kwargs):
+    """guvectorize(ftylist, signature, target='cuda', identity=None, **kws)
+
+    Decorator that builds a NumPy generalized ufunc from Numba compiled
+    code.
+
+    Accepts either ``(signature,)`` or ``(ftylist, signature)`` as
+    positional arguments.  With only a signature, the type list is empty
+    and ``is_dynamic=True`` is added to the keyword options unless the
+    caller already supplied ``is_dynamic``.
+
+    Args
+    -----
+    ftylist: iterable
+        Type signatures, each either a function type object or a string
+        describing the function type.  A single string is wrapped in a
+        one-element list.
+
+    signature: str
+        A NumPy generalized-ufunc signature,
+        e.g. "(m, n), (n, p)->(m, p)"
+
+    identity: int, str, or None
+        The identity (or unit) value for the element-wise function
+        being implemented.  Allowed values are None (the default), 0, 1,
+        and "reorderable".
+
+    cache: bool
+        Turns on caching.
+
+    writable_args: tuple
+        Indices of the input variables that are writable.
+
+    target: str
+        Code generation target.  Defaults to "cuda".
+
+    Returns
+    --------
+
+    A decorator that turns a function into a NumPy generalized
+    universal-function.
+
+    Raises
+    ------
+    TypeError
+        If the number of positional arguments is not one or two.
+
+    Example
+    -------
+        @guvectorize(['void(int32[:,:], int32[:,:], int32[:,:])',
+                      'void(float32[:,:], float32[:,:], float32[:,:])'],
+                      '(x, y),(x, y)->(x, y)')
+        def add_2d_array(a, b, c):
+            for i in range(c.shape[0]):
+                for j in range(c.shape[1]):
+                    c[i, j] = a[i, j] + b[i, j]
+
+    """
+    positional_count = len(args)
+    if positional_count == 2:
+        ftylist, signature = args
+    elif positional_count == 1:
+        (signature,) = args
+        ftylist = []
+        kwargs.setdefault("is_dynamic", True)
+    else:
+        raise TypeError("guvectorize() takes one or two positional arguments")
+
+    # Common user mistake: a lone signature string instead of a list.
+    if isinstance(ftylist, str):
+        ftylist = [ftylist]
+
+    kwargs.setdefault("target", "cuda")
+
+    def wrap(func):
+        builder = GUVectorize(func, signature, **kwargs)
+        for type_sig in ftylist:
+            builder.add(type_sig)
+        if len(ftylist) > 0:
+            builder.disable_compile()
+        return builder.build_ufunc()
+
+    return wrap
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-2-Clause
+
+import string
+from numba import cuda
+import numpy as np
+import pytest
+
+
+@pytest.fixture
+def many_arrs():
+    """Return one 10000-element float32 device array per lowercase ASCII letter."""
+    array_count = len(string.ascii_lowercase)
+    arrays = []
+    for _ in range(array_count):
+        arrays.append(cuda.device_array(10000, dtype=np.float32))
+    return arrays
+
+
+@pytest.fixture
+def one_arr():
+    """Return a single 10000-element float32 device array."""
+    device_arr = cuda.device_array(10000, dtype=np.float32)
+    return device_arr
+
+
+def test_one_arg(benchmark, one_arr):
+    """Benchmark calling an empty one-argument kernel configured with [1, 1]."""
+
+    @cuda.jit("void(float32[:])")
+    def one_arg(arr1):
+        return
+
+    configured_kernel = one_arg[1, 1]
+    benchmark(configured_kernel, one_arr)
+
+
+def test_many_args(benchmark, many_arrs):
+    """Benchmark calling an empty 26-argument kernel configured with [1, 1]."""
+    # One "float32[:]" entry per array supplied by the fixture.
+    arg_types = ", ".join("float32[:]" for _ in many_arrs)
+    kernel_signature = "void({})".format(arg_types)
+
+    @cuda.jit(kernel_signature)
+    def many_args(
+        a, b, c, d, e, f, g, h, i, j, k, l, m,
+        n, o, p, q, r, s, t, u, v, w, x, y, z,
+    ):
+        return
+
+    configured_kernel = many_args[1, 1]
+    benchmark(configured_kernel, *many_arrs)
Original file line number	Diff line number	Diff line change
`@@ -64,3 +64,5 @@`
`64`	`64`	`"sys.setdlopenflags() to disable RTLD_GLOBAL "`
`65`	`65`	`"if you encounter symbol conflicts."`
`66`	`66`	`)`
	`67`	`+`
	`68`	`+from numba.cuda.np.ufunc import vectorize, guvectorize`