NVIDIA
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 2 deletions b/‎README.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ci/test_conda.sh‎
Lines changed: 1 addition & 0 deletions b/‎ci/test_conda.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ci/test_simulator.sh‎
Lines changed: 1 addition & 0 deletions b/‎ci/test_simulator.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎numba_cuda/numba/cuda/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎numba_cuda/numba/cuda/api.py‎
Lines changed: 6 additions & 10 deletions b/‎numba_cuda/numba/cuda/api.py‎
Lines changed: 6 additions & 10 deletions
diff --git a/‎numba_cuda/numba/cuda/args.py‎
Lines changed: 1 addition & 5 deletions b/‎numba_cuda/numba/cuda/args.py‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎numba_cuda/numba/cuda/core/ssa.py‎
Lines changed: 4 additions & 4 deletions b/‎numba_cuda/numba/cuda/core/ssa.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎numba_cuda/numba/cuda/cudadrv/devicearray.py‎
Lines changed: 23 additions & 20 deletions b/‎numba_cuda/numba/cuda/cudadrv/devicearray.py‎
Lines changed: 23 additions & 20 deletions
diff --git a/‎numba_cuda/numba/cuda/cudadrv/devices.py‎
Lines changed: 29 additions & 33 deletions b/‎numba_cuda/numba/cuda/cudadrv/devices.py‎
Lines changed: 29 additions & 33 deletions
@@ -19,3 +19,4 @@ testing/*.ptx
 .pixi/*
 !.pixi/config.toml
 *.log
+.benchmarks
@@ -40,14 +40,14 @@ cd testing
 make -j $(nproc)
 export NUMBA_CUDA_TEST_BIN_DIR=`pwd`
 # Execute tests
-pytest -n auto -v
+pytest -n auto -v --dist loadscope
 ```
 
 Alternatively, you can use [pixi](https://pixi.sh/latest/installation/) to wrap all of that up for you:
 
 ```
 # run tests against CUDA 13
-pixi run -e cu13 test -n auto -v
+pixi run -e cu13 test -n auto -v --dist loadscope
 ```
 
 
 
@@ -35,6 +35,7 @@ DEPENDENCIES=(
     "psutil"
     "pytest"
     "pytest-xdist"
+    "pytest-benchmark"
     "cffi"
     "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
 
@@ -12,6 +12,7 @@ DEPENDENCIES=(
     "psutil"
     "pytest"
     "pytest-xdist"
+    "pytest-benchmark"
     "cffi"
     "ml_dtypes"
     "python=${RAPIDS_PY_VERSION}"
 
@@ -58,3 +58,5 @@
         "sys.setdlopenflags() to disable RTLD_GLOBAL "
         "if you encounter symbol conflicts."
     )
+
+from numba.cuda.np.ufunc import vectorize, guvectorize
@@ -21,7 +21,6 @@
 gpus = devices.gpus
 
 
-@require_context
 def from_cuda_array_interface(desc, owner=None, sync=True):
     """Create a DeviceNDArray from a cuda-array-interface description.
     The ``owner`` is the owner of the underlying memory.
@@ -49,9 +48,7 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
 
     cudevptr_class = driver.binding.CUdeviceptr
     devptr = cudevptr_class(desc["data"][0])
-    data = driver.MemoryPointer(
-        current_context(), devptr, size=size, owner=owner
-    )
+    data = driver.MemoryPointer(devptr, size=size, owner=owner)
     stream_ptr = desc.get("stream", None)
     if stream_ptr is not None:
         stream = external_stream(stream_ptr)
@@ -75,12 +72,11 @@ def as_cuda_array(obj, sync=True):
     If ``sync`` is ``True``, then the imported stream (if present) will be
     synchronized.
     """
-    if not is_cuda_array(obj):
-        raise TypeError("*obj* doesn't implement the cuda array interface.")
-    else:
-        return from_cuda_array_interface(
-            obj.__cuda_array_interface__, owner=obj, sync=sync
-        )
+    if (
+        interface := getattr(obj, "__cuda_array_interface__", None)
+    ) is not None:
+        return from_cuda_array_interface(interface, owner=obj, sync=sync)
+    raise TypeError("*obj* doesn't implement the cuda array interface.")
 
 
 def is_cuda_array(obj):
 
@@ -6,16 +6,13 @@
 memory transfers before & after the kernel call.
 """
 
-import abc
-
 from numba.cuda.typing.typeof import typeof, Purpose
 
 
-class ArgHint(metaclass=abc.ABCMeta):
+class ArgHint:
     def __init__(self, value):
         self.value = value
 
-    @abc.abstractmethod
     def to_device(self, retr, stream=0):
         """
         :param stream: a stream to use when copying data
@@ -25,7 +22,6 @@ def to_device(self, retr, stream=0):
         :return: a value (usually an `DeviceNDArray`) to be passed to
             the kernel
         """
-        pass
 
     @property
     def _numba_type_(self):
 
@@ -20,7 +20,7 @@
 from numba.cuda import config
 from numba.core import ir, errors
 from numba.cuda.core import ir_utils
-from numba.cuda.utils import OrderedSet, _lazy_pformat
+from numba.cuda.utils import _lazy_pformat
 from numba.cuda.core.analysis import compute_cfg_from_blocks
 
 
@@ -160,7 +160,7 @@ def _find_defs_violators(blocks, cfg):
     # Gather violators by number of definitions.
     # The violators are added by the order that they are seen and the algorithm
     # scan from the first to the last basic-block as they occur in bytecode.
-    violators = OrderedSet([k for k, vs in defs.items() if len(vs) > 1])
+    violators = {k: None for k, vs in defs.items() if len(vs) > 1}
     # Gather violators by uses not dominated by the one def
     doms = cfg.dominators()
     for k, use_blocks in uses.items():
@@ -169,9 +169,9 @@ def _find_defs_violators(blocks, cfg):
                 dom = doms[label]
                 def_labels = {label for _assign, label in defs[k]}
                 if not def_labels.intersection(dom):
-                    violators.add(k)
+                    violators[k] = None
                     break
-    _logger.debug("SSA violators %s", _lazy_pformat(violators))
+    _logger.debug("SSA violators %s", _lazy_pformat(list(violators)))
     return violators
 
 
 
@@ -15,7 +15,6 @@
 
 import numpy as np
 
-import numba
 from numba.cuda.cext import _devicearray
 from numba.cuda.cudadrv import devices, dummyarray
 from numba.cuda.cudadrv import driver as _driver
@@ -90,32 +89,31 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         if isinstance(strides, int):
             strides = (strides,)
         dtype = np.dtype(dtype)
-        self.ndim = len(shape)
-        if len(strides) != self.ndim:
+        itemsize = dtype.itemsize
+        self.ndim = ndim = len(shape)
+        if len(strides) != ndim:
             raise ValueError("strides not match ndim")
-        self._dummy = dummyarray.Array.from_desc(
-            0, shape, strides, dtype.itemsize
+        self._dummy = dummy = dummyarray.Array.from_desc(
+            0, shape, strides, itemsize
         )
         # confirm that all elements of shape are ints
         if not all(isinstance(dim, (int, np.integer)) for dim in shape):
             raise TypeError("all elements of shape must be ints")
-        self.shape = tuple(shape)
-        self.strides = tuple(strides)
+        self.shape = shape = dummy.shape
+        self.strides = strides = dummy.strides
         self.dtype = dtype
-        self.size = int(functools.reduce(operator.mul, self.shape, 1))
+        self.size = size = dummy.size
         # prepare gpu memory
-        if self.size > 0:
-            self.alloc_size = _driver.memory_size_from_info(
-                self.shape, self.strides, self.dtype.itemsize
+        if size:
+            self.alloc_size = alloc_size = _driver.memory_size_from_info(
+                shape, strides, itemsize
             )
             if gpu_data is None:
-                gpu_data = devices.get_context().memalloc(self.alloc_size)
+                gpu_data = devices.get_context().memalloc(alloc_size)
         else:
             # Make NULL pointer for empty allocation
             null = _driver.binding.CUdeviceptr(0)
-            gpu_data = _driver.MemoryPointer(
-                context=devices.get_context(), pointer=null, size=0
-            )
+            gpu_data = _driver.MemoryPointer(pointer=null, size=0)
             self.alloc_size = 0
 
         self.gpu_data = gpu_data
@@ -199,10 +197,11 @@ def _numba_type_(self):
     @property
     def device_ctypes_pointer(self):
         """Returns the ctypes pointer to the GPU data buffer"""
-        if self.gpu_data is None:
-            return c_void_p(0)
-        else:
+        try:
+            # EAFP, meant to be faster: gpu_data of None raises AttributeError
             return self.gpu_data.device_ctypes_pointer
+        except AttributeError:
+            return c_void_p(0)
 
     @devices.require_context
     def copy_to_device(self, ary, stream=0):
@@ -901,8 +900,12 @@ def auto_device(obj, stream=0, copy=True, user_explicit=False):
     """
     if _driver.is_device_memory(obj):
         return obj, False
-    elif hasattr(obj, "__cuda_array_interface__"):
-        return numba.cuda.as_cuda_array(obj), False
+    elif (
+        interface := getattr(obj, "__cuda_array_interface__", None)
+    ) is not None:
+        from numba.cuda.api import from_cuda_array_interface
+
+        return from_cuda_array_interface(interface, owner=obj), False
     else:
         if isinstance(obj, np.void):
             devobj = from_record_like(obj, stream=stream)
 
@@ -21,23 +21,14 @@
 from .driver import driver
 
 
-class _DeviceList(object):
-    def __getattr__(self, attr):
-        # First time looking at "lst" attribute.
-        if attr == "lst":
-            # Device list is not initialized.
-            # Query all CUDA devices.
-            numdev = driver.get_device_count()
-            gpus = [
-                _DeviceContextManager(driver.get_device(devid))
-                for devid in range(numdev)
-            ]
-            # Define "lst" to avoid re-initialization
-            self.lst = gpus
-            return gpus
-
-        # Other attributes
-        return super(_DeviceList, self).__getattr__(attr)
+class _DeviceList:
+    @functools.cached_property
+    def lst(self):
+        """Per-device context managers, built on first access and cached."""
+        return [
+            _DeviceContextManager(driver.get_device(devid))
+            for devid in range(driver.get_device_count())
+        ]
 
     def __getitem__(self, devnum):
         """
@@ -79,6 +70,9 @@ class _DeviceContextManager(object):
 
     def __init__(self, device):
         self._device = device
+        # Forwarded directly, to avoid the performance overhead of
+        # `__getattr__` and method lookup for a commonly accessed method
+        self.get_primary_context = self._device.get_primary_context
 
     def __getattr__(self, item):
         return getattr(self._device, item)
@@ -88,10 +82,10 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         # this will verify that we are popping the right device context.
-        self._device.get_primary_context().pop()
+        self.get_primary_context().pop()
 
     def __str__(self):
-        return "<Managed Device {self.id}>".format(self=self)
+        return f"<Managed Device {self.id}>"
 
 
 class _Runtime(object):
@@ -147,7 +141,8 @@ def get_or_create_context(self, devnum):
                 return attached_ctx
         else:
             devnum = int(devnum)
-            return self._activate_context_for(devnum)
+            with self._lock:
+                return self._activate_context_for(devnum)
 
     def _get_or_create_context_uncached(self, devnum):
         """See also ``get_or_create_context(devnum)``.
@@ -166,28 +161,29 @@ def _get_or_create_context_uncached(self, devnum):
                     ctx_handle = ctx.handle.value
                     ac_ctx_handle = ac.context_handle.value
                     if ctx_handle != ac_ctx_handle:
-                        msg = (
+                        raise RuntimeError(
                             "Numba cannot operate on non-primary"
-                            " CUDA context {:x}"
+                            f" CUDA context {ac_ctx_handle:x}"
                         )
-                        raise RuntimeError(msg.format(ac_ctx_handle))
                     # Ensure the context is ready
                     ctx.prepare_for_use()
                 return ctx
 
     def _activate_context_for(self, devnum):
-        with self._lock:
-            gpu = self.gpus[devnum]
-            newctx = gpu.get_primary_context()
-            # Detect unexpected context switch
-            cached_ctx = self._get_attached_context()
-            if cached_ctx is not None and cached_ctx is not newctx:
-                raise RuntimeError("Cannot switch CUDA-context.")
-            newctx.push()
-            return newctx
+        gpu = self.gpus[devnum]
+        newctx = gpu.get_primary_context()
+        # Detect unexpected context switch
+        cached_ctx = self._get_attached_context()
+        if cached_ctx is not None and cached_ctx is not newctx:
+            raise RuntimeError("Cannot switch CUDA-context.")
+        newctx.push()
+        return newctx
 
     def _get_attached_context(self):
-        return getattr(self._tls, "attached_context", None)
+        try:
+            return self._tls.attached_context
+        except AttributeError:
+            return None
 
     def _set_attached_context(self, ctx):
         self._tls.attached_context = ctx
Original file line number	Diff line number	Diff line change
`@@ -58,3 +58,5 @@`
`58`	`58`	`"sys.setdlopenflags() to disable RTLD_GLOBAL "`
`59`	`59`	`"if you encounter symbol conflicts."`
`60`	`60`	`)`
	`61`	`+`
	`62`	`+from numba.cuda.np.ufunc import vectorize, guvectorize`