
Commit 3ebbe29

perf: cache dimension computations (#542)
This PR adds `functools.cache` decorators to a few dimension-computing functions that are called repeatedly with the same inputs. This trades memory for shorter conversion time: the cache grows with the number of unique combinations of shape, strides, dtype, and memory order. If a workload has many arrays with different shapes, strides, dtypes, and memory orderings, the cache could use a lot of memory. I'm not sure that's a likely scenario, but it's worth pointing out as a potential issue.
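To illustrate the trade-off, here is a minimal sketch with hypothetical names (not code from this PR): `functools.cache` memoizes a pure function keyed on its arguments, so repeated calls with the same shape/dtype become dictionary lookups, and the cache can be inspected or cleared if its growth becomes a concern.

```python
import functools

import numpy as np


@functools.cache
def row_major_strides(shape, itemsize):
    # Recomputed only on a cache miss; subsequent identical calls are O(1) lookups.
    strides = [0] * len(shape)
    acc = itemsize
    for d in reversed(range(len(shape))):
        strides[d] = acc
        acc *= shape[d]
    return tuple(strides)


print(row_major_strides((2, 3), np.dtype(np.float64).itemsize))  # (24, 8)
print(row_major_strides.cache_info())   # hits, misses, and current cache size
row_major_strides.cache_clear()         # frees the cache if memory becomes a concern
```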
1 parent 2567b28 commit 3ebbe29

File tree

8 files changed: +43 −23 lines changed

numba_cuda/numba/cuda/api.py

Lines changed: 1 addition & 2 deletions
@@ -39,10 +39,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
 
     shape = desc["shape"]
     strides = desc.get("strides")
-    dtype = np.dtype(desc["typestr"])
 
     shape, strides, dtype = prepare_shape_strides_dtype(
-        shape, strides, dtype, order="C"
+        shape, strides, desc["typestr"], order="C"
     )
     size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
 

numba_cuda/numba/cuda/api_util.py

Lines changed: 16 additions & 6 deletions
@@ -3,6 +3,8 @@
 
 import numpy as np
 
+import functools
+
 
 def prepare_shape_strides_dtype(shape, strides, dtype, order):
     dtype = np.dtype(dtype)
@@ -14,25 +16,33 @@ def prepare_shape_strides_dtype(shape, strides, dtype, order):
         raise TypeError("shape must be an integer or tuple of integers")
     if isinstance(shape, int):
         shape = (shape,)
+    else:
+        shape = tuple(shape)
     if isinstance(strides, int):
         strides = (strides,)
     else:
-        strides = strides or _fill_stride_by_order(shape, dtype, order)
+        if not strides:
+            strides = _fill_stride_by_order(shape, dtype, order)
+        else:
+            strides = tuple(strides)
     return shape, strides, dtype
 
 
+@functools.cache
 def _fill_stride_by_order(shape, dtype, order):
-    nd = len(shape)
-    if nd == 0:
+    ndims = len(shape)
+    if not ndims:
         return ()
-    strides = [0] * nd
+    strides = [0] * ndims
     if order == "C":
         strides[-1] = dtype.itemsize
-        for d in reversed(range(nd - 1)):
+        # -2 because we subtract one for zero-based indexing and another one
+        # for skipping the already-filled-in last element
+        for d in range(ndims - 2, -1, -1):
             strides[d] = strides[d + 1] * shape[d + 1]
     elif order == "F":
         strides[0] = dtype.itemsize
-        for d in range(1, nd):
+        for d in range(1, ndims):
             strides[d] = strides[d - 1] * shape[d - 1]
     else:
         raise ValueError("must be either C/F order")
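The extra `tuple(...)` conversions in `prepare_shape_strides_dtype` exist because `functools.cache` keys its cache on the arguments, so every argument must be hashable. A small standalone sketch (hypothetical function, not from this file) of what goes wrong otherwise:

```python
import functools


@functools.cache
def ndim(shape):
    # The shape value itself becomes part of the cache key.
    return len(shape)


print(ndim((16, 16)))   # fine: tuples are hashable
try:
    ndim([16, 16])      # lists are not hashable, so they cannot be cache keys
except TypeError as exc:
    print(exc)          # unhashable type: 'list'
```

With hashable inputs, the cached `_fill_stride_by_order` behaves as before; for example, a `(2, 3)` float64 array yields strides `(24, 8)` in C order and `(8, 16)` in F order.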

numba_cuda/numba/cuda/cudadrv/devicearray.py

Lines changed: 9 additions & 7 deletions
@@ -86,8 +86,13 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         """
         if isinstance(shape, int):
             shape = (shape,)
+        else:
+            shape = tuple(shape)
         if isinstance(strides, int):
             strides = (strides,)
+        else:
+            if strides:
+                strides = tuple(strides)
         dtype = np.dtype(dtype)
         itemsize = dtype.itemsize
         self.ndim = ndim = len(shape)
@@ -96,9 +101,6 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
         self._dummy = dummy = dummyarray.Array.from_desc(
             0, shape, strides, itemsize
         )
-        # confirm that all elements of shape are ints
-        if not all(isinstance(dim, (int, np.integer)) for dim in shape):
-            raise TypeError("all elements of shape must be ints")
         self.shape = shape = dummy.shape
         self.strides = strides = dummy.strides
         self.dtype = dtype
@@ -121,17 +123,17 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
 
     @property
     def __cuda_array_interface__(self):
-        if self.device_ctypes_pointer.value is not None:
-            ptr = self.device_ctypes_pointer.value
+        if (value := self.device_ctypes_pointer.value) is not None:
+            ptr = value
         else:
            ptr = 0
 
         return {
-            "shape": tuple(self.shape),
+            "shape": self.shape,
             "strides": None if is_contiguous(self) else tuple(self.strides),
             "data": (ptr, False),
             "typestr": self.dtype.str,
-            "stream": int(self.stream) if self.stream != 0 else None,
+            "stream": int(stream) if (stream := self.stream) != 0 else None,
             "version": 3,
         }
 
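For reference, the walrus expressions above only read `device_ctypes_pointer.value` and `self.stream` once instead of twice; the structure of the returned dict is unchanged, and `"shape"` can be returned directly now that `__init__` guarantees it is a tuple. Roughly (illustrative values only, not produced by real hardware), the property returns something like:

```python
# Illustrative only: what __cuda_array_interface__ might look like for a
# hypothetical 2x3 float32, C-contiguous device array on the default stream.
example_interface = {
    "shape": (2, 3),                  # guaranteed to be a tuple by __init__
    "strides": None,                  # None signals a contiguous layout
    "data": (0x7F0000000000, False),  # (device pointer, read-only flag)
    "typestr": "<f4",
    "stream": None,                   # stream 0 is reported as None
    "version": 3,
}
```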

numba_cuda/numba/cuda/cudadrv/driver.py

Lines changed: 1 addition & 0 deletions
@@ -3023,6 +3023,7 @@ def host_memory_extents(obj):
     return mviewbuf.memoryview_get_extents(obj)
 
 
+@functools.cache
 def memory_size_from_info(shape, strides, itemsize):
     """Get the byte size of a contiguous memory buffer given the shape, strides
     and itemsize.
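A hedged sketch of the quantity being cached here (not necessarily how the driver computes it): for positive strides, the byte extent of the region covering every element of the buffer.

```python
def buffer_extent_bytes(shape, strides, itemsize):
    # Offset of the last element plus one item gives the total span in bytes.
    return sum((dim - 1) * stride for dim, stride in zip(shape, strides)) + itemsize


print(buffer_extent_bytes((2, 3), (24, 8), 8))  # 48 bytes for a C-contiguous 2x3 float64
```

Note that `memory_size_from_info` is called with the tuples produced by `prepare_shape_strides_dtype`, which is what makes the `functools.cache` decoration safe: all of its arguments are hashable.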

numba_cuda/numba/cuda/cudadrv/dummyarray.py

Lines changed: 8 additions & 4 deletions
@@ -5,6 +5,7 @@
 import itertools
 import functools
 import operator
+import numpy as np
 
 
 Extent = namedtuple("Extent", ["begin", "end"])
@@ -245,9 +246,12 @@ class Array(object):
     is_array = True
 
     @classmethod
+    @functools.cache
     def from_desc(cls, offset, shape, strides, itemsize):
         dims = []
         for ashape, astride in zip(shape, strides):
+            if not isinstance(ashape, (int, np.integer)):
+                raise TypeError("all elements of shape must be ints")
             dim = Dim(
                 offset, offset + ashape * astride, ashape, astride, single=False
             )
@@ -442,8 +446,8 @@ def reshape(self, *newdims, **kws):
 
         ret = self.from_desc(
             self.extent.begin,
-            shape=newdims,
-            strides=newstrides,
+            shape=tuple(newdims),
+            strides=tuple(newstrides),
             itemsize=self.itemsize,
         )
 
@@ -471,8 +475,8 @@ def squeeze(self, axis=None):
                 newstrides.append(stride)
         newarr = self.from_desc(
             self.extent.begin,
-            shape=newshape,
-            strides=newstrides,
+            shape=tuple(newshape),
+            strides=tuple(newstrides),
             itemsize=self.itemsize,
         )
         return newarr, list(self.iter_contiguous_extent())
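Because `from_desc` is now cached, its arguments become cache keys, which is why `reshape` and `squeeze` above pass `tuple(...)` rather than lists, and why the per-element integer check moved inside it. A small sketch (hypothetical class, not `Array` itself) of stacking `functools.cache` under `classmethod`:

```python
import functools


class Grid:
    @classmethod
    @functools.cache
    def from_desc(cls, shape, strides):
        print("computing", shape, strides)   # printed only on a cache miss
        return cls()


a = Grid.from_desc((4, 4), (32, 8))   # miss: computes and caches
b = Grid.from_desc((4, 4), (32, 8))   # hit: no print, returns the cached object
print(a is b)                         # True
```

One consequence worth noting: the cache holds a reference to every object it has returned, which is the memory growth described in the commit message above.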

numba_cuda/numba/cuda/dispatcher.py

Lines changed: 6 additions & 2 deletions
@@ -1629,11 +1629,15 @@ def typeof_pyval(self, val):
         try:
             return typeof(val, Purpose.argument)
         except ValueError:
-            if cuda.is_cuda_array(val):
+            if (
+                interface := getattr(val, "__cuda_array_interface__", None)
+            ) is not None:
                 # When typing, we don't need to synchronize on the array's
                 # stream - this is done when the kernel is launched.
+
                 return typeof(
-                    cuda.as_cuda_array(val, sync=False), Purpose.argument
+                    cuda.from_cuda_array_interface(interface, sync=False),
+                    Purpose.argument,
                 )
             else:
                 raise
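In isolation, the pattern being adopted above, as a standalone sketch with a fake array class (not the dispatcher itself): read the `__cuda_array_interface__` attribute once and reuse the resulting dict, instead of probing with a helper and then converting the object separately.

```python
class FakeDeviceArray:
    # Any object exposing this attribute advertises the CUDA Array Interface.
    __cuda_array_interface__ = {
        "shape": (8,), "strides": None, "data": (0, False),
        "typestr": "<f4", "stream": None, "version": 3,
    }


val = FakeDeviceArray()
if (interface := getattr(val, "__cuda_array_interface__", None)) is not None:
    print(interface["shape"])   # the dict can be handed straight to a converter
else:
    raise TypeError("not a CUDA array")
```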

pixi.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -198,7 +198,7 @@ benchcmp = { cmd = [
     "numba.cuda.tests.benchmarks",
     "--benchmark-only",
     "--benchmark-enable",
-    "--benchmark-group-by=func",
+    "--benchmark-group-by=name",
     "--benchmark-compare",
 ] }
 
