
Commit 24a02ef

Implement alignment support for local and shared arrays.
1 parent 3e9e705 commit 24a02ef

4 files changed: +229, -21 lines
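The change in a nutshell: cuda.local.array and cuda.shared.array gain an optional alignment keyword argument. A minimal usage sketch (illustrative only, not taken from the commit; it assumes a CUDA-capable device and a build that contains this change):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def double_elements(out):
        # Per-thread scratch buffer whose base address is 256-byte aligned.
        scratch = cuda.local.array(shape=16, dtype=np.float32, alignment=256)
        i = cuda.grid(1)
        if i < out.size:
            scratch[0] = out[i] * 2.0
            out[i] = scratch[0]

    out = np.arange(32, dtype=np.float32)
    double_elements[1, 32](out)

Omitting alignment (or passing alignment=None) keeps the existing behaviour.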

numba_cuda/numba/cuda/cudadecl.py

Lines changed: 9 additions & 1 deletion
@@ -25,7 +25,7 @@
 
 class Cuda_array_decl(CallableTemplate):
     def generic(self):
-        def typer(shape, dtype):
+        def typer(shape, dtype, alignment=None):
 
             # Only integer literals and tuples of integer literals are valid
             # shapes
@@ -39,6 +39,14 @@ def typer(shape, dtype):
             else:
                 return None
 
+            # N.B. We don't do anything with alignment in this routine; it's
+            #      not part of the underlying types.Array interface, so we
+            #      don't need to pass it down the stack.  The value supplied
+            #      to the array declaration will be handled in the lowering.
+            #
+            #      E.g. `cuda.local.array(..., alignment=256)` will be handled
+            #      by `cudaimpl.cuda_local_array_integer()`.
+
             ndim = parse_shape(shape)
             nb_dtype = parse_dtype(dtype)
             if nb_dtype is not None and ndim is not None:
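The comment above is the central design point: alignment is not part of the array type, so an aligned and an unaligned declaration resolve to exactly the same type, and only the generated allocation differs. A toy stand-in for the typer (purely illustrative, not the numba code path) makes that concrete:

    # Toy stand-in for the typer: alignment is accepted but deliberately ignored,
    # so the returned "type" is identical with or without it.
    def typer_sketch(shape, dtype, alignment=None):
        ndim = len(shape) if isinstance(shape, tuple) else 1
        return (dtype, ndim, "C")

    assert typer_sketch((4, 4), "float32") == \
           typer_sketch((4, 4), "float32", alignment=256)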

numba_cuda/numba/cuda/cudaimpl.py

Lines changed: 82 additions & 10 deletions
@@ -1,6 +1,7 @@
 from functools import reduce
 import operator
 import math
+import struct
 
 from llvmlite import ir
 import llvmlite.binding as ll
@@ -91,35 +92,85 @@ def _get_unique_smem_id(name):
     return "{0}_{1}".format(name, _unique_smem_id)
 
 
+def _validate_alignment(alignment: int):
+    """
+    Ensures that *alignment*, if not None, is a) greater than zero, b) a power
+    of two, and c) a multiple of the size of a pointer.  If any of these
+    conditions are not met, a NumbaValueError is raised.  Otherwise, this
+    function returns None, indicating that the alignment is valid.
+    """
+    if alignment is None:
+        return
+    if not isinstance(alignment, int):
+        raise ValueError("Alignment must be an integer")
+    if alignment <= 0:
+        raise ValueError("Alignment must be positive")
+    if (alignment & (alignment - 1)) != 0:
+        raise ValueError("Alignment must be a power of 2")
+    pointer_size = struct.calcsize("P")
+    if (alignment % pointer_size) != 0:
+        msg = f"Alignment must be a multiple of {pointer_size}"
+        raise ValueError(msg)
+
+
 @lower(cuda.shared.array, types.IntegerLiteral, types.Any)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_shared_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = None
+    if len(sig.args) == 3:
+        try:
+            alignment = sig.args[2].literal_value
+            _validate_alignment(alignment)
+        except (AttributeError, ValueError):
+            pass
     return _generic_array(context, builder, shape=(length,), dtype=dtype,
                           symbol_name=_get_unique_smem_id('_cudapy_smem'),
                           addrspace=nvvm.ADDRSPACE_SHARED,
-                          can_dynsized=True)
+                          can_dynsized=True, alignment=alignment)
 
 
 @lower(cuda.shared.array, types.Tuple, types.Any)
 @lower(cuda.shared.array, types.UniTuple, types.Any)
+@lower(cuda.shared.array, types.Tuple, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.UniTuple, types.Any, types.IntegerLiteral)
+@lower(cuda.shared.array, types.Tuple, types.Any, types.NoneType)
+@lower(cuda.shared.array, types.UniTuple, types.Any, types.NoneType)
 def cuda_shared_array_tuple(context, builder, sig, args):
     shape = [ s.literal_value for s in sig.args[0] ]
     dtype = parse_dtype(sig.args[1])
+    alignment = None
+    if len(sig.args) == 3:
+        try:
+            alignment = sig.args[2].literal_value
+            _validate_alignment(alignment)
+        except (AttributeError, ValueError):
+            pass
     return _generic_array(context, builder, shape=shape, dtype=dtype,
                           symbol_name=_get_unique_smem_id('_cudapy_smem'),
                           addrspace=nvvm.ADDRSPACE_SHARED,
-                          can_dynsized=True)
+                          can_dynsized=True, alignment=alignment)
 
 
 @lower(cuda.local.array, types.IntegerLiteral, types.Any)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.IntegerLiteral)
+@lower(cuda.local.array, types.IntegerLiteral, types.Any, types.NoneType)
 def cuda_local_array_integer(context, builder, sig, args):
     length = sig.args[0].literal_value
     dtype = parse_dtype(sig.args[1])
+    alignment = None
+    if len(sig.args) == 3:
+        try:
+            alignment = sig.args[2].literal_value
+            _validate_alignment(alignment)
+        except (AttributeError, ValueError):
+            pass
     return _generic_array(context, builder, shape=(length,), dtype=dtype,
                           symbol_name='_cudapy_lmem',
                           addrspace=nvvm.ADDRSPACE_LOCAL,
-                          can_dynsized=False)
+                          can_dynsized=False, alignment=alignment)
 
 
 @lower(cuda.local.array, types.Tuple, types.Any)
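For reference, the rules _validate_alignment enforces in the hunk above can be exercised in plain Python. This standalone snippet (not part of the commit) mirrors them; the expected results assume a 64-bit host, where struct.calcsize("P") == 8:

    import struct

    def is_valid_alignment(alignment):
        # Mirrors _validate_alignment: None is fine, otherwise the value must be
        # a positive power-of-two integer that is a multiple of the pointer size.
        if alignment is None:
            return True
        return (isinstance(alignment, int)
                and alignment > 0
                and (alignment & (alignment - 1)) == 0
                and alignment % struct.calcsize("P") == 0)

    assert all(is_valid_alignment(a) for a in (None, 8, 16, 32, 64, 128, 256))
    assert not any(is_valid_alignment(a) for a in (-1, 0, 3, 4, 7, 9, 15, 63))

Note that the lowering functions above swallow AttributeError, so a NoneType third argument (which has no literal_value) simply leaves alignment as None.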
@@ -954,7 +1005,7 @@ def ptx_nanosleep(context, builder, sig, args):
 
 
 def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
-                   can_dynsized=False):
+                   can_dynsized=False, alignment=None):
     elemcount = reduce(operator.mul, shape, 1)
 
     # Check for valid shape for this type of allocation.
@@ -981,17 +1032,37 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
         # NVVM is smart enough to only use local memory if no register is
         # available
         dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
+
+        # If the caller has specified a custom alignment, just set the align
+        # attribute on the alloca IR directly.  We don't do any additional
+        # hand-holding here like checking the underlying data type's alignment
+        # or rounding up to the next power of 2--those checks will have already
+        # been done by the time we see the alignment value.
+        if alignment is not None:
+            dataptr.align = alignment
     else:
         lmod = builder.module
 
         # Create global variable in the requested address space
         gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name,
                                             addrspace)
-        # Specify alignment to avoid misalignment bug
-        align = context.get_abi_sizeof(lldtype)
-        # Alignment is required to be a power of 2 for shared memory. If it is
-        # not a power of 2 (e.g. for a Record array) then round up accordingly.
-        gvmem.align = 1 << (align - 1 ).bit_length()
+
+        # If the caller hasn't specified a custom alignment, obtain the
+        # underlying dtype alignment from the ABI and then round it up to
+        # a power of two.  Otherwise, just use the caller's alignment.
+        #
+        # N.B. The caller *could* provide a valid-but-smaller-than-natural
+        #      alignment here; we'll assume the caller knows what they're
+        #      doing and let that through without error.
+
+        if alignment is None:
+            abi_alignment = context.get_abi_alignment(lldtype)
+            # Ensure a power of two alignment.
+            actual_alignment = 1 << (abi_alignment - 1).bit_length()
+        else:
+            actual_alignment = alignment
+
+        gvmem.align = actual_alignment
 
         if dynamic_smem:
             gvmem.linkage = 'external'
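When no alignment is supplied, the hunk above derives the symbol's alignment from the ABI and rounds it up to a power of two with the (n - 1).bit_length() trick. A standalone worked illustration of that expression (not part of the commit):

    def round_up_pow2(n):
        # 1 -> 1, 2 -> 2, 3 -> 4, 6 -> 8, 12 -> 16, ...
        return 1 << (n - 1).bit_length()

    assert [round_up_pow2(n) for n in (1, 2, 3, 4, 6, 8, 12, 16)] == \
           [1, 2, 4, 4, 8, 8, 16, 16]

In both branches a caller-supplied value is applied verbatim (dataptr.align or gvmem.align), so a deliberately smaller-than-natural alignment is allowed, as the N.B. comment explains.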
@@ -1041,7 +1112,8 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
 
     # Create array object
     ndim = len(shape)
-    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
+    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C',
+                        alignment=alignment)
     ary = context.make_array(aryty)(context, builder)
 
     context.populate_array(ary,

numba_cuda/numba/cuda/stubs.py

Lines changed: 24 additions & 10 deletions
@@ -116,12 +116,17 @@ class shared(Stub):
     _description_ = '<shared>'
 
     @stub_function
-    def array(shape, dtype):
+    def array(shape, dtype, alignment=None):
         '''
-        Allocate a shared array of the given *shape* and *type*. *shape* is
-        either an integer or a tuple of integers representing the array's
-        dimensions. *type* is a :ref:`Numba type <numba-types>` of the
-        elements needing to be stored in the array.
+        Allocate a shared array of the given *shape*, *type*, and, optionally,
+        *alignment*.  *shape* is either an integer or a tuple of integers
+        representing the array's dimensions.  *type* is a :ref:`Numba type
+        <numba-types>` of the elements needing to be stored in the array.
+        *alignment* is an optional integer specifying the byte alignment of
+        the array.  When specified, it must be a power of two, and a multiple
+        of the size of a pointer (4 for 32-bit, 8 for 64-bit).  When not
+        specified, the array is allocated with an alignment appropriate for
+        the supplied *dtype*.
 
         The returned array-like object can be read and written to like any
         normal device array (e.g. through indexing).
@@ -135,12 +140,21 @@ class local(Stub):
     _description_ = '<local>'
 
     @stub_function
-    def array(shape, dtype):
+    def array(shape, dtype, alignment=None):
         '''
-        Allocate a local array of the given *shape* and *type*. The array is
-        private to the current thread, and resides in global memory. An
-        array-like object is returned which can be read and written to like any
-        standard array (e.g. through indexing).
+        Allocate a local array of the given *shape*, *type*, and, optionally,
+        *alignment*.  *shape* is either an integer or a tuple of integers
+        representing the array's dimensions.  *type* is a :ref:`Numba type
+        <numba-types>` of the elements needing to be stored in the array.
+        *alignment* is an optional integer specifying the byte alignment of
+        the array.  When specified, it must be a power of two, and a multiple
+        of the size of a pointer (4 for 32-bit, 8 for 64-bit).  When not
+        specified, the array is allocated with an alignment appropriate for
+        the supplied *dtype*.
+
+        The array is private to the current thread, and resides in global
+        memory.  An array-like object is returned which can be read and
+        written to like any standard array (e.g. through indexing).
         '''
 
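As the updated docstrings describe, the keyword applies to shared arrays as well, and the value must be a power of two and a multiple of the pointer size. A short shared-memory sketch (illustrative only, not from the commit; assumes a CUDA device and a 64-bit platform, where 16 is a valid multiple of the 8-byte pointer size):

    import numpy as np
    from numba import cuda

    @cuda.jit
    def reverse_block(dst, src):
        # Shared tile whose base address is 16-byte aligned.
        tile = cuda.shared.array(shape=32, dtype=np.float32, alignment=16)
        t = cuda.threadIdx.x
        tile[t] = src[t]
        cuda.syncthreads()
        dst[t] = tile[31 - t]

    src = np.arange(32, dtype=np.float32)
    dst = np.zeros_like(src)
    reverse_block[1, 32](dst, src)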

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+import itertools
+import numpy as np
+from numba import cuda
+from numba.cuda.testing import CUDATestCase
+from numba.core.errors import TypingError
+import unittest
+
+
+# Set to true if you want to see dots printed for each subtest.
+NOISY = True
+
+
+# N.B. We name the test class TestArrayAddressAlignment to avoid name conflict
+#      with the test_alignment.TestArrayAlignment class.
+
+
+class TestArrayAddressAlignment(CUDATestCase):
+    """
+    Test cuda.local.array and cuda.shared.array support for an alignment
+    keyword argument.
+    """
+
+    def test_array_alignment(self):
+        shapes = (1, 3, 4, 8, 9, 50)
+        dtypes = (np.uint8, np.uint16, np.uint32, np.uint64)
+        alignments = (None, 8, 16, 32, 64, 128, 256)
+        array_types = [(0, 'local'), (1, 'shared')]
+
+        items = itertools.product(array_types, shapes, dtypes, alignments)
+
+        for (which, array_type), shape, dtype, alignment in items:
+            with self.subTest(array_type=array_type, shape=shape,
+                              dtype=dtype, alignment=alignment):
+                @cuda.jit
+                def f(loc, shrd, which):
+                    i = cuda.grid(1)
+                    if which == 0:
+                        local_array = cuda.local.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            loc[0] = local_array.ctypes.data
+                    else:
+                        shared_array = cuda.shared.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            shrd[0] = shared_array.ctypes.data
+
+                loc = np.zeros(1, dtype=np.uint64)
+                shrd = np.zeros(1, dtype=np.uint64)
+                f[1, 1](loc, shrd, which)
+
+                if alignment is not None:
+                    address = loc[0] if which == 0 else shrd[0]
+                    alignment_mod = int(address % alignment)
+                    self.assertEqual(alignment_mod, 0)
+
+                if NOISY:
+                    print('.', end='', flush=True)
+
+    def test_invalid_aligments(self):
+        shapes = (1, 3, 4, 8, 9, 50)
+        dtypes = (np.uint8, np.uint16, np.uint32, np.uint64)
+        alignments = (-1, 0, 3, 5, 7, 9, 15, 17, 31, 33, 63, 65)
+        array_types = [(0, 'local'), (1, 'shared')]
+
+        items = itertools.product(array_types, shapes, dtypes, alignments)
+
+        for (which, array_type), shape, dtype, alignment in items:
+            with self.subTest(array_type=array_type, shape=shape,
+                              dtype=dtype, alignment=alignment):
+                @cuda.jit
+                def f(local_array, shared_array, which):
+                    i = cuda.grid(1)
+                    if which == 0:
+                        local_array = cuda.local.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            local_array[0] = local_array.ctypes.data
+                    else:
+                        shared_array = cuda.shared.array(
+                            shape=shape,
+                            dtype=dtype,
+                            alignment=alignment,
+                        )
+                        if i == 0:
+                            shared_array[0] = shared_array.ctypes.data
+
+                loc = np.zeros(1, dtype=np.uint64)
+                shrd = np.zeros(1, dtype=np.uint64)
+
+                with self.assertRaises(TypingError) as raises:
+                    f[1, 1](loc, shrd, which)
+                exc = str(raises.exception)
+                self.assertIn("Alignment must be", exc)
+
+                if NOISY:
+                    print('.', end='', flush=True)
+
+    def test_array_like(self):
+        # XXX-140: TODO; need to flush out the array_like stuff more.
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
