Fix freezing of constant arrays with negative strides (#589)

brandon-b-miller · web-flow · commit 5389798fd4b0 · 2025-11-14T23:06:17.000Z
This PR fixes a bug when freezing certain views of NumPy arrays into
kernels. The current implementation loops through the bytes of the
source array in viewed order but keeps a potentially negative stride,
without also moving the data pointer to the end of the array in that case.
This PR changes it so that the physical array we put in constant memory
is just the contiguous logical equivalent of the original array with a
positive stride. This should be simpler than actually working through
the pointer arithmetic, given that we have to make a copy of the view
anyway, so the inherent "viewness" is already going to be lost when
moving the data to the device.
diff --git a/numba_cuda/numba/cuda/target.py b/numba_cuda/numba/cuda/target.py
@@ -7,6 +7,7 @@
 from llvmlite import ir
 import warnings
 import importlib.util
+import numpy as np
 
 from numba.cuda import types
 from numba.cuda import HAS_NUMBA
@@ -280,6 +281,14 @@ def make_constant_array(self, builder, aryty, arr):
         addrspace.
         """
 
+        # Ensure we have a contiguous buffer with non-negative strides. Views with
+        # negative strides, or non-contiguous layouts, must be materialized so
+        # that the constant bytes and the data pointer/strides are consistent.
+        if any(s < 0 for s in arr.strides) or not (
+            arr.flags.c_contiguous or arr.flags.f_contiguous
+        ):
+            arr = np.ascontiguousarray(arr)
+
         lmod = builder.module
 
         constvals = [
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py b/numba_cuda/numba/cuda/tests/cudapy/test_array_methods.py
@@ -5,6 +5,7 @@
 from numba import cuda
 from numba.cuda.testing import CUDATestCase
 import unittest
+from numba.cuda import config
 
 
 def reinterpret_array_type(byte_arr, start, stop, output):
@@ -14,6 +15,15 @@ def reinterpret_array_type(byte_arr, start, stop, output):
 
 
 class TestCudaArrayMethods(CUDATestCase):
+    def setUp(self):
+        self.old_nrt_setting = config.CUDA_ENABLE_NRT
+        config.CUDA_ENABLE_NRT = True
+        super(TestCudaArrayMethods, self).setUp()
+
+    def tearDown(self):
+        config.CUDA_ENABLE_NRT = self.old_nrt_setting
+        super(TestCudaArrayMethods, self).tearDown()
+
     def test_reinterpret_array_type(self):
         """
         Reinterpret byte array as int32 in the GPU.
@@ -33,6 +43,21 @@ def test_reinterpret_array_type(self):
             got = output[0]
             self.assertEqual(expect, got)
 
+    def test_array_copy(self):
+        val = np.array([1, 2, 3])[::-1]
+
+        @cuda.jit
+        def kernel(out):
+            q = val.copy()
+            for i in range(len(out)):
+                out[i] = q[i]
+
+        out = cuda.to_device(np.zeros(len(val), dtype="float64"))
+
+        kernel[1, 1](out)
+        for i, j in zip(out.copy_to_host(), val):
+            self.assertEqual(i, j)
+
 
 if __name__ == "__main__":
     unittest.main()