
Commit aeceaef

Backporting Features to v0.20, bump to v0.20.1 (#562)
- Relax the cuda-core pin to let it float across minor releases (#559) (645e46c)
- Bump version to 0.20.1 (2e58567)
- [test] Use numpy's tolerance for float16 (#491) (8bb46bc)

Co-authored-by: Michael Wang <[email protected]>
Co-authored-by: Asher Mancinelli <[email protected]>
1 parent: ed5d280

File tree

- ci/test_wheel.sh
- conda/recipes/numba-cuda/meta.yaml
- numba_cuda/VERSION
- numba_cuda/numba/cuda/testing.py
- numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py
- pyproject.toml

6 files changed: 31 additions, 26 deletions


ci/test_wheel.sh

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ echo "Package path: ${package}"
 DEPENDENCIES=(
     "${package}[test]"
     "cuda-python==${CUDA_VER_MAJOR_MINOR%.*}.*"
-    "cuda-core==0.3.*"
+    "cuda-core>=0.3.0,<1.0.0"
 )

 # Constrain oldest supported dependencies for testing

conda/recipes/numba-cuda/meta.yaml

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ requirements:
     - python
     - numba >=0.59.1
     - cuda-bindings >=12.9.1
-    - cuda-core ==0.3.*
+    - cuda-core >=0.3.0,<1.0.0

 about:
   home: {{ project_urls["Homepage"] }}
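Both the CI script above and the conda recipe replace the exact cuda-core pin (==0.3.*) with a range that floats across minor releases while staying below 1.0. A minimal sketch of what the two specifiers admit, assuming the packaging library (the PEP 440 implementation used by pip); the version numbers are illustrative:

# Minimal sketch, assuming the `packaging` library, of what the relaxed
# cuda-core constraint admits compared to the old pin.
from packaging.specifiers import SpecifierSet

old_pin = SpecifierSet("==0.3.*")           # old: only 0.3.x releases
new_pin = SpecifierSet(">=0.3.0,<1.0.0")    # new: floats across minor releases

for version in ("0.3.0", "0.3.2", "0.4.0", "1.0.0"):
    print(version, version in old_pin, version in new_pin)
# 0.3.0 True True
# 0.3.2 True True
# 0.4.0 False True   <- minor releases are now allowed
# 1.0.0 False False  <- still capped below 1.0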

numba_cuda/VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-0.20.0
+0.20.1

numba_cuda/numba/cuda/testing.py

Lines changed: 3 additions & 0 deletions

@@ -17,6 +17,7 @@
 from typing import Iterable, Union
 from io import StringIO
 import unittest
+import numpy as np

 if PYVERSION >= (3, 10):
     from filecheck.matcher import Matcher
@@ -44,6 +45,8 @@ class CUDATestCase(TestCase):
     matches FileCheck checks, and is not specific to CUDADispatcher.
     """

+    FLOAT16_RTOL = np.finfo(np.float16).eps
+
     def setUp(self):
         self._low_occupancy_warnings = config.CUDA_LOW_OCCUPANCY_WARNINGS
         self._warn_on_implicit_copy = config.CUDA_WARN_ON_IMPLICIT_COPY
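The new FLOAT16_RTOL class attribute ties the comparison tolerance to half-precision machine epsilon. A short sketch of what it evaluates to and how a test can use it (host-only, hypothetical values):

# What FLOAT16_RTOL evaluates to: machine epsilon for IEEE 754 half
# precision, i.e. 2**-10.
import numpy as np

FLOAT16_RTOL = np.finfo(np.float16).eps
print(FLOAT16_RTOL)          # 0.000977

# Hypothetical usage mirroring the updated tests: compare a half-precision
# result against a reference with a tolerance matched to the dtype instead
# of assert_allclose's much tighter default rtol of 1e-07.
result = np.float16(3.0) + np.float16(4.0)
np.testing.assert_allclose(result, 7.0, rtol=FLOAT16_RTOL)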

numba_cuda/numba/cuda/tests/cudapy/test_intrinsics.py

Lines changed: 23 additions & 21 deletions

@@ -629,7 +629,7 @@ def test_hadd(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 + arg2)
+        np.testing.assert_allclose(ary[0], arg1 + arg2, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hadd_scalar(self):
@@ -639,7 +639,7 @@ def test_hadd_scalar(self):
         arg2 = np.float16(3.0)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 + arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -657,7 +657,9 @@ def test_hfma(self):
         arg2 = np.array([3.0], dtype=np.float16)
         arg3 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2, arg3)
-        np.testing.assert_allclose(ary[0], arg1 * arg2 + arg3)
+        np.testing.assert_allclose(
+            ary[0], arg1 * arg2 + arg3, rtol=self.FLOAT16_RTOL
+        )

     @skip_unless_cc_53
     def test_hfma_scalar(self):
@@ -668,7 +670,7 @@ def test_hfma_scalar(self):
         arg3 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2, arg3)
         ref = arg1 * arg2 + arg3
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -687,7 +689,7 @@ def test_hsub(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 - arg2)
+        np.testing.assert_allclose(ary[0], arg1 - arg2, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hsub_scalar(self):
@@ -697,7 +699,7 @@ def test_hsub_scalar(self):
         arg2 = np.float16(1.57)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 - arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -714,7 +716,7 @@ def test_hmul(self):
         arg1 = np.array([3.0], dtype=np.float16)
         arg2 = np.array([4.0], dtype=np.float16)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1 * arg2)
+        np.testing.assert_allclose(ary[0], arg1 * arg2, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hmul_scalar(self):
@@ -724,7 +726,7 @@ def test_hmul_scalar(self):
         arg2 = np.float16(1.57)
         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 * arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -743,7 +745,7 @@ def test_hdiv_scalar(self):

         compiled[1, 1](ary, arg1, arg2)
         ref = arg1 / arg2
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hdiv(self):
@@ -754,15 +756,15 @@ def test_hdiv(self):

         compiled.forall(ary.size)(ary, arry1, arry2)
         ref = arry1 / arry2
-        np.testing.assert_allclose(ary, ref)
+        np.testing.assert_allclose(ary, ref, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hneg(self):
         compiled = cuda.jit("void(f2[:], f2[:])")(simple_hneg)
         ary = np.zeros(1, dtype=np.float16)
         arg1 = np.array([3.0], dtype=np.float16)
         compiled[1, 1](ary, arg1)
-        np.testing.assert_allclose(ary[0], -arg1)
+        np.testing.assert_allclose(ary[0], -arg1, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hneg_scalar(self):
@@ -771,7 +773,7 @@ def test_hneg_scalar(self):
         arg1 = np.float16(3.1415926)
         compiled[1, 1](ary, arg1)
         ref = -arg1
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -787,7 +789,7 @@ def test_habs(self):
         ary = np.zeros(1, dtype=np.float16)
         arg1 = np.array([-3.0], dtype=np.float16)
         compiled[1, 1](ary, arg1)
-        np.testing.assert_allclose(ary[0], abs(arg1))
+        np.testing.assert_allclose(ary[0], abs(arg1), rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_habs_scalar(self):
@@ -796,7 +798,7 @@ def test_habs_scalar(self):
         arg1 = np.float16(-3.1415926)
         compiled[1, 1](ary, arg1)
         ref = abs(arg1)
-        np.testing.assert_allclose(ary[0], ref)
+        np.testing.assert_allclose(ary[0], ref, rtol=self.FLOAT16_RTOL)

     @skip_on_cudasim("Compilation unsupported in the simulator")
     @skip_if_nvjitlink_missing("Numbast generated bindings")
@@ -849,15 +851,15 @@ def test_fp16_intrinsics_common(self):
                 kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                 kernel[1, N](r, x)
                 expected = fn(x, dtype=np.float16)
-                np.testing.assert_allclose(r, expected)
+                np.testing.assert_allclose(r, expected, rtol=self.FLOAT16_RTOL)

         x2 = np.random.randint(1, 10, size=N).astype(np.float16)
         for kernel, fn in zip(exp_kernels, expected_exp_functions):
             with self.subTest(fn=fn):
                 kernel = cuda.jit("void(f2[:], f2[:])")(kernel)
                 kernel[1, N](r, x2)
                 expected = fn(x2, dtype=np.float16)
-                np.testing.assert_allclose(r, expected)
+                np.testing.assert_allclose(r, expected, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hexp10(self):
@@ -876,7 +878,7 @@ def hexp10_vectors(r, x):

         # Run the kernel
         hexp10_vectors[1, N](r, x)
-        np.testing.assert_allclose(r, 10**x)
+        np.testing.assert_allclose(r, 10**x, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_fp16_comparison(self):
@@ -948,10 +950,10 @@ def test_hmax(self):
         arg1 = np.float16(3.0)
         arg2 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg2)
+        np.testing.assert_allclose(ary[0], arg2, rtol=self.FLOAT16_RTOL)
         arg1 = np.float16(5.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1)
+        np.testing.assert_allclose(ary[0], arg1, rtol=self.FLOAT16_RTOL)

     @skip_unless_cc_53
     def test_hmin(self):
@@ -960,10 +962,10 @@ def test_hmin(self):
         arg1 = np.float16(3.0)
         arg2 = np.float16(4.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg1)
+        np.testing.assert_allclose(ary[0], arg1, rtol=self.FLOAT16_RTOL)
         arg1 = np.float16(5.0)
         compiled[1, 1](ary, arg1, arg2)
-        np.testing.assert_allclose(ary[0], arg2)
+        np.testing.assert_allclose(ary[0], arg2, rtol=self.FLOAT16_RTOL)

     def test_cbrt_f32(self):
         compiled = cuda.jit("void(float32[:], float32)")(simple_cbrt)

pyproject.toml

Lines changed: 2 additions & 2 deletions

@@ -25,7 +25,7 @@ dependencies = ["numba>=0.60.0"]
 [project.optional-dependencies]
 cu12 = [
     "cuda-bindings>=12.9.1,<13.0.0",
-    "cuda-core==0.3.*",
+    "cuda-core>=0.3.0,<1.0.0",
     "cuda-python==12.9.*", # supports all CTK 12.x
     "nvidia-cuda-nvcc-cu12", # for libNVVM
     "nvidia-cuda-runtime-cu12",
@@ -36,7 +36,7 @@ cu12 = [
 # TODO: Use cuda-toolkit package dependencies - e.g. cuda-toolkit[curand,nvvm,nvrtc]=13.*
 cu13 = [
     "cuda-bindings==13.*",
-    "cuda-core==0.3.2,<0.4.0dev0",
+    "cuda-core>=0.3.2,<1.0.0",
     "cuda-python==13.*",
     "nvidia-nvvm==13.*",
     "nvidia-cuda-runtime==13.*",
