Merge 'main' into vk/target_extension

VijayKandiah · VijayKandiah · commit 798dfdef7473 · 2025-10-27T08:55:04.000-07:00
diff --git a/docs/source/user/cudapysupported.rst b/docs/source/user/cudapysupported.rst
@@ -214,6 +214,7 @@ The following functions from the :mod:`math` module are supported:
 * :func:`math.erf`
 * :func:`math.erfc`
 * :func:`math.exp`
+* :func:`math.exp2` (requires Python 3.11 or later)
 * :func:`math.expm1`
 * :func:`math.fabs`
 * :func:`math.frexp`
diff --git a/numba_cuda/numba/cuda/bf16.py b/numba_cuda/numba/cuda/bf16.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
+import sys
 
 from numba.cuda._internal.cuda_bf16 import (
     typing_registry,
@@ -191,14 +192,12 @@ def exp_ol(a):
     return _make_unary(a, hexp)
 
 
-try:
-    from math import exp2
+if sys.version_info >= (3, 11):
 
-    @overload(exp2, target="cuda")
+    @overload(math.exp2, target="cuda")
     def exp2_ol(a):
         return _make_unary(a, hexp2)
-except ImportError:
-    pass
+
 
 ## Public aliases using Numba/Numpy-style type names
 # Floating-point
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -43,6 +43,7 @@
 import importlib
 import numpy as np
 from collections import namedtuple, deque
+from uuid import UUID
 
 
 from numba.cuda.cext import mviewbuf
@@ -536,11 +537,10 @@ def from_identity(self, identity):
             if d.get_device_identity() == identity:
                 return d
         else:
-            errmsg = (
-                "No device of {} is found. "
+            raise RuntimeError(
+                f"No device of {identity} is found. "
                 "Target device may not be visible in this process."
-            ).format(identity)
-            raise RuntimeError(errmsg)
+            )
 
     def __init__(self, devnum):
         result = driver.cuDeviceGet(devnum)
@@ -551,8 +551,6 @@ def __init__(self, devnum):
         if devnum != got_devnum:
             raise RuntimeError(msg)
 
-        self.attributes = {}
-
         # Read compute capability
         self.compute_capability = (
             self.COMPUTE_CAPABILITY_MAJOR,
@@ -562,20 +560,13 @@ def __init__(self, devnum):
         # Read name
         bufsz = 128
         buf = driver.cuDeviceGetName(bufsz, self.id)
-        name = buf.split(b"\x00")[0]
+        name = buf.split(b"\x00", 1)[0]
 
         self.name = name
 
         # Read UUID
         uuid = driver.cuDeviceGetUuid(self.id)
-        uuid_vals = tuple(uuid.bytes)
-
-        b = "%02x"
-        b2 = b * 2
-        b4 = b * 4
-        b6 = b * 6
-        fmt = f"GPU-{b4}-{b2}-{b2}-{b2}-{b6}"
-        self.uuid = fmt % uuid_vals
+        self.uuid = f"GPU-{UUID(bytes=uuid.bytes)}"
 
         self.primary_context = None
 
@@ -587,7 +578,7 @@ def get_device_identity(self):
         }
 
     def __repr__(self):
-        return "<CUDA device %d '%s'>" % (self.id, self.name)
+        return f"<CUDA device {self.id:d} '{self.name}'>"
 
     def __getattr__(self, attr):
         """Read attributes lazily"""
@@ -603,9 +594,7 @@ def __hash__(self):
         return hash(self.id)
 
     def __eq__(self, other):
-        if isinstance(other, Device):
-            return self.id == other.id
-        return False
+        return isinstance(other, Device) and self.id == other.id
 
     def __ne__(self, other):
         return not (self == other)
@@ -615,8 +604,8 @@ def get_primary_context(self):
         Returns the primary context for the device.
         Note: it is not pushed to the CPU thread.
         """
-        if self.primary_context is not None:
-            return self.primary_context
+        if (ctx := self.primary_context) is not None:
+            return ctx
 
         met_requirement_for_device(self)
         # create primary context
@@ -637,8 +626,8 @@ def release_primary_context(self):
 
     def reset(self):
         try:
-            if self.primary_context is not None:
-                self.primary_context.reset()
+            if (ctx := self.primary_context) is not None:
+                ctx.reset()
             self.release_primary_context()
         finally:
             # reset at the driver level
diff --git a/numba_cuda/numba/cuda/cudamath.py b/numba_cuda/numba/cuda/cudamath.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import math
 from numba.cuda import types
 from numba.cuda.typing.templates import ConcreteTemplate, signature, Registry
@@ -58,6 +59,10 @@ class Math_unary_with_fp16(ConcreteTemplate):
     ]
 
 
+if sys.version_info >= (3, 11):
+    Math_unary_with_fp16 = infer_global(math.exp2)(Math_unary_with_fp16)
+
+
 @infer_global(math.atan2)
 class Math_atan2(ConcreteTemplate):
     key = math.atan2
diff --git a/numba_cuda/numba/cuda/fp16.py b/numba_cuda/numba/cuda/fp16.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import numba.cuda.types as types
 from numba.cuda._internal.cuda_fp16 import (
     typing_registry,
@@ -190,6 +191,13 @@ def exp_ol(a):
     return _make_unary(a, hexp)
 
 
+if sys.version_info >= (3, 11):
+
+    @overload(math.exp2, target="cuda")
+    def exp2_ol(a):
+        return _make_unary(a, hexp2)
+
+
 @overload(math.tanh, target="cuda")
 def tanh_ol(a):
     return _make_unary(a, htanh)
diff --git a/numba_cuda/numba/cuda/mathimpl.py b/numba_cuda/numba/cuda/mathimpl.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import math
 import operator
 from llvmlite import ir
@@ -25,6 +26,8 @@
 unarys += [("floor", "floorf", math.floor)]
 unarys += [("fabs", "fabsf", math.fabs)]
 unarys += [("exp", "expf", math.exp)]
+if sys.version_info >= (3, 11):
+    unarys += [("exp2", "exp2f", math.exp2)]
 unarys += [("expm1", "expm1f", math.expm1)]
 unarys += [("erf", "erff", math.erf)]
 unarys += [("erfc", "erfcf", math.erfc)]
@@ -330,6 +333,7 @@ def tanhf_impl_fastmath():
 impl_unary_int(math.tanh, int64, libdevice.tanh)
 impl_unary_int(math.tanh, uint64, libdevice.tanh)
 
+
 # Complex power implementations - translations of _Py_c_pow from CPython
 # https://github.com/python/cpython/blob/a755410e054e1e2390de5830befc08fe80706c66/Objects/complexobject.c#L123-L151
 #
diff --git a/numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py b/numba_cuda/numba/cuda/tests/benchmarks/test_kernel_launch.py
@@ -23,6 +23,10 @@
             ),
             id="torch",
         ),
+        param(
+            lambda: pytest.importorskip("cupy").empty(128, dtype=np.float32),
+            id="cupy",
+        ),
     ],
 )
 def test_one_arg(benchmark, array_func):
@@ -58,6 +62,13 @@ def bench(func, arr):
             ],
             id="torch",
         ),
+        param(
+            lambda: [
+                pytest.importorskip("cupy").empty(128, dtype=np.float32)
+                for _ in range(len(string.ascii_lowercase))
+            ],
+            id="cupy",
+        ),
     ],
 )
 def test_many_args(benchmark, array_func):
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py b/numba_cuda/numba/cuda/tests/cudapy/test_bfloat16.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import numpy as np
 from ml_dtypes import bfloat16 as mldtypes_bf16
 from numba import cuda
@@ -134,12 +135,8 @@ def test_math_bindings(self):
         self.skip_unsupported()
 
         exp_functions = [math.exp]
-        try:
-            from math import exp2
-
-            exp_functions += [exp2]
-        except ImportError:
-            pass
+        if sys.version_info >= (3, 11):
+            exp_functions += [math.exp2]
 
         functions = [
             math.trunc,
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py b/numba_cuda/numba/cuda/tests/cudapy/test_fastmath.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 from typing import List
 from dataclasses import dataclass, field
 from numba import cuda
@@ -142,6 +143,19 @@ def test_expf(self):
             ),
         )
 
+    @unittest.skipUnless(sys.version_info >= (3, 11), "Python 3.11+ required")
+    def test_exp2f(self):
+        from math import exp2
+
+        self._test_fast_math_unary(
+            exp2,
+            FastMathCriterion(
+                fast_expected=["ex2.approx.ftz.f32 "],
+                prec_expected=["ex2.approx.f32 "],
+                prec_unexpected=["ex2.approx.ftz.f32 "],
+            ),
+        )
+
     def test_logf(self):
         # Look for constant used to convert from log base 2 to log base e
         self._test_fast_math_unary(
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_math.py b/numba_cuda/numba/cuda/tests/cudapy/test_math.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import numpy as np
 from numba.cuda.testing import (
     skip_unless_cc_53,
@@ -84,6 +85,11 @@ def math_exp(A, B):
     B[i] = math.exp(A[i])
 
 
+def math_exp2(A, B):
+    i = cuda.grid(1)
+    B[i] = math.exp2(A[i])
+
+
 def math_erf(A, B):
     i = cuda.grid(1)
     B[i] = math.erf(A[i])
@@ -401,6 +407,8 @@ def test_math_fp16(self):
         self.unary_template_float16(math_sqrt, np.sqrt)
         self.unary_template_float16(math_ceil, np.ceil)
         self.unary_template_float16(math_floor, np.floor)
+        if sys.version_info >= (3, 11):
+            self.unary_template_float16(math_exp2, np.exp2)
 
     @skip_on_cudasim("numpy does not support trunc for float16")
     @skip_unless_cc_53
@@ -496,6 +504,16 @@ def test_math_exp(self):
         self.unary_template_int64(math_exp, np.exp)
         self.unary_template_uint64(math_exp, np.exp)
 
+    # ---------------------------------------------------------------------------
+    # test_math_exp2
+
+    @unittest.skipUnless(sys.version_info >= (3, 11), "Python 3.11+ required")
+    def test_math_exp2(self):
+        self.unary_template_float32(math_exp2, np.exp2)
+        self.unary_template_float64(math_exp2, np.exp2)
+        self.unary_template_int64(math_exp2, np.exp2)
+        self.unary_template_uint64(math_exp2, np.exp2)
+
     # ---------------------------------------------------------------------------
     # test_math_expm1
 
diff --git a/numba_cuda/numba/cuda/typing/mathdecl.py b/numba_cuda/numba/cuda/typing/mathdecl.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-2-Clause
 
+import sys
 import math
 from numba.cuda import types
 from numba.cuda.typing.templates import ConcreteTemplate, signature, Registry
@@ -44,6 +45,10 @@ class Math_unary(ConcreteTemplate):
     ]
 
 
+if sys.version_info >= (3, 11):
+    Math_unary = infer_global(math.exp2)(Math_unary)
+
+
 @infer_global(math.atan2)
 class Math_atan2(ConcreteTemplate):
     cases = [
diff --git a/pixi.lock b/pixi.lock
diff --git a/pyproject.toml b/pyproject.toml