From a219b8869512852be93de48fc8c33bd513134003 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Wed, 26 Nov 2025 22:04:00 +0000 Subject: [PATCH] Revert "perf: remove context threading in various pointer abstractions (#536)" This reverts commit 9a56516cdaf5fe216a3db3db896974e084b4fa23. This changed the public API of `MemoryPointer` and related classes, and the context that they held was used by Arrow (see https://github.com/apache/arrow/pull/48259#issuecomment-3580532978): > Numba interop tests fail with: ``` arrow-dev/lib/python3.12/site-packages/pyarrow/tests/test_cuda_numba_interop.py:233: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > ??? E TypeError: MemoryPointer.__init__() got multiple values for argument 'pointer' ``` This commit reverts the change, as it was intended to improve performance without changing functionality, but has had a functional change as a side effect. Following the merge of this PR, we should be able to remove some of the `@require_context` decorators with some more targeted changes. --- numba_cuda/numba/cuda/api.py | 5 +- numba_cuda/numba/cuda/cudadrv/devicearray.py | 4 +- numba_cuda/numba/cuda/cudadrv/driver.py | 52 ++++++++++++------- .../cuda/tests/cudadrv/test_cuda_memory.py | 8 ++- .../cuda/tests/cudadrv/test_emm_plugins.py | 4 +- 5 files changed, 50 insertions(+), 23 deletions(-) diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index caacfd7c8..81c338806 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -21,6 +21,7 @@ gpus = devices.gpus +@require_context def from_cuda_array_interface(desc, owner=None, sync=True): """Create a DeviceNDArray from a cuda-array-interface description. The ``owner`` is the owner of the underlying memory. 
@@ -47,7 +48,9 @@ def from_cuda_array_interface(desc, owner=None, sync=True): cudevptr_class = driver.binding.CUdeviceptr devptr = cudevptr_class(desc["data"][0]) - data = driver.MemoryPointer(devptr, size=size, owner=owner) + data = driver.MemoryPointer( + current_context(), devptr, size=size, owner=owner + ) stream_ptr = desc.get("stream", None) if stream_ptr is not None: stream = external_stream(stream_ptr) diff --git a/numba_cuda/numba/cuda/cudadrv/devicearray.py b/numba_cuda/numba/cuda/cudadrv/devicearray.py index 9897a3f04..f0be6aefd 100644 --- a/numba_cuda/numba/cuda/cudadrv/devicearray.py +++ b/numba_cuda/numba/cuda/cudadrv/devicearray.py @@ -108,7 +108,9 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None): else: # Make NULL pointer for empty allocation null = _driver.binding.CUdeviceptr(0) - gpu_data = _driver.MemoryPointer(pointer=null, size=0) + gpu_data = _driver.MemoryPointer( + context=devices.get_context(), pointer=null, size=0 + ) self.alloc_size = 0 self.gpu_data = gpu_data diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index ee9bac927..6f04acc27 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -814,13 +814,14 @@ def allocator(): alloc_key = pointer finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped) + ctx = weakref.proxy(self.context) if mapped: - mem = MappedMemory(pointer, size, finalizer=finalizer) + mem = MappedMemory(ctx, pointer, size, finalizer=finalizer) self.allocations[alloc_key] = mem return mem.own() else: - return PinnedMemory(pointer, size, finalizer=finalizer) + return PinnedMemory(ctx, pointer, size, finalizer=finalizer) def mempin(self, owner, pointer, size, mapped=False): """Implements the pinning of host memory. 
@@ -847,13 +848,18 @@ def allocator(): allocator() finalizer = _pin_finalizer(self, pointer, alloc_key, mapped) + ctx = weakref.proxy(self.context) if mapped: - mem = MappedMemory(pointer, size, owner=owner, finalizer=finalizer) + mem = MappedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) self.allocations[alloc_key] = mem return mem.own() else: - return PinnedMemory(pointer, size, owner=owner, finalizer=finalizer) + return PinnedMemory( + ctx, pointer, size, owner=owner, finalizer=finalizer + ) def memallocmanaged(self, size, attach_global): def allocator(): @@ -871,7 +877,8 @@ def allocator(): alloc_key = ptr finalizer = _alloc_finalizer(self, ptr, alloc_key, size) - mem = ManagedMemory(ptr, size, finalizer=finalizer) + ctx = weakref.proxy(self.context) + mem = ManagedMemory(ctx, ptr, size, finalizer=finalizer) self.allocations[alloc_key] = mem return mem.own() @@ -934,7 +941,8 @@ def allocator(): alloc_key = ptr finalizer = _alloc_finalizer(self, ptr, alloc_key, size) - mem = AutoFreePointer(ptr, size, finalizer=finalizer) + ctx = weakref.proxy(self.context) + mem = AutoFreePointer(ctx, ptr, size, finalizer=finalizer) self.allocations[alloc_key] = mem return mem.own() @@ -1265,7 +1273,9 @@ def open_ipc_handle(self, handle, size): dptr = driver.cuIpcOpenMemHandle(handle, flags) # wrap it - return MemoryPointer(pointer=dptr, size=size) + return MemoryPointer( + context=weakref.proxy(self), pointer=dptr, size=size + ) def enable_peer_access(self, peer_context, flags=0): """Enable peer access between the current context and the peer context""" @@ -1751,7 +1761,7 @@ def _rebuild(cls, handle_ary, size, source_info, offset): ) -class MemoryPointer: +class MemoryPointer(object): """A memory pointer that owns a buffer, with an optional finalizer. Memory pointers provide reference counting, and instances are initialized with a reference count of 1. 
@@ -1767,6 +1777,8 @@ class MemoryPointer: tie the buffer lifetime to the reference count, so that the buffer is freed when there are no more references. + :param context: The context in which the pointer was allocated. + :type context: Context :param pointer: The address of the buffer. :type pointer: ctypes.c_void_p :param size: The size of the allocation in bytes. @@ -1783,10 +1795,11 @@ class MemoryPointer: __cuda_memory__ = True - def __init__(self, pointer, size, owner=None, finalizer=None): + def __init__(self, context, pointer, size, owner=None, finalizer=None): if isinstance(pointer, ctypes.c_void_p): pointer = binding.CUdeviceptr(pointer.value) + self.context = context self.device_pointer = pointer self.size = size self._cuda_memsize_ = size @@ -1842,7 +1855,7 @@ def view(self, start, stop=None): pointer = binding.CUdeviceptr() ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr()) ctypes_ptr.value = base - view = MemoryPointer(pointer, size, owner=self.owner) + view = MemoryPointer(self.context, pointer, size, owner=self.owner) if isinstance(self.owner, (MemoryPointer, OwnedPointer)): # Owned by a numba-managed memory segment, take an owned reference @@ -1871,7 +1884,7 @@ class AutoFreePointer(MemoryPointer): def __init__(self, *args, **kwargs): super(AutoFreePointer, self).__init__(*args, **kwargs) - # Release the self reference to the buffer, so that the finalizer + # Release the self reference to the buffer, so that the finalizer # is invoked if all the derived pointers are gone. 
self.refct -= 1 @@ -1898,7 +1911,7 @@ class MappedMemory(AutoFreePointer): __cuda_memory__ = True - def __init__(self, pointer, size, owner=None, finalizer=None): + def __init__(self, context, pointer, size, owner=None, finalizer=None): self.owned = owner self.host_pointer = pointer @@ -1906,7 +1919,9 @@ def __init__(self, pointer, size, owner=None, finalizer=None): self._bufptr_ = self.host_pointer self.device_pointer = devptr - super(MappedMemory, self).__init__(devptr, size, finalizer=finalizer) + super(MappedMemory, self).__init__( + context, devptr, size, finalizer=finalizer + ) self.handle = self.host_pointer # For buffer interface @@ -1935,7 +1950,8 @@ class PinnedMemory(mviewbuf.MemAlloc): :type finalizer: function """ - def __init__(self, pointer, size, owner=None, finalizer=None): + def __init__(self, context, pointer, size, owner=None, finalizer=None): + self.context = context self.owned = owner self.size = size self.host_pointer = pointer @@ -1975,10 +1991,10 @@ class ManagedMemory(AutoFreePointer): __cuda_memory__ = True - def __init__(self, pointer, size, owner=None, finalizer=None): + def __init__(self, context, pointer, size, owner=None, finalizer=None): self.owned = owner devptr = pointer - super().__init__(devptr, size, finalizer=finalizer) + super().__init__(context, devptr, size, finalizer=finalizer) # For buffer interface self._buflen_ = self.size @@ -2302,7 +2318,7 @@ def get_global_symbol(self, name): driver.cuModuleGetGlobal( byref(ptr), byref(size), self.handle, name.encode("utf8") ) - return MemoryPointer(ptr, size), size.value + return MemoryPointer(self.context, ptr, size), size.value class CudaPythonModule(Module): @@ -2312,7 +2328,7 @@ def get_function(self, name): def get_global_symbol(self, name): ptr, size = driver.cuModuleGetGlobal(self.handle, name.encode("utf8")) - return MemoryPointer(ptr, size), size + return MemoryPointer(self.context, ptr, size), size FuncAttr = namedtuple( diff --git 
a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py index 12bfa4845..d6852c8c5 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_cuda_memory.py @@ -87,13 +87,17 @@ def dtor(): dtor_invoked[0] += 1 # Ensure finalizer is called when pointer is deleted - ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) self.assertEqual(dtor_invoked[0], 0) del ptr self.assertEqual(dtor_invoked[0], 1) # Ensure removing derived pointer doesn't call finalizer - ptr = driver.MemoryPointer(pointer=fake_ptr, size=40, finalizer=dtor) + ptr = driver.MemoryPointer( + context=self.context, pointer=fake_ptr, size=40, finalizer=dtor + ) owned = ptr.own() del owned self.assertEqual(dtor_invoked[0], 1) diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py index be07ce5bb..a41e13dcc 100644 --- a/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py +++ b/numba_cuda/numba/cuda/tests/cudadrv/test_emm_plugins.py @@ -3,6 +3,7 @@ import ctypes import numpy as np +import weakref from numba import cuda from numba.cuda.core import config @@ -57,9 +58,10 @@ def finalizer(): # We use an AutoFreePointer so that the finalizer will be run when # the reference count drops to zero. + ctx = weakref.proxy(self.context) ptr = ctypes.c_void_p(alloc_count) return cuda.cudadrv.driver.AutoFreePointer( - ptr, size, finalizer=finalizer + ctx, ptr, size, finalizer=finalizer ) def initialize(self):