
Commit fea9f79

Add support for pre-kernel-launch callbacks to launch config.
This is required by cuda.coop in order to pass two-phase primitive instances as kernel parameters without having to call the @cuda.jit decorator with extensions=[...] up-front.
1 parent: 6a88b46

File tree

numba_cuda/numba/cuda/dispatcher.py
numba_cuda/numba/cuda/launchconfig.py

2 files changed: +35 −7 lines changed
numba_cuda/numba/cuda/dispatcher.py

Lines changed: 4 additions & 1 deletion
@@ -1024,12 +1024,15 @@ def call(self, args, griddim, blockdim, stream, sharedmem):
             blockdim=blockdim,
             stream=stream,
             sharedmem=sharedmem,
-        ):
+        ) as launch_config:
             if self.specialized:
                 kernel = next(iter(self.overloads.values()))
             else:
                 kernel = _dispatcher.Dispatcher._cuda_call(self, *args)
 
+            for callback in launch_config.pre_launch_callbacks:
+                callback(kernel, launch_config)
+
             kernel.launch(args, griddim, blockdim, stream, sharedmem)
 
     def _compile_for_args(self, *args, **kws):
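
From a caller's perspective, a pre-launch callback is just a callable taking the resolved kernel and the active launch config. Below is a minimal sketch of one way to build such a callback; it is not part of the commit, `make_extension_callback` and `extension` are hypothetical names used only for illustration, and it assumes that _Kernel keeps its lowering extensions in a mutable `extensions` list attribute.

def make_extension_callback(extension):
    """Return a pre-launch callback that appends `extension` to the kernel.

    Hypothetical helper for illustration; `extension` stands for whatever
    lowering-extension object should be active for this launch.
    """

    def callback(kernel, launch_config):
        # Runs inside CUDADispatcher.call() after the kernel has been
        # resolved and immediately before kernel.launch(...).
        # Assumes _Kernel exposes its extensions as a mutable list.
        if extension not in kernel.extensions:
            kernel.extensions.append(extension)

    return callback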

numba_cuda/numba/cuda/launchconfig.py

Lines changed: 31 additions & 6 deletions
@@ -3,10 +3,17 @@
 from dataclasses import dataclass
 from contextvars import ContextVar
 from contextlib import contextmanager
-from typing import Any, Tuple, Optional, TYPE_CHECKING
+from typing import (
+    Any,
+    Callable,
+    List,
+    Tuple,
+    Optional,
+    TYPE_CHECKING,
+)
 
 if TYPE_CHECKING:
-    from numba.cuda.dispatcher import CUDADispatcher
+    from numba.cuda.dispatcher import CUDADispatcher, _Kernel
 
 
 @dataclass(frozen=True, slots=True)
@@ -22,14 +29,24 @@ class LaunchConfig:
     blockdim: Tuple[int, int, int]
     stream: Any
     sharedmem: int
+    pre_launch_callbacks: List[Callable[["_Kernel", "LaunchConfig"], None]]
+    """
+    List of functions to call before launching a kernel. The functions are
+    called with the kernel and the launch config as arguments. This enables
+    just-in-time modifications to the kernel's configuration prior to launch,
+    such as appending extensions for dynamic types that were created after the
+    @cuda.jit decorator appeared (i.e. as part of rewriting).
+    """
 
     def __str__(self) -> str:
         a = ", ".join(map(str, self.args))
         g = "×".join(map(str, self.griddim))
         b = "×".join(map(str, self.blockdim))
+        cb = ", ".join(map(str, self.pre_launch_callbacks))
         return (
             f"<LaunchConfig args=[{a}], grid={g}, block={b}, "
-            f"stream={self.stream}, smem={self.sharedmem}B>"
+            f"stream={self.stream}, smem={self.sharedmem}B, "
+            f"pre_launch_callbacks=[{cb}]>"
         )
 
 
@@ -72,10 +89,18 @@ def launch_config_ctx(
     Install a LaunchConfig for the dynamic extent of the with-block.
     The previous value (if any) is restored automatically.
     """
-    token = _launch_config_var.set(
-        LaunchConfig(dispatcher, args, griddim, blockdim, stream, sharedmem)
+    pre_launch_callbacks = []
+    launch_config = LaunchConfig(
+        dispatcher,
+        args,
+        griddim,
+        blockdim,
+        stream,
+        sharedmem,
+        pre_launch_callbacks,
     )
+    token = _launch_config_var.set(launch_config)
     try:
-        yield
+        yield launch_config
     finally:
         _launch_config_var.reset(token)
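
To show how the two changes fit together, here is a self-contained stand-in (plain Python, no numba or GPU required) that mirrors the mechanism: a context manager installs a config object carrying a pre_launch_callbacks list and yields it, and a dispatcher-like function invokes every callback with the resolved kernel and the config just before launching. All Fake* names are placeholders for illustration, not the real numba-cuda API.

from contextlib import contextmanager
from contextvars import ContextVar
from dataclasses import dataclass, field
from typing import Any, Callable, List, Tuple


@dataclass
class FakeKernel:
    # Stand-in for _Kernel; real kernels carry compiled code and extensions.
    extensions: List[Any] = field(default_factory=list)

    def launch(self, args):
        print(f"launch(args={args}) with extensions={self.extensions}")


@dataclass
class FakeLaunchConfig:
    # Stand-in for LaunchConfig, reduced to the fields the sketch needs.
    args: Tuple[Any, ...]
    pre_launch_callbacks: List[
        Callable[[FakeKernel, "FakeLaunchConfig"], None]
    ] = field(default_factory=list)


_cfg_var: ContextVar[FakeLaunchConfig] = ContextVar("fake_launch_config")


@contextmanager
def fake_launch_config_ctx(args):
    # Mirrors launch_config_ctx: install the config and yield it so code
    # running inside the with-block can append pre-launch callbacks.
    cfg = FakeLaunchConfig(args)
    token = _cfg_var.set(cfg)
    try:
        yield cfg
    finally:
        _cfg_var.reset(token)


def dispatch(args):
    # Mirrors CUDADispatcher.call(): resolve the kernel, run callbacks, launch.
    with fake_launch_config_ctx(args) as launch_config:
        kernel = FakeKernel()
        # Anything executing inside the with-block can register work to do
        # just before launch, e.g. attaching a late-bound extension.
        launch_config.pre_launch_callbacks.append(
            lambda k, cfg: k.extensions.append("late-bound extension")
        )
        for callback in launch_config.pre_launch_callbacks:
            callback(kernel, launch_config)
        kernel.launch(args)


dispatch((1, 2, 3))
# -> launch(args=(1, 2, 3)) with extensions=['late-bound extension']

Yielding the launch config from the context manager is the key design point: the dispatcher, and anything that runs inside the with-block (for example compilation triggered while resolving the kernel), can append callbacks without extra plumbing, and the dispatcher drains the list immediately before kernel.launch().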
