diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2e9b132..974c0c70 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -121,8 +121,8 @@ jobs: run: | curl -L -O https://tiker.net/ci-support-v0 . ./ci-support-v0 - if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "e2p" ]]; then - DOWNSTREAM_PROJECT=https://github.com/isuruf/pytential.git@e2p + if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then + DOWNSTREAM_PROJECT=https://github.com/alexfikl/pytential.git@towards-array-context fi test_downstream "$DOWNSTREAM_PROJECT" diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 2249501e..46ddbbee 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -8,7 +8,8 @@ dependencies: - numpy - scipy - sympy -- pocl +# https://github.com/pocl/pocl/issues/2069 +- pocl<7 - pocl-cuda - islpy - pyopencl diff --git a/doc/conf.py b/doc/conf.py index 2e3b8ce2..21e6be08 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -26,6 +26,7 @@ nitpick_ignore_regex = [ ["py:class", r"symengine\.(.+)"], # :cry: + ["py:class", r"ToTagSetConvertible"], # :cry: ] sphinxconfig_missing_reference_aliases = { @@ -36,6 +37,7 @@ "np.complexfloating": "class:numpy.complexfloating", "np.inexact": "class:numpy.inexact", "np.dtype": "class:numpy.dtype", + "np.number": "class:numpy.number", # pytools "obj_array.ObjectArray1D": "obj:pytools.obj_array.ObjectArray1D", # sympy @@ -53,6 +55,7 @@ "CallInstruction": "class:loopy.kernel.instruction.CallInstruction", # arraycontext "Array": "obj:arraycontext.Array", + "ArrayContext": "class:arraycontext.ArrayContext", # boxtree "FMMTraversalInfo": "class:boxtree.traversal.FMMTraversalInfo", # sumpy diff --git a/examples/curve-pot.py b/examples/curve-pot.py index 10175851..a2669da4 100644 --- a/examples/curve-pot.py +++ b/examples/curve-pot.py @@ -91,7 +91,7 @@ def draw_pot_figure(aspect_ratio, knl_kwargs = {} vol_source_knl, vol_target_knl = process_kernel(knl, what_operator) - p2p = P2P(actx.context, + p2p = P2P( source_kernels=(vol_source_knl,), target_kernels=(vol_target_knl,), exclude_self=False, @@ -100,7 +100,7 @@ def draw_pot_figure(aspect_ratio, lpot_source_knl, lpot_target_knl = process_kernel(knl, what_operator_lpot) from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, + lpot = LayerPotential( expansion=expn_class(knl, order=order), source_kernels=(lpot_source_knl,), target_kernels=(lpot_target_knl,), @@ -184,7 +184,8 @@ def map_to_curve(t: NDArray[np.floating]): def apply_lpot(x: NDArray[np.inexact]) -> NDArray[np.inexact]: xovsmp = fim @ x - _evt, (y,) = lpot(actx.queue, + y, = lpot( + actx, sources, ovsmp_sources, actx.from_numpy(centers), @@ -209,7 +210,8 @@ def apply_lpot(x: NDArray[np.inexact]) -> NDArray[np.inexact]: density = np.cos(mode_nr*2*np.pi*native_t).astype(np.complex128) strength = actx.from_numpy(native_curve.speed * native_weights * density) - _evt, (vol_pot,) = p2p(actx.queue, + vol_pot, = p2p( + actx, targets, sources, [strength], **volpot_kwargs) @@ -219,7 +221,8 @@ def apply_lpot(x: NDArray[np.inexact]) -> NDArray[np.inexact]: ovsmp_strength = actx.from_numpy( ovsmp_curve.speed * ovsmp_weights * ovsmp_density) - _evt, (curve_pot,) = lpot(actx.queue, + curve_pot, = lpot( + actx, sources, ovsmp_sources, actx.from_numpy(centers), diff --git a/examples/expansion-toys.py b/examples/expansion-toys.py index b61aafcf..4d1483ae 100644 --- a/examples/expansion-toys.py +++ b/examples/expansion-toys.py @@ -21,7 +21,6 @@ def main(): actx = PyOpenCLArrayContext(queue) tctx = t.ToyContext( - actx.context, # LaplaceKernel(2), YukawaKernel(2), extra_kernel_kwargs={"lam": 5}, # HelmholtzKernel(2), extra_kernel_kwargs={"k": 0.3}, @@ -36,22 +35,22 @@ def main(): fp = FieldPlotter([3, 0], extent=8) if USE_MATPLOTLIB: - t.logplot(fp, pt_src, cmap="jet") + t.logplot(actx, fp, pt_src, cmap="jet") plt.colorbar() plt.show() - mexp = t.multipole_expand(pt_src, [0, 0], 5) - mexp2 = t.multipole_expand(mexp, [0, 0.25]) # noqa: F841 - lexp = t.local_expand(mexp, [3, 0]) - lexp2 = t.local_expand(lexp, [3, 1], 3) + mexp = t.multipole_expand(actx, pt_src, [0, 0], order=5) + mexp2 = t.multipole_expand(actx, mexp, [0, 0.25]) # noqa: F841 + lexp = t.local_expand(actx, mexp, [3, 0]) + lexp2 = t.local_expand(actx, lexp, [3, 1], order=3) # diff = mexp - pt_src # diff = mexp2 - pt_src diff = lexp2 - pt_src - print(t.l_inf(diff, 1.2, center=lexp2.center)) + print(t.l_inf(actx, diff, 1.2, center=lexp2.center)) if USE_MATPLOTLIB: - t.logplot(fp, diff, cmap="jet", vmin=-3, vmax=0) + t.logplot(actx, fp, diff, cmap="jet", vmin=-3, vmax=0) plt.colorbar() plt.show() diff --git a/requirements.txt b/requirements.txt index 57085a11..953ff21d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ git+https://github.com/inducer/pytools.git#egg=pytools git+https://github.com/inducer/pymbolic.git#egg=pymbolic git+https://github.com/inducer/islpy.git#egg=islpy git+https://github.com/inducer/pyopencl.git#egg=pyopencl -git+https://github.com/inducer/boxtree.git#egg=boxtree +git+https://github.com/alexfikl/boxtree.git@towards-array-context#egg=boxtree git+https://github.com/inducer/loopy.git#egg=loopy git+https://github.com/inducer/arraycontext.git#egg=arraycontext git+https://github.com/inducer/pyfmmlib.git#egg=pyfmmlib diff --git a/sumpy/__init__.py b/sumpy/__init__.py index 5a25a161..3acd93aa 100644 --- a/sumpy/__init__.py +++ b/sumpy/__init__.py @@ -66,8 +66,8 @@ ] -code_cache: WriteOncePersistentDict[Hashable, lp.TranslationUnit] = \ - WriteOncePersistentDict("sumpy-code-cache-v6-"+VERSION_TEXT, safe_sync=False) +code_cache: WriteOncePersistentDict[Hashable, lp.TranslationUnit] = ( + WriteOncePersistentDict(f"sumpy-code-cache-v8-{VERSION_TEXT}", safe_sync=False)) # {{{ optimization control @@ -87,8 +87,6 @@ def set_optimization_enabled(flag): # {{{ cache control -CACHING_ENABLED = True - CACHING_ENABLED = ( "SUMPY_NO_CACHE" not in os.environ and "CG_NO_CACHE" not in os.environ) diff --git a/sumpy/array_context.py b/sumpy/array_context.py index cc54048a..2341c1e5 100644 --- a/sumpy/array_context.py +++ b/sumpy/array_context.py @@ -23,36 +23,102 @@ THE SOFTWARE. """ +from typing import TYPE_CHECKING, Any + from boxtree.array_context import PyOpenCLArrayContext as PyOpenCLArrayContextBase +from typing_extensions import override +import loopy as lp from arraycontext.pytest import ( _PytestPyOpenCLArrayContextFactoryWithClass, register_pytest_array_context_factory, ) +if TYPE_CHECKING: + from collections.abc import Iterator + + from numpy.typing import DTypeLike + + from arraycontext import ArrayContext + from loopy import TranslationUnit + from loopy.codegen import PreambleInfo + from pytools.tag import ToTagSetConvertible + + __doc__ = """ Array Context ------------- +.. autofunction:: make_loopy_program .. autoclass:: PyOpenCLArrayContext """ # {{{ PyOpenCLArrayContext +def make_loopy_program( + domains, statements, + kernel_data: list[Any] | None = None, *, + name: str = "sumpy_loopy_kernel", + silenced_warnings: list[str] | str | None = None, + assumptions: str = "", + fixed_parameters: dict[str, Any] | None = None, + index_dtype: DTypeLike = None, + tags: ToTagSetConvertible = None): + """Return a :class:`loopy.LoopKernel` suitable for use with + :meth:`arraycontext.ArrayContext.call_loopy`. + """ + if kernel_data is None: + kernel_data = [...] + + if silenced_warnings is None: + silenced_warnings = [] + + import loopy as lp + from arraycontext.loopy import _DEFAULT_LOOPY_OPTIONS + + return lp.make_kernel( + domains, + statements, + kernel_data=kernel_data, + options=_DEFAULT_LOOPY_OPTIONS, + default_offset=lp.auto, + name=name, + lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, + assumptions=assumptions, + fixed_parameters=fixed_parameters, + silenced_warnings=silenced_warnings, + index_dtype=index_dtype, + tags=tags) + + +def _fp_contract_fast_preamble( + preamble_info: PreambleInfo + ) -> Iterator[tuple[str, str]]: + yield ("fp_contract_fast_pocl", "#pragma clang fp contract(fast)") + + class PyOpenCLArrayContext(PyOpenCLArrayContextBase): - def transform_loopy_program(self, t_unit): - default_ep = t_unit.default_entrypoint - options = default_ep.options + @override + def transform_loopy_program(self, t_unit: TranslationUnit): + import pyopencl as cl + device = self.queue.device + if (device.platform.name == "Portable Computing Language" + and (device.type & cl.device_type.GPU)): + t_unit = lp.register_preamble_generators( + t_unit, + [_fp_contract_fast_preamble]) + + return t_unit - if not (options.return_dict and options.no_numpy): - raise ValueError("Loopy kernel passed to call_loopy must " - "have return_dict and no_numpy options set. " - "Did you use arraycontext.make_loopy_program " - "to create this kernel?") - return super().transform_loopy_program(t_unit) +def is_cl_cpu(actx: ArrayContext) -> bool: + if not isinstance(actx, PyOpenCLArrayContext): + return False + + import pyopencl as cl + return all(dev.type & cl.device_type.CPU for dev in actx.context.devices) # }}} diff --git a/sumpy/codegen.py b/sumpy/codegen.py index e3e071a6..517c0ad8 100644 --- a/sumpy/codegen.py +++ b/sumpy/codegen.py @@ -47,8 +47,6 @@ from numpy.typing import DTypeLike - import pyopencl as cl - from loopy.codegen import PreambleInfo from loopy.target import TargetBase from loopy.translation_unit import CallablesInferenceContext from loopy.types import LoopyType @@ -249,26 +247,6 @@ def register_bessel_callables(loopy_knl: lp.TranslationUnit) -> lp.TranslationUn return loopy_knl - -def _fp_contract_fast_preamble( - preamble_info: PreambleInfo - ) -> Iterator[tuple[str, str]]: - yield ("fp_contract_fast_pocl", "#pragma clang fp contract(fast)") - - -def register_optimization_preambles( - loopy_knl: lp.TranslationUnit, device: cl.Device - ) -> lp.TranslationUnit: - if isinstance(loopy_knl.target, lp.PyOpenCLTarget): - import pyopencl as cl - if (device.platform.name == "Portable Computing Language" - and (device.type & cl.device_type.GPU)): - loopy_knl = lp.register_preamble_generators( - loopy_knl, - [_fp_contract_fast_preamble]) - - return loopy_knl - # }}} diff --git a/sumpy/distributed.py b/sumpy/distributed.py index df66f308..a7a5e194 100644 --- a/sumpy/distributed.py +++ b/sumpy/distributed.py @@ -23,66 +23,78 @@ THE SOFTWARE. """ -from boxtree.distributed.calculation import DistributedExpansionWrangler +from typing import TYPE_CHECKING + +from boxtree.distributed.calculation import DistributedExpansionWranglerMixin -import pyopencl.array as cl_array import pytools.obj_array as obj_array from sumpy.fmm import SumpyExpansionWrangler +if TYPE_CHECKING: + from arraycontext import ArrayContext + + class DistributedSumpyExpansionWrangler( - DistributedExpansionWrangler, SumpyExpansionWrangler): + DistributedExpansionWranglerMixin, SumpyExpansionWrangler): def __init__( - self, context, comm, tree_indep, local_traversal, global_traversal, + self, actx: ArrayContext, + comm, tree_indep, local_traversal, global_traversal, dtype, fmm_level_to_order, communicate_mpoles_via_allreduce=False, - **kwarg): - DistributedExpansionWrangler.__init__( - self, context, comm, global_traversal, True, - communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) + **kwargs): SumpyExpansionWrangler.__init__( - self, tree_indep, local_traversal, dtype, fmm_level_to_order, **kwarg) + self, tree_indep, local_traversal, dtype, fmm_level_to_order, + **kwargs) - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): - src_weight_vecs_host = [src_weight.get() for src_weight in src_weight_vecs] + self.comm = comm + self.traversal_in_device_memory = True + self.global_traversal = global_traversal + self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce + + def distribute_source_weights(self, + actx: ArrayContext, src_weight_vecs, src_idx_all_ranks): + src_weight_vecs_host = [ + actx.to_numpy(src_weight) for src_weight in src_weight_vecs + ] local_src_weight_vecs_host = super().distribute_source_weights( - src_weight_vecs_host, src_idx_all_ranks) + actx, src_weight_vecs_host, src_idx_all_ranks) local_src_weight_vecs_device = [ - cl_array.to_device(src_weight.queue, local_src_weight) - for local_src_weight, src_weight in - zip(local_src_weight_vecs_host, src_weight_vecs, strict=True)] + actx.from_numpy(local_src_weight) + for local_src_weight in local_src_weight_vecs_host] return local_src_weight_vecs_device - def gather_potential_results(self, potentials, tgt_idx_all_ranks): - mpi_rank = self.comm.Get_rank() - - potentials_host_vec = [potentials_dev.get() for potentials_dev in potentials] + def gather_potential_results(self, + actx: ArrayContext, potentials, tgt_idx_all_ranks): + potentials_host_vec = [ + actx.to_numpy(potentials_dev) for potentials_dev in potentials + ] gathered_potentials_host_vec = [] for potentials_host in potentials_host_vec: gathered_potentials_host_vec.append( - super().gather_potential_results(potentials_host, tgt_idx_all_ranks)) + super().gather_potential_results( + actx, potentials_host, tgt_idx_all_ranks)) - if mpi_rank == 0: + if self.is_mpi_root: return obj_array.new_1d([ - cl_array.to_device(potentials_dev.queue, gathered_potentials_host) - for gathered_potentials_host, potentials_dev in - zip(gathered_potentials_host_vec, potentials, strict=True)]) + actx.from_numpy(gathered_potentials_host) + for gathered_potentials_host in gathered_potentials_host_vec + ]) else: return None def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: - return source_array.with_queue(source_array.queue)[ - self.global_traversal.tree.user_source_ids] + if self.is_mpi_root: + return source_array[self.global_traversal.tree.user_source_ids] else: return source_array def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: import numpy as np assert ( @@ -96,8 +108,9 @@ def reorder(x): else: return None - def communicate_mpoles(self, mpole_exps, return_stats=False): - mpole_exps_host = mpole_exps.get() - stats = super().communicate_mpoles(mpole_exps_host, return_stats) + def communicate_mpoles(self, + actx: ArrayContext, mpole_exps, return_stats=False): + mpole_exps_host = actx.to_numpy(mpole_exps) + stats = super().communicate_mpoles(actx, mpole_exps_host, return_stats) mpole_exps[:] = mpole_exps_host return stats diff --git a/sumpy/e2e.py b/sumpy/e2e.py index 4f5162f0..34b888de 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -25,19 +25,24 @@ import logging from abc import ABC, abstractmethod +from typing import TYPE_CHECKING import numpy as np +from typing_extensions import override import loopy as lp -import pymbolic -from loopy.version import MOST_RECENT_LANGUAGE_VERSION +from loopy.version import MOST_RECENT_LANGUAGE_VERSION # noqa: F401 from pytools import memoize_method import sumpy.symbolic as sym -from sumpy.codegen import register_optimization_preambles +from sumpy.array_context import make_loopy_program from sumpy.tools import KernelCacheMixin, to_complex_dtype +if TYPE_CHECKING: + from arraycontext import ArrayContext + + logger = logging.getLogger(__name__) @@ -50,15 +55,13 @@ .. autoclass:: E2EFromCSR .. autoclass:: E2EFromParent .. autoclass:: E2EFromChildren - """ -# {{{ translation base class +# {{{ E2EBase: base class class E2EBase(KernelCacheMixin, ABC): - def __init__(self, ctx, src_expansion, tgt_expansion, - name=None, device=None): + def __init__(self, src_expansion, tgt_expansion, name=None): """ :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase` :arg strength_usage: A list of integers indicating which expression @@ -66,37 +69,25 @@ def __init__(self, ctx, src_expansion, tgt_expansion, number of strength arrays that need to be passed. Default: all kernels use the same strength. """ - - if device is None: - device = ctx.devices[0] + from sumpy.kernel import ( + SourceTransformationRemover, + TargetTransformationRemover, + ) + txr = TargetTransformationRemover() + sxr = SourceTransformationRemover() if src_expansion is tgt_expansion: - from sumpy.kernel import ( - SourceTransformationRemover, - TargetTransformationRemover, - ) - tgt_expansion = src_expansion = src_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(src_expansion.kernel))) - + tgt_expansion = src_expansion = ( + src_expansion.with_kernel(sxr(txr(src_expansion.kernel)))) else: + src_expansion = ( + src_expansion.with_kernel(sxr(txr(src_expansion.kernel)))) + tgt_expansion = ( + tgt_expansion.with_kernel(sxr(txr(tgt_expansion.kernel)))) - from sumpy.kernel import ( - SourceTransformationRemover, - TargetTransformationRemover, - ) - src_expansion = src_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(src_expansion.kernel))) - tgt_expansion = tgt_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(tgt_expansion.kernel))) - - self.context = ctx self.src_expansion = src_expansion self.tgt_expansion = tgt_expansion self.name = name or self.default_name - self.device = device if src_expansion.dim != tgt_expansion.dim: raise ValueError("source and target expansions must have " @@ -111,8 +102,8 @@ def default_name(self): @memoize_method def get_translation_loopy_insns(self): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + import sumpy.symbolic as sym + dvec = sym.make_sym_vector("d", self.dim) src_coeff_exprs = [ sym.Symbol(f"src_coeff{i}") @@ -141,11 +132,7 @@ def get_translation_loopy_insns(self): ) def get_cache_key(self): - return ( - type(self).__name__, - self.src_expansion, - self.tgt_expansion, - ) + return (type(self).__name__, self.src_expansion, self.tgt_expansion) @abstractmethod def get_kernel(self): @@ -155,32 +142,25 @@ def get_optimized_kernel(self): # FIXME knl = self.get_kernel() knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0") - knl = register_optimization_preambles(knl, self.device) return knl # }}} -# {{{ translation from "compressed sparse row"-like source box lists +# {{{ E2EFromCSR: translation from "compressed sparse row"-like source box lists class E2EFromCSR(E2EBase): """Implements translation from a "compressed sparse row"-like source box list. """ - def __init__(self, ctx, src_expansion, tgt_expansion, - name=None, device=None): - super().__init__(ctx, src_expansion, tgt_expansion, - name=name, device=device) - @property def default_name(self): return "e2e_from_csr" def get_translation_loopy_insns(self): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + dvec = sym.make_sym_vector("d", self.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") @@ -219,12 +199,11 @@ def get_kernel(self): # (same for itgt_box, tgt_ibox) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] @@ -253,7 +232,7 @@ def get_kernel(self): """ for coeffidx in range(ncoeff_tgt)] + [""" end """], - [ + kernel_data=[ lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), lp.ValueArg("src_rscale,tgt_rscale", None), lp.GlobalArg("src_box_starts, src_box_lists", @@ -273,9 +252,7 @@ def get_kernel(self): name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: @@ -287,15 +264,15 @@ def get_kernel(self): return loopy_knl + @override def get_optimized_kernel(self): # FIXME knl = self.get_kernel() knl = lp.split_iname(knl, "itgt_box", 64, outer_tag="g.0", inner_tag="l.0") - knl = register_optimization_preambles(knl, self.device) return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -310,13 +287,18 @@ def __call__(self, queue, **kwargs): src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) - knl = self.get_cached_kernel_executor() + knl = self.get_cached_kernel() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] +# }}} - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) +# {{{ M2LUsingTranslationClassesDependentData class M2LUsingTranslationClassesDependentData(E2EFromCSR): """Implements translation from a "compressed sparse row"-like source box @@ -328,8 +310,8 @@ def default_name(self): return "m2l_using_translation_classes_dependent_data" def get_translation_loopy_insns(self, result_dtype): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + import sumpy.symbolic as sym + dvec = sym.make_sym_vector("d", self.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") @@ -387,11 +369,13 @@ def get_inner_loopy_kernel(self, result_dtype): ncoeff_src = len(self.src_expansion) ncoeff_tgt = len(self.tgt_expansion) + import pymbolic as prim + domains: list[str] = [] insns = self.get_translation_loopy_insns(result_dtype) - tgt_coeffs = pymbolic.var("tgt_coeffs") + tgt_coeffs = prim.var("tgt_coeffs") for i in range(ncoeff_tgt): - expr = pymbolic.var(f"tgt_coeff{i}") + expr = prim.var(f"tgt_coeff{i}") insn = lp.Assignment(assignee=tgt_coeffs[i], expression=tgt_coeffs[i] + expr) insns.append(insn) @@ -434,13 +418,12 @@ def get_kernel(self, result_dtype): translation_knl = self.get_inner_loopy_kernel(result_dtype) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box d[idim] = m2l_translation_vectors[idim, \ @@ -600,7 +585,7 @@ def get_kernel(self, result_dtype): ) {id=update,dep=set_d} end """], - [ + kernel_data=[ lp.ValueArg("src_rscale", None), lp.GlobalArg("m2l_translation_classes_dependent_data", None, shape=("ntranslation_classes", @@ -616,12 +601,10 @@ def get_kernel(self, result_dtype): ], name=self.name, assumptions="ntranslation_classes>=1", - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "m2l_translation_classes_dependent_ndata": ( m2l_translation_classes_dependent_ndata)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) for expr_knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: @@ -643,11 +626,10 @@ def get_optimized_kernel(self, result_dtype): knl = self.get_kernel(result_dtype) knl = lp.tag_inames(knl, "idim*:unr") knl = lp.tag_inames(knl, {"itr_class": "g.0"}) - knl = register_optimization_preambles(knl, self.device) return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg src_rscale: :arg translation_classes_level_start: @@ -664,14 +646,16 @@ def __call__(self, queue, **kwargs): "m2l_translation_classes_dependent_data") result_dtype = m2l_translation_classes_dependent_data.dtype - knl = self.get_cached_kernel_executor(result_dtype=result_dtype) + knl = self.get_cached_kernel(result_dtype=result_dtype) + result = actx.call_loopy( + knl, + src_rscale=src_rscale, + m2l_translation_vectors=m2l_translation_vectors, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data), + **kwargs) - return knl(queue, - src_rscale=src_rscale, - m2l_translation_vectors=m2l_translation_vectors, - m2l_translation_classes_dependent_data=( - m2l_translation_classes_dependent_data), - **kwargs) + return result["m2l_translation_classes_dependent_data"] # }}} @@ -701,7 +685,7 @@ def get_kernel(self, result_dtype): result_dtype) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[isrc_box]: 0<=isrc_box tgt_ibox = target_boxes[itgt_box] @@ -931,7 +916,7 @@ def get_kernel(self): end end """], - [ + kernel_data=[ lp.GlobalArg("target_boxes", None, shape=lp.auto, offset=lp.auto), lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), @@ -952,7 +937,7 @@ def get_kernel(self): assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", fixed_parameters={"dim": self.dim, "nchildren": 2**self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -963,7 +948,7 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -972,23 +957,25 @@ def __call__(self, queue, **kwargs): :arg tgt_rscale: :arg centers: """ - knl = self.get_cached_kernel_executor() - centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) + knl = self.get_cached_kernel() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] # }}} -# {{{ translation from a box's parent +# {{{ E2EFromParent: translation from a box's parent class E2EFromParent(E2EBase): @property @@ -1007,11 +994,10 @@ def get_kernel(self): # (same for itgt_box, tgt_ibox) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] @@ -1038,7 +1024,7 @@ def get_kernel(self): """ for i in range(ncoeffs_tgt)] + [""" end """], - [ + kernel_data=[ lp.GlobalArg("target_boxes", None, shape=lp.auto, offset=lp.auto), lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), @@ -1054,10 +1040,11 @@ def get_kernel(self): "...", *gather_loopy_arguments([self.src_expansion, self.tgt_expansion]) ], - name=self.name, assumptions="ntgt_boxes>=1", + name=self.name, + assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", fixed_parameters={"dim": self.dim, "nchildren": 2**self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -1068,7 +1055,7 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -1077,18 +1064,20 @@ def __call__(self, queue, **kwargs): :arg tgt_rscale: :arg centers: """ - knl = self.get_cached_kernel_executor() - centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) + knl = self.get_cached_kernel() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] # }}} diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 87e2f139..400d7b44 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -24,16 +24,22 @@ """ from abc import ABC, abstractmethod +from typing import TYPE_CHECKING import numpy as np import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION +import pytools.obj_array as obj_array +from loopy.version import MOST_RECENT_LANGUAGE_VERSION # noqa: F401 -from sumpy.codegen import register_optimization_preambles +from sumpy.array_context import make_loopy_program from sumpy.tools import KernelCacheMixin, gather_loopy_arguments +if TYPE_CHECKING: + from arraycontext import ArrayContext + + __doc__ = """ Expansion-to-particle @@ -46,11 +52,10 @@ """ -# {{{ E2P base class +# {{{ E2PBase: base class class E2PBase(KernelCacheMixin, ABC): - def __init__(self, ctx, expansion, kernels, - name=None, device=None): + def __init__(self, expansion, kernels, name=None): """ :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase` :arg strength_usage: A list of integers indicating which expression @@ -59,31 +64,28 @@ def __init__(self, ctx, expansion, kernels, Default: all kernels use the same strength. """ - if device is None: - device = ctx.devices[0] - from sumpy.kernel import ( SourceTransformationRemover, TargetTransformationRemover, ) sxr = SourceTransformationRemover() txr = TargetTransformationRemover() - expansion = expansion.with_kernel( - sxr(expansion.kernel)) + expansion = expansion.with_kernel(sxr(expansion.kernel)) kernels = [sxr(knl) for knl in kernels] for knl in kernels: assert txr(knl) == expansion.kernel - self.context = ctx self.expansion = expansion self.kernels = kernels self.name = name or self.default_name - self.device = device self.dim = expansion.dim @property + def nresults(self): + return len(self.kernels) + @abstractmethod def default_name(self): pass @@ -117,7 +119,7 @@ def get_kernel_scaling_assignment(self): # }}} -# {{{ E2P to single box (L2P, likely) +# {{{ E2PFromSingleBox: E2P to single box (L2P, likely) class E2PFromSingleBox(E2PBase): @property @@ -128,7 +130,7 @@ def get_kernel(self): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[itgt_box]: 0<=itgt_box LocalExpansionBase class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler): """Objects of this type serve as a place to keep the code needed for :class:`SumpyExpansionWrangler`. Since :class:`SumpyExpansionWrangler` - necessarily must have a :class:`pyopencl.CommandQueue`, but this queue - is allowed to be more ephemeral than the code, the code's lifetime - is decoupled by storing it in this object. - - Timing results returned by this wrangler contain the values *wall_elapsed* - which measures elapsed wall time. This requires a command queue with - profiling enabled. + contains data that is allowed to be more ephemeral than the code, the code's + lifetime is decoupled by storing it in this object. """ - cl_context: cl.Context multipole_expansion_factory: MultipoleExpansionFromOrderFactory local_expansion_factory: LocalExpansionFromOrderFactory source_kernels: Sequence[Kernel] | None @@ -120,7 +113,7 @@ class SumpyTreeIndependentDataForWrangler(TreeIndependentDataForWrangler): strength_usage: Sequence[int] | None def __init__(self, - cl_context: cl.Context, + array_context: ArrayContext, multipole_expansion_factory: MultipoleExpansionFromOrderFactory, local_expansion_factory: LocalExpansionFromOrderFactory, target_kernels: Sequence[Kernel], @@ -138,6 +131,10 @@ def __init__(self, :arg strength_usage: passed unchanged to p2l, p2m and p2p. :arg source_kernels: passed unchanged to p2l, p2m and p2p. """ + super().__init__() + + self._setup_actx: ArrayContext = array_context + self.multipole_expansion_factory = multipole_expansion_factory self.local_expansion_factory = local_expansion_factory self.source_kernels = source_kernels @@ -146,10 +143,6 @@ def __init__(self, self.use_rscale = use_rscale self.strength_usage = strength_usage - super().__init__() - - self.cl_context = cl_context - @memoize_method def get_base_kernel(self): from pytools import single_valued @@ -175,21 +168,21 @@ def m2l_translation(self): @memoize_method def p2m(self, tgt_order): - return P2EFromSingleBox(self.cl_context, + return P2EFromSingleBox( kernels=self.source_kernels, expansion=self.multipole_expansion(tgt_order), strength_usage=self.strength_usage, name="p2m") @memoize_method def p2l(self, tgt_order): - return P2EFromCSR(self.cl_context, + return P2EFromCSR( kernels=self.source_kernels, expansion=self.local_expansion(tgt_order), strength_usage=self.strength_usage, name="p2l") @memoize_method def m2m(self, src_order, tgt_order): - return E2EFromChildren(self.cl_context, + return E2EFromChildren( self.multipole_expansion(src_order), self.multipole_expansion(tgt_order), name="m2m") @@ -200,118 +193,56 @@ def m2l(self, src_order, tgt_order, m2l_class = M2LUsingTranslationClassesDependentData else: m2l_class = E2EFromCSR - return m2l_class(self.cl_context, + return m2l_class( self.multipole_expansion(src_order), self.local_expansion(tgt_order), name="m2l") @memoize_method def m2l_translation_class_dependent_data_kernel(self, src_order, tgt_order): - return M2LGenerateTranslationClassesDependentData(self.cl_context, + return M2LGenerateTranslationClassesDependentData( self.multipole_expansion(src_order), self.local_expansion(tgt_order)) @memoize_method def m2l_preprocess_mpole_kernel(self, src_order, tgt_order): - return M2LPreprocessMultipole(self.cl_context, + return M2LPreprocessMultipole( self.multipole_expansion(src_order), self.local_expansion(tgt_order)) @memoize_method def m2l_postprocess_local_kernel(self, src_order, tgt_order): - return M2LPostprocessLocal(self.cl_context, + return M2LPostprocessLocal( self.multipole_expansion(src_order), self.local_expansion(tgt_order)) @memoize_method def l2l(self, src_order, tgt_order): - return E2EFromParent(self.cl_context, + return E2EFromParent( self.local_expansion(src_order), self.local_expansion(tgt_order), name="l2l") @memoize_method def m2p(self, src_order): - return E2PFromCSR(self.cl_context, + return E2PFromCSR( self.multipole_expansion(src_order), self.target_kernels, name="m2p") @memoize_method def l2p(self, src_order): - return E2PFromSingleBox(self.cl_context, + return E2PFromSingleBox( self.local_expansion(src_order), self.target_kernels, name="l2p") @memoize_method def p2p(self): - return P2PFromCSR(self.cl_context, target_kernels=self.target_kernels, + return P2PFromCSR(target_kernels=self.target_kernels, source_kernels=self.source_kernels, exclude_self=self.exclude_self, strength_usage=self.strength_usage, name="p2p") @memoize_method def opencl_fft_app(self, shape, dtype, inverse): - with cl.CommandQueue(self.cl_context) as queue: - return get_opencl_fft_app(queue, shape, dtype, inverse) - -# }}} - - -# {{{ timing future - -_SECONDS_PER_NANOSECOND = 1e-9 - - -class CLEventLike(Protocol): - """ - EventLike objects have an attribute native_event that returns - a cl.Event that indicates the end of the event. - """ - native_event: cl.Event - - -class UnableToCollectTimingData(UserWarning): - pass - - -class SumpyTimingFuture: - - def __init__(self, queue, events: list[cl.Event | CLEventLike]): - self.queue = queue - self.events = events - - @property - def native_events(self) -> list[cl.Event]: - return [evt if isinstance(evt, cl.Event) else evt.native_event - for evt in self.events] - - @memoize_method - def result(self): - from boxtree.timing import TimingResult - - if not self.queue.properties & cl.command_queue_properties.PROFILING_ENABLE: - from warnings import warn - warn( - "Profiling was not enabled in the command queue. " - "Timing data will not be collected.", - category=UnableToCollectTimingData, - stacklevel=3) - return TimingResult(wall_elapsed=None) - - if self.events: - cl.wait_for_events(self.native_events) - - result = 0 - for event in self.events: - result += ( - (event.profile.end - event.profile.start) - * _SECONDS_PER_NANOSECOND) - - return TimingResult(wall_elapsed=result) - - def done(self): - return all( - event.get_info(cl.event_info.COMMAND_EXECUTION_STATUS) - == cl.command_execution_status.COMPLETE - for event in self.native_events) + return get_opencl_fft_app(self._setup_actx, shape, dtype, inverse=inverse) # }}} @@ -377,7 +308,6 @@ def __init__(self, _disable_translation_classes=False ): super().__init__(tree_indep, traversal) - self.issued_timing_data_warning = False self.dtype = np.dtype(dtype) @@ -417,13 +347,14 @@ def __init__(self, self.supports_translation_classes = False else: if translation_classes_data is None: - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - from boxtree.translation_classes import TranslationClassesBuilder - translation_classes_builder = TranslationClassesBuilder( - queue.context) - translation_classes_data, _ = translation_classes_builder( - queue, traversal, self.tree, - is_translation_per_level=True) + from boxtree.translation_classes import TranslationClassesBuilder + + actx = tree_indep._setup_actx + translation_classes_builder = TranslationClassesBuilder(actx) + translation_classes_data, _ = translation_classes_builder( + actx, traversal, self.tree, is_translation_per_level=True) + translation_classes_data = actx.freeze(translation_classes_data) + self.supports_translation_classes = True self.translation_classes_data = translation_classes_data @@ -449,9 +380,17 @@ def level_to_rscale(self, level: int) -> float: # {{{ data vector utilities + @property + @memoize_method + def tree_level_start_box_nrs(self): + # NOTE: a host version of `level_start_box_nrs` is used repeatedly and + # this simply caches it to avoid repeated transfers + actx = self.tree_indep._setup_actx + return actx.to_numpy(self.tree.level_start_box_nrs) + def _expansions_level_starts(self, order_to_size: Callable[[int], int]): return build_csr_level_starts(self.level_orders, order_to_size, - self.tree.level_start_box_nrs) + self.tree_level_start_box_nrs) @memoize_method def multipole_expansions_level_starts(self): @@ -465,9 +404,10 @@ def local_expansions_level_starts(self): @memoize_method def m2l_translation_class_level_start_box_nrs(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - data = self.translation_classes_data - return data.from_sep_siblings_translation_classes_level_starts.get(queue) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + self.translation_classes_data + .from_sep_siblings_translation_classes_level_starts) @memoize_method def m2l_translation_classes_dependent_data_level_starts(self): @@ -481,68 +421,65 @@ def order_to_size(order: int): return build_csr_level_starts(self.level_orders, order_to_size, level_starts=self.m2l_translation_class_level_start_box_nrs()) - def multipole_expansion_zeros(self, template_ary): + def multipole_expansion_zeros(self, actx: ArrayContext) -> Array: """Return an expansions array (which must support addition) capable of holding one multipole or local expansion for every box in the tree. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. """ - return cl_array.zeros( - template_ary.queue, + return actx.zeros( self.multipole_expansions_level_starts()[-1], dtype=self.dtype) - def local_expansion_zeros(self, template_ary): + def local_expansion_zeros(self, actx) -> Array: """Return an expansions array (which must support addition) capable of holding one multipole or local expansion for every box in the tree. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. """ - return cl_array.zeros( - template_ary.queue, + return actx.zeros( self.local_expansions_level_starts()[-1], dtype=self.dtype) - def m2l_translation_classes_dependent_data_zeros(self, queue): + def m2l_translation_classes_dependent_data_zeros( + self, actx: ArrayContext): + data_level_starts = ( + self.m2l_translation_classes_dependent_data_level_starts()) + level_start_box_nrs = ( + self.m2l_translation_class_level_start_box_nrs()) + result = [] for level in range(self.tree.nlevels): - expn_start, expn_stop = \ - self.m2l_translation_classes_dependent_data_level_starts()[ - level:level+2] - translation_class_start, translation_class_stop = \ - self.m2l_translation_class_level_start_box_nrs()[level:level+2] - exprs_level = cl_array.zeros(queue, expn_stop - expn_start, - dtype=self.preprocessed_mpole_dtype) - result.append(exprs_level.reshape( - translation_class_stop - translation_class_start, -1)) + expn_start, expn_stop = data_level_starts[level:level + 2] + translation_class_start, translation_class_stop = ( + level_start_box_nrs[level:level + 2]) + + exprs_level = actx.zeros( + expn_stop - expn_start, + dtype=self.preprocessed_mpole_dtype + ).reshape(translation_class_stop - translation_class_start, -1) + result.append(exprs_level) + return result def multipole_expansions_view(self, mpole_exps, level): - expn_start, expn_stop = \ - self.multipole_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + expn_start, expn_stop = ( + self.multipole_expansions_level_starts()[level:level + 2]) + box_start, box_stop = self.tree_level_start_box_nrs[level:level + 2] return (box_start, mpole_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) def local_expansions_view(self, local_exps, level): - expn_start, expn_stop = \ - self.local_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + expn_start, expn_stop = ( + self.local_expansions_level_starts()[level:level + 2]) + box_start, box_stop = self.tree_level_start_box_nrs[level:level + 2] return (box_start, local_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) def m2l_translation_classes_dependent_data_view(self, m2l_translation_classes_dependent_data, level): - translation_class_start, _ = \ - self.m2l_translation_class_level_start_box_nrs()[level:level+2] + translation_class_start, _ = ( + self.m2l_translation_class_level_start_box_nrs()[level:level + 2]) exprs_level = m2l_translation_classes_dependent_data[level] return (translation_class_start, exprs_level) @@ -556,48 +493,48 @@ def order_to_size(order): return res return build_csr_level_starts(self.level_orders, order_to_size, - level_starts=self.tree.level_start_box_nrs) + level_starts=self.tree_level_start_box_nrs) + + def m2l_preproc_mpole_expansion_zeros( + self, actx: ArrayContext, template_ary): + level_starts = self.m2l_preproc_mpole_expansions_level_starts() - def m2l_preproc_mpole_expansion_zeros(self, template_ary): result = [] for level in range(self.tree.nlevels): - expn_start, expn_stop = \ - self.m2l_preproc_mpole_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] - exprs_level = cl_array.zeros(template_ary.queue, expn_stop - expn_start, - dtype=self.preprocessed_mpole_dtype) - result.append(exprs_level.reshape(box_stop - box_start, -1)) + expn_start, expn_stop = level_starts[level:level+2] + box_start, box_stop = self.tree_level_start_box_nrs[level:level+2] + + exprs_level = actx.zeros( + expn_stop - expn_start, + dtype=self.preprocessed_mpole_dtype, + ).reshape(box_stop - box_start, -1) + result.append(exprs_level) + return result def m2l_preproc_mpole_expansions_view(self, mpole_exps, level): - box_start, _ = self.tree.level_start_box_nrs[level:level+2] + box_start, _ = self.tree_level_start_box_nrs[level:level+2] return (box_start, mpole_exps[level]) m2l_work_array_view = m2l_preproc_mpole_expansions_view m2l_work_array_zeros = m2l_preproc_mpole_expansion_zeros - m2l_work_array_level_starts = \ - m2l_preproc_mpole_expansions_level_starts + m2l_work_array_level_starts = m2l_preproc_mpole_expansions_level_starts - def output_zeros(self, template_ary): + def output_zeros(self, + actx: ArrayContext + ) -> obj_array.ObjectArray1D[Array]: """Return a potentials array (which must support addition) capable of holding a potential value for each target in the tree. Note that :func:`drive_fmm` makes no assumptions about *potential* other than that it supports addition--it may consist of potentials, gradients of the potential, or arbitrary other per-target output data. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. """ return obj_array.new_1d([ - cl_array.zeros( - template_ary.queue, - self.tree.ntargets, - dtype=self.dtype) + actx.zeros(self.tree.ntargets, dtype=self.dtype) for k in self.tree_indep.target_kernels]) def reorder_sources(self, source_array): - return source_array.with_queue(source_array.queue)[self.tree.user_source_ids] + return source_array[self.tree.user_source_ids] def reorder_potentials(self, potentials): import numpy as np @@ -614,20 +551,18 @@ def reorder(x): @property @memoize_method def max_nsources_in_one_box(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - return int( - cast("cl_array.Array", - cl_array.max(self.tree.box_source_counts_nonchild, queue)) - .get()) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + actx.np.max(self.tree.box_source_counts_nonchild) + ).item() @property @memoize_method def max_ntargets_in_one_box(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - return int( - cast("cl_array.Array", - cl_array.max(self.tree.box_target_counts_nonchild, queue)) - .get()) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + actx.np.max(self.tree.box_target_counts_nonchild) + ).item() # }}} @@ -651,22 +586,29 @@ def box_target_list_kwargs(self): # }}} - def run_opencl_fft(self, queue, input_vec, inverse, wait_for): + def run_opencl_fft(self, actx: ArrayContext, + input_vec, inverse, wait_for): app = self.tree_indep.opencl_fft_app(input_vec.shape, input_vec.dtype, inverse) - return run_opencl_fft(app, queue, input_vec, inverse, wait_for) + evt, result = run_opencl_fft( + actx, app, input_vec, inverse=inverse, wait_for=wait_for) + + from sumpy.tools import get_native_event + input_vec.add_event(get_native_event(evt)) + result.add_event(get_native_event(evt)) + + return result def form_multipoles(self, + actx: ArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): - mpoles = self.multipole_expansion_zeros(src_weight_vecs[0]) + mpoles = self.multipole_expansion_zeros(actx) + level_start_source_box_nrs = actx.to_numpy(level_start_source_box_nrs) kwargs = dict(self.extra_kwargs) kwargs.update(self.box_source_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - for lev in range(self.tree.nlevels): p2m = self.tree_indep.p2m(self.level_orders[lev]) start, stop = level_start_source_box_nrs[lev:lev+2] @@ -676,8 +618,8 @@ def form_multipoles(self, level_start_ibox, mpoles_view = self.multipole_expansions_view( mpoles, lev) - evt, (mpoles_res,) = p2m( - queue, + mpoles_res = p2m( + actx, source_boxes=source_boxes[start:stop], centers=self.tree.box_centers, strengths=src_weight_vecs, @@ -686,20 +628,19 @@ def form_multipoles(self, rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) assert mpoles_res is mpoles_view - return (mpoles, SumpyTimingFuture(queue, events)) + return mpoles def coarsen_multipoles(self, + actx: ArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): tree = self.tree - - events = [] - queue = mpoles.queue + level_start_source_parent_box_nrs = ( + actx.to_numpy(level_start_source_parent_box_nrs)) # nlevels-1 is the last valid level index # nlevels-2 is the last valid level that could have children @@ -721,13 +662,13 @@ def coarsen_multipoles(self, self.level_orders[source_level], self.level_orders[target_level]) - source_level_start_ibox, source_mpoles_view = \ - self.multipole_expansions_view(mpoles, source_level) - target_level_start_ibox, target_mpoles_view = \ - self.multipole_expansions_view(mpoles, target_level) + source_level_start_ibox, source_mpoles_view = ( + self.multipole_expansions_view(mpoles, source_level)) + target_level_start_ibox, target_mpoles_view = ( + self.multipole_expansions_view(mpoles, target_level)) - evt, (mpoles_res,) = m2m( - queue, + mpoles_res = m2m( + actx, src_expansions=source_mpoles_view, src_base_ibox=source_level_start_ibox, tgt_expansions=target_mpoles_view, @@ -741,28 +682,23 @@ def coarsen_multipoles(self, tgt_rscale=self.level_to_rscale(target_level), **self.kernel_extra_kwargs) - events.append(evt) assert mpoles_res is target_mpoles_view - if events: - mpoles.add_event(events[-1]) - - return (mpoles, SumpyTimingFuture(queue, events)) + return mpoles - def eval_direct(self, target_boxes, source_box_starts, + def eval_direct(self, + actx: ArrayContext, + target_boxes, source_box_starts, source_box_lists, src_weight_vecs): - pot = self.output_zeros(src_weight_vecs[0]) + pot = self.output_zeros(actx) kwargs = dict(self.extra_kwargs) kwargs.update(self.self_extra_kwargs) kwargs.update(self.box_source_list_kwargs()) kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - - evt, pot_res = self.tree_indep.p2p()(queue, + pot_res = self.tree_indep.p2p()(actx, target_boxes=target_boxes, source_box_starts=source_box_starts, source_box_lists=source_box_lists, @@ -771,67 +707,65 @@ def eval_direct(self, target_boxes, source_box_starts, max_nsources_in_one_box=self.max_nsources_in_one_box, max_ntargets_in_one_box=self.max_ntargets_in_one_box, **kwargs) - events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res, strict=True): assert pot_i is pot_res_i - pot_i.add_event(evt) - return (pot, SumpyTimingFuture(queue, events)) + return pot @memoize_method def multipole_to_local_precompute(self): - result = [] - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - m2l_translation_classes_dependent_data = \ - self.m2l_translation_classes_dependent_data_zeros(queue) - for lev in range(self.tree.nlevels): - src_rscale = self.level_to_rscale(lev) - order = self.level_orders[lev] - precompute_kernel = \ - self.tree_indep.m2l_translation_class_dependent_data_kernel( - order, order) - - translation_classes_level_start, \ - m2l_translation_classes_dependent_data_view = \ - self.m2l_translation_classes_dependent_data_view( - m2l_translation_classes_dependent_data, lev) - - ntranslation_classes = \ - m2l_translation_classes_dependent_data_view.shape[0] + actx = self.tree_indep._setup_actx - if ntranslation_classes == 0: - result.append(cl_array.empty_like( - m2l_translation_classes_dependent_data_view)) - continue + result = [] + m2l_translation_classes_dependent_data = ( + self.m2l_translation_classes_dependent_data_zeros(actx)) - data = self.translation_classes_data - m2l_translation_vectors = ( - data.from_sep_siblings_translation_class_to_distance_vector) - - evt, _ = precompute_kernel( - queue, - src_rscale=src_rscale, - translation_classes_level_start=translation_classes_level_start, - ntranslation_classes=ntranslation_classes, - m2l_translation_classes_dependent_data=( - m2l_translation_classes_dependent_data_view), - m2l_translation_vectors=m2l_translation_vectors, - ntranslation_vectors=m2l_translation_vectors.shape[1], - **self.kernel_extra_kwargs + for lev in range(self.tree.nlevels): + src_rscale = self.level_to_rscale(lev) + order = self.level_orders[lev] + precompute_kernel = ( + self.tree_indep.m2l_translation_class_dependent_data_kernel( + order, order) ) - if self.tree_indep.m2l_translation.use_fft: - _, m2l_translation_classes_dependent_data_view = \ - self.run_opencl_fft(queue, - m2l_translation_classes_dependent_data_view, - inverse=False, wait_for=[evt]) - result.append(m2l_translation_classes_dependent_data_view) + translation_classes_level_start, \ + m2l_translation_classes_dependent_data_view = \ + self.m2l_translation_classes_dependent_data_view( + m2l_translation_classes_dependent_data, lev) - for lev in range(self.tree.nlevels): - result[lev].finish() + ntranslation_classes = ( + m2l_translation_classes_dependent_data_view.shape[0]) - result = [arr.with_queue(None) for arr in result] + if ntranslation_classes == 0: + result.append(actx.np.zeros_like( + m2l_translation_classes_dependent_data_view)) + continue + + data = self.translation_classes_data + m2l_translation_vectors = ( + data.from_sep_siblings_translation_class_to_distance_vector) + + precompute_kernel( + actx, + src_rscale=src_rscale, + translation_classes_level_start=translation_classes_level_start, + ntranslation_classes=ntranslation_classes, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data_view), + m2l_translation_vectors=m2l_translation_vectors, + ntranslation_vectors=m2l_translation_vectors.shape[1], + **self.kernel_extra_kwargs + ) + + if self.tree_indep.m2l_translation.use_fft: + m2l_translation_classes_dependent_data_view = ( + self.run_opencl_fft(actx, + m2l_translation_classes_dependent_data_view, + inverse=False, wait_for=None)) + result.append(m2l_translation_classes_dependent_data_view) + + result = [actx.freeze(arr) for arr in result] return result def _add_m2l_precompute_kwargs(self, kwargs_for_m2l, @@ -856,17 +790,18 @@ def _add_m2l_precompute_kwargs(self, kwargs_for_m2l, self.translation_classes_data.from_sep_siblings_translation_classes def multipole_to_local(self, + actx: ArrayContext, level_start_target_box_nrs, target_boxes, src_box_starts, src_box_lists, mpole_exps): - queue = mpole_exps.queue - local_exps = self.local_expansion_zeros(mpole_exps) + local_exps = self.local_expansion_zeros(actx) + level_start_target_box_nrs = actx.to_numpy(level_start_target_box_nrs) if self.tree_indep.m2l_translation.use_preprocessing: - preprocessed_mpole_exps = \ - self.m2l_preproc_mpole_expansion_zeros(mpole_exps) - m2l_work_array = self.m2l_work_array_zeros(local_exps) + preprocessed_mpole_exps = ( + self.m2l_preproc_mpole_expansion_zeros(actx, mpole_exps)) + m2l_work_array = self.m2l_work_array_zeros(actx, local_exps) mpole_exps_view_func = self.m2l_preproc_mpole_expansions_view local_exps_view_func = self.m2l_work_array_view else: @@ -875,12 +810,8 @@ def multipole_to_local(self, mpole_exps_view_func = self.multipole_expansions_view local_exps_view_func = self.local_expansions_view - preprocess_evts = [] - translate_evts = [] - postprocess_evts = [] - for lev in range(self.tree.nlevels): - wait_for = [] + wait_for: list[pyopencl.Event] = [] start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -899,25 +830,20 @@ def multipole_to_local(self, # There is no M2L happening in this level continue - evt, _ = preprocess_mpole_kernel( - queue, + preprocess_mpole_kernel( + actx, src_expansions=source_mpoles_view, preprocessed_src_expansions=preprocessed_mpole_exps[lev], src_rscale=self.level_to_rscale(lev), wait_for=wait_for, **self.kernel_extra_kwargs ) - wait_for.append(evt) if self.tree_indep.m2l_translation.use_fft: - evt_fft, preprocessed_mpole_exps[lev] = \ - self.run_opencl_fft(queue, + preprocessed_mpole_exps[lev] = \ + self.run_opencl_fft(actx, preprocessed_mpole_exps[lev], inverse=False, wait_for=wait_for) - wait_for.append(get_native_event(evt_fft)) - evt = AggregateProfilingEvent([evt, evt_fft]) - - preprocess_evts.append(evt) order = self.level_orders[lev] m2l = self.tree_indep.m2l(order, order, @@ -949,9 +875,7 @@ def multipole_to_local(self, kwargs["m2l_translation_classes_dependent_data"].size == 0: # There is nothing to do for this level continue - evt, _ = m2l(queue, **kwargs, wait_for=wait_for) - wait_for.append(evt) - translate_evts.append(evt) + m2l(actx, **kwargs, wait_for=wait_for) if self.tree_indep.m2l_translation.use_preprocessing: order = self.level_orders[lev] @@ -971,14 +895,13 @@ def multipole_to_local(self, continue if self.tree_indep.m2l_translation.use_fft: - evt_fft, target_locals_before_postprocessing_view = \ - self.run_opencl_fft(queue, + target_locals_before_postprocessing_view = \ + self.run_opencl_fft(actx, target_locals_before_postprocessing_view, inverse=True, wait_for=wait_for) - wait_for.append(get_native_event(evt_fft)) - evt, _ = postprocess_local_kernel( - queue, + postprocess_local_kernel( + actx, tgt_expansions=target_locals_view, tgt_expansions_before_postprocessing=( target_locals_before_postprocessing_view), @@ -988,27 +911,17 @@ def multipole_to_local(self, **self.kernel_extra_kwargs, ) - if self.tree_indep.m2l_translation.use_fft: - postprocess_evts.append(AggregateProfilingEvent([evt_fft, evt])) - else: - postprocess_evts.append(evt) - - timing_events = preprocess_evts + translate_evts + postprocess_evts - - return (local_exps, SumpyTimingFuture(queue, timing_events)) + return local_exps def eval_multipoles(self, + actx: ArrayContext, target_boxes_by_source_level, source_boxes_by_level, mpole_exps): - pot = self.output_zeros(mpole_exps) + pot = self.output_zeros(actx) kwargs = dict(self.kernel_extra_kwargs) kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = mpole_exps.queue - wait_for = mpole_exps.events - for isrc_level, ssn in enumerate(source_boxes_by_level): if len(target_boxes_by_source_level[isrc_level]) == 0: continue @@ -1018,8 +931,8 @@ def eval_multipoles(self, source_level_start_ibox, source_mpoles_view = \ self.multipole_expansions_view(mpole_exps, isrc_level) - evt, pot_res = m2p( - queue, + pot_res = m2p( + actx, src_expansions=source_mpoles_view, src_base_ibox=source_level_start_ibox, @@ -1035,33 +948,26 @@ def eval_multipoles(self, wait_for=wait_for, **kwargs) - events.append(evt) - - wait_for = [evt] for pot_i, pot_res_i in zip(pot, pot_res, strict=True): assert pot_i is pot_res_i - if events: - for pot_i in pot: - pot_i.add_event(events[-1]) - - return (pot, SumpyTimingFuture(queue, events)) + return pot def form_locals(self, + actx: ArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, src_weight_vecs): - local_exps = self.local_expansion_zeros(src_weight_vecs[0]) + local_exps = self.local_expansion_zeros(actx) + level_start_target_or_target_parent_box_nrs = ( + actx.to_numpy(level_start_target_or_target_parent_box_nrs)) kwargs = dict(self.extra_kwargs) kwargs.update(self.box_source_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - for lev in range(self.tree.nlevels): - start, stop = \ - level_start_target_or_target_parent_box_nrs[lev:lev+2] + start, stop = ( + level_start_target_or_target_parent_box_nrs[lev:lev+2]) if start == stop: continue @@ -1070,8 +976,8 @@ def form_locals(self, target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, lev) - evt, (result,) = p2l( - queue, + result = p2l( + actx, target_boxes=target_or_target_parent_boxes[start:stop], source_box_starts=starts[start:stop+1], source_box_lists=lists, @@ -1084,23 +990,22 @@ def form_locals(self, rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) assert result is target_local_exps_view - return (local_exps, SumpyTimingFuture(queue, events)) + return local_exps def refine_locals(self, + actx: ArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): - - events = [] - queue = local_exps.queue + level_start_target_or_target_parent_box_nrs = ( + actx.to_numpy(level_start_target_or_target_parent_box_nrs)) for target_lev in range(1, self.tree.nlevels): - start, stop = level_start_target_or_target_parent_box_nrs[ - target_lev:target_lev+2] + start, stop = ( + level_start_target_or_target_parent_box_nrs[target_lev:target_lev+2]) if start == stop: continue @@ -1114,7 +1019,7 @@ def refine_locals(self, target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, target_lev) - evt, (local_exps_res,) = l2l(queue, + local_exps_res = l2l(actx, src_expansions=source_local_exps_view, src_base_ibox=source_level_start_ibox, tgt_expansions=target_local_exps_view, @@ -1128,24 +1033,20 @@ def refine_locals(self, tgt_rscale=self.level_to_rscale(target_lev), **self.kernel_extra_kwargs) - events.append(evt) assert local_exps_res is target_local_exps_view - for evt in events: - local_exps.add_event(evt) + return local_exps - return (local_exps, SumpyTimingFuture(queue, events)) - - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): - pot = self.output_zeros(local_exps) + def eval_locals(self, + actx: ArrayContext, + level_start_target_box_nrs, target_boxes, local_exps): + pot = self.output_zeros(actx) + level_start_target_box_nrs = actx.to_numpy(level_start_target_box_nrs) kwargs = dict(self.kernel_extra_kwargs) kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = local_exps.queue - for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -1156,8 +1057,8 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): source_level_start_ibox, source_local_exps_view = \ self.local_expansions_view(local_exps, lev) - evt, pot_res = l2p( - queue, + pot_res = l2p( + actx, src_expansions=source_local_exps_view, src_base_ibox=source_level_start_ibox, @@ -1169,14 +1070,13 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res, strict=True): assert pot_i is pot_res_i - return (pot, SumpyTimingFuture(queue, events)) + return pot - def finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: ArrayContext, potentials): return potentials # }}} diff --git a/sumpy/p2e.py b/sumpy/p2e.py index 30237d39..63d45246 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -24,16 +24,20 @@ """ import logging +from typing import TYPE_CHECKING import numpy as np import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from sumpy.codegen import register_optimization_preambles +from sumpy.array_context import make_loopy_program from sumpy.tools import KernelCacheMixin, KernelComputation +if TYPE_CHECKING: + from arraycontext import ArrayContext + + logger = logging.getLogger(__name__) @@ -48,7 +52,7 @@ """ -# {{{ P2E base class +# {{{ P2EBase: base class class P2EBase(KernelCacheMixin, KernelComputation): """Common input processing for kernel computations. @@ -56,8 +60,7 @@ class P2EBase(KernelCacheMixin, KernelComputation): .. automethod:: __init__ """ - def __init__(self, ctx, expansion, kernels=None, - name=None, device=None, strength_usage=None): + def __init__(self, expansion, kernels=None, name=None, strength_usage=None): """ :arg expansion: a subclass of :class:`sumpy.expansion.ExpansionBase` :arg kernels: if not provided, the kernel of the *expansion* is used. @@ -83,10 +86,10 @@ def __init__(self, ctx, expansion, kernels=None, assert txr(knl) == knl assert sxr(knl) == expansion.kernel - KernelComputation.__init__(self, ctx=ctx, target_kernels=[], + KernelComputation.__init__(self, target_kernels=[], source_kernels=kernels, strength_usage=strength_usage, value_dtypes=None, - name=name, device=device) + name=name) self.expansion = expansion self.dim = expansion.dim @@ -123,28 +126,33 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = self._allow_redundant_execution_of_knl_scaling(knl) knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") - knl = register_optimization_preambles(knl, self.device) return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): from sumpy.tools import is_obj_array_like sources = kwargs.pop("sources") centers = kwargs.pop("centers") - knl = self.get_cached_kernel_executor( - sources_is_obj_array=is_obj_array_like(sources), - centers_is_obj_array=is_obj_array_like(centers)) # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. dtype = centers[0].dtype if is_obj_array_like(centers) else centers.dtype rscale = dtype.type(kwargs.pop("rscale")) - return knl(queue, sources=sources, centers=centers, rscale=rscale, **kwargs) + knl = self.get_cached_kernel( + sources_is_obj_array=is_obj_array_like(sources), + centers_is_obj_array=is_obj_array_like(centers)) + + result = actx.call_loopy( + knl, + sources=sources, centers=centers, rscale=rscale, + **kwargs) + + return result["tgt_expansions"] # }}} -# {{{ P2E from single box (P2M, likely) +# {{{ P2EFromSingleBox: P2E from single box (P2M, likely) class P2EFromSingleBox(P2EBase): """ @@ -159,7 +167,7 @@ def get_kernel(self): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - loopy_knl = lp.make_kernel([ + loopy_knl = make_loopy_program([ "{[isrc_box]: 0 <= isrc_box < nsrc_boxes}", "{[isrc]: isrc_start <= isrc < isrc_end}", "{[idim]: 0 <= idim < dim}", @@ -216,11 +224,9 @@ def get_kernel(self): name=self.name, assumptions="nsrc_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "nstrengths": self.strength_count, - "ncoeffs": ncoeffs}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + "ncoeffs": ncoeffs}) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr") @@ -237,7 +243,7 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0") return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg source_boxes: an array of integer indices into *box_source_starts* and *box_source_counts_nonchild*. @@ -257,12 +263,12 @@ def __call__(self, queue, **kwargs): :returns: an array of *tgt_expansions*. """ - return super().__call__(queue, **kwargs) + return super().__call__(actx, **kwargs) # }}} -# {{{ P2E from CSR-like interaction list +# {{{ P2EFromCSR: P2E from CSR-like interaction list class P2EFromCSR(P2EBase): """ @@ -297,7 +303,7 @@ def get_kernel(self): ... ]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_stop}", @@ -345,15 +351,13 @@ def get_kernel(self): dep=update_result:init_coeffs} end """], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={"dim": self.dim, "nstrengths": self.strength_count, - "ncoeffs": ncoeffs}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + "ncoeffs": ncoeffs}) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr") @@ -370,7 +374,7 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: ArrayContext, **kwargs): """ :arg target_boxes: array of integer indices into *source_box_starts* and *centers*. @@ -389,7 +393,7 @@ def __call__(self, queue, **kwargs): :arg tgt_base_ibox: see :meth:`P2EFromSingleBox.__call__`. :arg tgt_expansion: see :meth:`P2EFromSingleBox.__call__`. """ - return super().__call__(queue, **kwargs) + return super().__call__(actx, **kwargs) # }}} diff --git a/sumpy/p2p.py b/sumpy/p2p.py index a5d2bb89..e1ef9078 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -26,25 +26,29 @@ THE SOFTWARE. """ +import logging from typing import TYPE_CHECKING, Any import numpy as np +from typing_extensions import override import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION +import pytools.obj_array as obj_array +from arraycontext import PyOpenCLArrayContext -from sumpy.codegen import register_optimization_preambles +from sumpy.array_context import make_loopy_program from sumpy.tools import KernelCacheMixin, KernelComputation, is_obj_array_like if TYPE_CHECKING: from collections.abc import Sequence - import pyopencl as cl - from arraycontext import Array + from arraycontext import Array, ArrayContext from pytools.obj_array import ObjectArray1D +logger = logging.getLogger(__name__) + __doc__ = """ Particle-to-particle @@ -62,11 +66,11 @@ # LATER: # - Optimization for source == target (postpone) -# {{{ p2p base class +# {{{ P2PBase: base class class P2PBase(KernelCacheMixin, KernelComputation): - def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None, - value_dtypes=None, name=None, device=None, source_kernels=None): + def __init__(self, target_kernels, exclude_self, strength_usage=None, + value_dtypes=None, name=None, source_kernels=None): """ :arg target_kernels: list of :class:`sumpy.kernel.Kernel` instances with only target derivatives. @@ -99,23 +103,19 @@ def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None, base_target_kernel = single_valued(txr(knl) for knl in target_kernels) assert base_source_kernel == base_target_kernel - KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels, + KernelComputation.__init__(self, target_kernels=target_kernels, source_kernels=source_kernels, strength_usage=strength_usage, - value_dtypes=value_dtypes, name=name, device=device) - - import pyopencl as cl - self.is_gpu = not (self.device.type & cl.device_type.CPU) + value_dtypes=value_dtypes, name=name) self.exclude_self = exclude_self - - self.dim = single_valued(knl.dim for knl in - list(self.target_kernels) + list(self.source_kernels)) + self.dim = single_valued([ + knl.dim for knl in self.target_kernels + self.source_kernels + ]) def get_cache_key(self): return (type(self).__name__, tuple(self.target_kernels), self.exclude_self, tuple(self.strength_usage), tuple(self.value_dtypes), - tuple(self.source_kernels), - self.device.hashable_model_and_version_identifier) + tuple(self.source_kernels)) def get_loopy_insns_and_result_names(self): from pymbolic import var @@ -190,7 +190,10 @@ def get_default_src_tgt_arguments(self): if self.exclude_self else []) + gather_loopy_source_arguments(self.source_kernels)) - def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): + def get_optimized_kernel(self, *, + targets_is_obj_array: bool = False, + sources_is_obj_array: bool = False, + **kwargs: Any) -> lp.TranslationUnit: # FIXME knl = self.get_kernel() @@ -201,17 +204,14 @@ def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") knl = self._allow_redundant_execution_of_knl_scaling(knl) - knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") - - knl = register_optimization_preambles(knl, self.device) + knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") return knl # }}} -# {{{ P2P point-interaction calculation +# {{{ P2P: point-interaction calculation class P2P(P2PBase): """Direct applier for P2P interactions.""" @@ -230,7 +230,7 @@ def get_kernel(self): shape="nresults, ntargets", dim_tags="sep,C") ] - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -249,15 +249,14 @@ def get_kernel(self): simul_reduce(sum, isrc, pair_result_{iknl}) {{inames=itgt}} """ for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, assumptions="nsources>=1 and ntargets>=1", name=self.name, - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "nstrengths": self.strength_count, "nresults": len(self.target_kernels)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -267,23 +266,29 @@ def get_kernel(self): return loopy_knl def __call__(self, - queue: cl.CommandQueue, - targets: ObjectArray1D[Array] | Array, - sources: ObjectArray1D[Array] | Array, - strength: Sequence[Array], - **kwargs: Any, - ) -> tuple[cl.Event, Sequence[Array]]: - knl = self.get_cached_kernel_executor( + actx: ArrayContext, + targets: ObjectArray1D[Array] | Array, + sources: ObjectArray1D[Array] | Array, + strength: Sequence[Array], + **kwargs: Any, + ) -> ObjectArray1D[Array]: + knl = self.get_cached_kernel( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, sources=sources, targets=targets, strength=strength, - **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + strength=strength, + **kwargs) + + return obj_array.new_1d([result[f"result_s{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P matrix writer +# {{{ P2PMatrixGenerator: matrix writer class P2PMatrixGenerator(P2PBase): """Generator for P2P interaction matrix entries.""" @@ -304,7 +309,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -325,7 +330,7 @@ def get_kernel(self): assumptions="nsources>=1 and ntargets>=1", name=self.name, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -335,21 +340,22 @@ def get_kernel(self): return loopy_knl def __call__(self, - queue: cl.CommandQueue, + actx: ArrayContext, targets: ObjectArray1D[Array] | Array, sources: ObjectArray1D[Array] | Array, **kwargs: Any, - ) -> tuple[cl.Event, Sequence[Array]]: - knl = self.get_cached_kernel_executor( + ) -> ObjectArray1D[Array]: + knl = self.get_cached_kernel( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, sources=sources, targets=targets, **kwargs) + result = actx.call_loopy(knl, sources=sources, targets=targets, **kwargs) + return obj_array.new_1d([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P matrix subset generator +# {{{ P2PMatrixSubsetGenerator: matrix subset generator class P2PMatrixSubsetGenerator(P2PBase): """Generator for a subset of P2P interaction matrix entries. @@ -381,7 +387,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}", self.get_kernel_scaling_assignments() # NOTE: itgt, isrc need to always be defined in case a statement @@ -409,7 +415,7 @@ def get_kernel(self): silenced_warnings="write_race(write_p2p*)", name=self.name, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.add_dtypes( @@ -433,19 +439,18 @@ def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): knl = self._allow_redundant_execution_of_knl_scaling(knl) knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") - knl = register_optimization_preambles(knl, self.device) return knl def __call__(self, - queue: cl.CommandQueue, + actx: ArrayContext, targets: ObjectArray1D[Array] | Array, sources: ObjectArray1D[Array] | Array, *, tgtindices: Array, srcindices: Array, **kwargs: Any, - ) -> tuple[cl.Event, Sequence[Array]]: + ) -> ObjectArray1D[Array]: """Evaluate a subset of the P2P matrix interactions. :arg targets: target point coordinates, which can be an object @@ -461,28 +466,36 @@ def __call__(self, :returns: a one-dimensional array of interactions, for each index pair in (*srcindices*, *tgtindices*) """ - knl = self.get_cached_kernel_executor( + knl = self.get_cached_kernel( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, - targets=targets, - sources=sources, - tgtindices=tgtindices, - srcindices=srcindices, **kwargs) + result = actx.call_loopy( + knl, + targets=targets, + sources=sources, + tgtindices=tgtindices, + srcindices=srcindices, **kwargs) + + return obj_array.new_1d([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P from CSR-like interaction list +# {{{ P2PFromCSR: P2P from CSR-like interaction list class P2PFromCSR(P2PBase): @property + @override def default_name(self): return "p2p_from_csr" - def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, - work_items_per_group=32): + @override + def get_kernel(self, *, + max_nsources_in_one_box: int = 32, + max_ntargets_in_one_box: int = 32, + work_items_per_group: int = 32, + is_gpu: bool = False, **kwargs: Any) -> lp.TranslationUnit: loopy_insns, _result_names = self.get_loopy_insns_and_result_names() arguments = [ *self.get_default_src_tgt_arguments(), @@ -503,7 +516,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, lp.GlobalArg("result", None, shape="noutputs, ntargets", dim_tags="sep,C"), lp.TemporaryVariable("tgt_center", shape=(self.dim,)), - "..." + ... ] domains = [ @@ -515,7 +528,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, tgt_outer_limit = (max_ntargets_in_one_box - 1) // work_items_per_group - if self.is_gpu: + if is_gpu: arguments += [ lp.TemporaryVariable("local_isrc", shape=(self.dim, max_nsources_in_one_box)), @@ -538,7 +551,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, # There are two algorithms here because pocl-pthread 1.9 miscompiles # the "gpu" kernel with prefetching. - if self.is_gpu: + if is_gpu: instructions = (self.get_kernel_scaling_assignments() + [""" for itgt_box @@ -659,13 +672,15 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, end """]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( domains, instructions, - arguments, + kernel_data=arguments, assumptions="ntgt_boxes>=1", name=self.name, - silenced_warnings=["write_race(write_csr*)", "write_race(prefetch_src)", + silenced_warnings=[ + "write_race(write_csr*)", + "write_race(prefetch_src)", "write_race(prefetch_charge)"], fixed_parameters={ "dim": self.dim, @@ -675,10 +690,12 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, "work_items_per_group": work_items_per_group, "tgt_outer_limit": tgt_outer_limit, "noutputs": len(self.target_kernels)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) - loopy_knl = lp.add_dtypes( - loopy_knl, {"nsources": np.int32, "ntargets": np.int32}) + loopy_knl = lp.add_dtypes(loopy_knl, { + "nsources": np.dtype(np.int32), + "ntargets": np.dtype(np.int32), + }) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr") @@ -690,25 +707,38 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, return loopy_knl - def get_optimized_kernel(self, max_nsources_in_one_box, - max_ntargets_in_one_box, source_dtype, strength_dtype): - if not self.is_gpu: - knl = self.get_kernel(max_nsources_in_one_box, - max_ntargets_in_one_box) + @override + def get_optimized_kernel(self, *, + max_nsources_in_one_box: int = 32, + max_ntargets_in_one_box: int = 32, + strength_dtype: np.dtype[Any] | None = None, + source_dtype: np.dtype[Any] | None = None, + local_mem_size: int = 32, + is_gpu: bool = False, **kwargs) -> lp.TranslationUnit: + if not is_gpu: + knl = self.get_kernel( + max_nsources_in_one_box=max_nsources_in_one_box, + max_ntargets_in_one_box=max_ntargets_in_one_box, + is_gpu=is_gpu) knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") knl = self._allow_redundant_execution_of_knl_scaling(knl) else: + assert strength_dtype is not None + assert source_dtype is not None + dtype_size = np.dtype(strength_dtype).alignment work_items_per_group = min(256, max_ntargets_in_one_box) total_local_mem = max_nsources_in_one_box * \ (self.dim + self.strength_count) * dtype_size # multiplying by 2 here to make sure at least 2 work groups # can be scheduled at the same time for latency hiding - nprefetch = (2 * total_local_mem - 1) // self.device.local_mem_size + 1 + nprefetch = (2 * total_local_mem - 1) // local_mem_size + 1 - knl = self.get_kernel(max_nsources_in_one_box, - max_ntargets_in_one_box, - work_items_per_group=work_items_per_group) + knl = self.get_kernel( + max_nsources_in_one_box=max_nsources_in_one_box, + max_ntargets_in_one_box=max_ntargets_in_one_box, + work_items_per_group=work_items_per_group, + is_gpu=is_gpu) knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"}) knl = lp.set_temporary_address_space(knl, ["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL) @@ -768,42 +798,43 @@ def get_optimized_kernel(self, max_nsources_in_one_box, knl = lp.add_inames_to_insn(knl, "inner", "id:init_* or id:*_scaling or id:src_box_insn_*") knl = lp.add_inames_to_insn(knl, "itgt_box", "id:*_scaling") - # knl = lp.set_options(knl, write_code=True) - - knl = lp.set_options(knl, - enforce_variable_access_ordered="no_check") - - knl = register_optimization_preambles(knl, self.device) + knl = lp.set_options(knl, enforce_variable_access_ordered="no_check") return knl def __call__(self, - queue: cl.CommandQueue, + actx: ArrayContext, targets: ObjectArray1D[Array] | Array, sources: ObjectArray1D[Array] | Array, *, max_nsources_in_one_box: int, max_ntargets_in_one_box: int, **kwargs: Any, - ) -> tuple[cl.Event, Sequence[Array]]: + ) -> ObjectArray1D[Array]: + from sumpy.array_context import is_cl_cpu - if self.is_gpu: - source_dtype = sources[0].dtype - strength_dtype = kwargs.get("strength").dtype + is_gpu = not is_cl_cpu(actx) + if is_gpu: + source_dtype = kwargs["sources"][0].dtype + strength_dtype = kwargs["strength"].dtype else: # these are unused for not GPU and defeats the caching # set them to None to keep the caching across dtypes source_dtype = None strength_dtype = None - knl = self.get_cached_kernel_executor( + assert isinstance(actx, PyOpenCLArrayContext) + knl = self.get_cached_kernel( max_nsources_in_one_box=max_nsources_in_one_box, max_ntargets_in_one_box=max_ntargets_in_one_box, + local_mem_size=actx.queue.device.local_mem_size, + is_gpu=is_gpu, source_dtype=source_dtype, strength_dtype=strength_dtype, - ) + ) - return knl(queue, targets=targets, sources=sources, **kwargs) + result = actx.call_loopy(knl, targets=targets, sources=sources, **kwargs) + return obj_array.new_1d([result[f"result_s{i}"] for i in range(self.nresults)]) # }}} diff --git a/sumpy/qbx.py b/sumpy/qbx.py index d38fe6aa..926367a9 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -28,23 +28,24 @@ import logging from abc import ABC -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import numpy as np from typing_extensions import override import loopy as lp import pytools.obj_array as obj_array -from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from pymbolic import parse, var +from pymbolic import parse from pytools import memoize_method import sumpy.symbolic as sym +from sumpy.array_context import is_cl_cpu, make_loopy_program from sumpy.tools import KernelCacheMixin, KernelComputation, is_obj_array_like if TYPE_CHECKING: - import pyopencl as cl + + from arraycontext import ArrayContext from sumpy.expansion.local import ( LineTaylorLocalExpansion, @@ -67,7 +68,7 @@ """ -def stringify_expn_index(i): +def stringify_expn_index(i: tuple[int, ...] | int) -> str: if isinstance(i, tuple): return "_".join(stringify_expn_index(i_i) for i_i in i) else: @@ -80,13 +81,12 @@ def stringify_expn_index(i): # {{{ layer potential computation -# {{{ base class +# {{{ LayerPotentialBase: base class class LayerPotentialBase(KernelCacheMixin, KernelComputation, ABC): expansion: LineTaylorLocalExpansion | LocalExpansionBase def __init__(self, - ctx: cl.Context, expansion: LineTaylorLocalExpansion | LocalExpansionBase, strength_usage=None, value_dtypes=None, @@ -96,9 +96,12 @@ def __init__(self, target_kernels=None): from pytools import single_valued - KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels, - strength_usage=strength_usage, source_kernels=source_kernels, - value_dtypes=value_dtypes, name=name, device=device) + KernelComputation.__init__(self, + target_kernels=target_kernels, + source_kernels=source_kernels, + strength_usage=strength_usage, + value_dtypes=value_dtypes, + name=name) self.dim = single_valued(knl.dim for knl in self.target_kernels) self.expansion = expansion @@ -106,8 +109,7 @@ def __init__(self, def get_cache_key(self): return (type(self).__name__, self.expansion, tuple(self.target_kernels), tuple(self.source_kernels), tuple(self.strength_usage), - tuple(self.value_dtypes), - self.device.hashable_model_and_version_identifier) + tuple(self.value_dtypes)) def _expand(self, sac, avec, bvec, rscale, isrc): from sumpy.symbolic import PymbolicToSympyMapper @@ -152,12 +154,13 @@ def get_loopy_insns_and_result_names(self): logger.info("compute expansion expressions: start") + import pymbolic as prim rscale = sym.Symbol("rscale") - isrc_sym = var("isrc") + isrc_sym = prim.var("isrc") coefficients = self._expand(sac, avec, bvec, rscale, isrc_sym) result_names = [self._evaluate(sac, avec, bvec, rscale, i, coefficients) - for i in range(len(self.target_kernels))] + for i in range(self.nresults)] logger.info("compute expansion expressions: done") @@ -178,10 +181,12 @@ def get_loopy_insns_and_result_names(self): return loopy_insns, result_names def get_strength_or_not(self, isrc, kernel_idx): - return var(f"strength_{self.strength_usage[kernel_idx]}_isrc") + import pymbolic as prim + return prim.var(f"strength_{self.strength_usage[kernel_idx]}_isrc") def get_kernel_exprs(self, result_names): - exprs = [var(name) for i, name in enumerate(result_names)] + import pymbolic as prim + exprs = [prim.var(name) for i, name in enumerate(result_names)] return [lp.Assignment(id=None, assignee=f"pair_result_{i}", @@ -205,13 +210,15 @@ def get_default_src_tgt_arguments(self): *gather_loopy_source_arguments(self.source_kernels) ] - @override - def get_optimized_kernel(self, - targets_is_obj_array, sources_is_obj_array, centers_is_obj_array, + def get_optimized_kernel(self, *, + is_cpu: bool = True, + targets_is_obj_array: bool = False, + sources_is_obj_array: bool = False, + centers_is_obj_array: bool = False, # Used by pytential to override the name of the loop to be # parallelized. In the case of QBX, that's the loop over QBX # targets (not global targets). - itgt_name="itgt"): + itgt_name: str = "itgt", **kwargs: Any) -> lp.TranslationUnit: # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() @@ -222,9 +229,7 @@ def get_optimized_kernel(self, if centers_is_obj_array: loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") - import pyopencl as cl - dev = self.context.devices[0] - if dev.type & cl.device_type.CPU: + if is_cpu: loopy_knl = lp.split_iname(loopy_knl, itgt_name, 16, outer_tag="g.0", inner_tag="l.0") loopy_knl = lp.split_iname(loopy_knl, "isrc", 256) @@ -232,9 +237,11 @@ def get_optimized_kernel(self, ["isrc_outer", f"{itgt_name}_inner"]) else: from warnings import warn - warn(f"don't know how to tune layer potential computation for '{dev}'", - stacklevel=1) + warn( + "Do not know how to tune layer potential computation for " + "non-CPU targets", stacklevel=1) loopy_knl = lp.split_iname(loopy_knl, itgt_name, 128, outer_tag="g.0") + loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl) return loopy_knl @@ -242,7 +249,7 @@ def get_optimized_kernel(self, # }}} -# {{{ direct applier +# {{{ LayerPotential: direct applier class LayerPotential(LayerPotentialBase): """Direct applier for the layer potential. @@ -270,7 +277,7 @@ def get_kernel(self): for i in range(len(self.target_kernels)) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -291,13 +298,12 @@ def get_kernel(self): """ for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntargets>=1 and nsources>=1", - default_offset=lp.auto, silenced_warnings="write_race(write_lpot*)", fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in [*self.target_kernels, *self.source_kernels]: @@ -305,14 +311,16 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, + def __call__(self, actx: ArrayContext, + targets, sources, centers, strengths, expansion_radii, **kwargs): """ :arg strengths: are required to have area elements and quadrature weights already multiplied in. """ - knl = self.get_cached_kernel_executor( + knl = self.get_cached_kernel( + is_cpu=is_cl_cpu(actx), targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) @@ -320,13 +328,20 @@ def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, for i, dens in enumerate(strengths): kwargs[f"strength_{i}"] = dens - return knl(queue, sources=sources, targets=targets, center=centers, - expansion_radii=expansion_radii, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + **kwargs) + + return obj_array.new_1d([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ matrix writer +# {{{ LayerPotentialMatrixGenerator: matrix writer class LayerPotentialMatrixGenerator(LayerPotentialBase): """Generator for layer potential matrix entries.""" @@ -350,7 +365,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -369,12 +384,11 @@ def get_kernel(self): """ for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntargets>=1 and nsources>=1", - default_offset=lp.auto, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for expn in [*self.source_kernels, *self.target_kernels]: @@ -382,19 +396,28 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): - knl = self.get_cached_kernel_executor( + def __call__(self, actx: ArrayContext, + targets, sources, centers, expansion_radii, **kwargs): + knl = self.get_cached_kernel( + is_cpu=is_cl_cpu(actx), targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) - return knl(queue, sources=sources, targets=targets, center=centers, - expansion_radii=expansion_radii, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + **kwargs) + + return obj_array.new_1d([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ matrix subset generator +# {{{ LayerPotentialMatrixSubsetGenerator: matrix subset generator class LayerPotentialMatrixSubsetGenerator(LayerPotentialBase): """Generator for a subset of the layer potential matrix entries. @@ -425,7 +448,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([ + loopy_knl = make_loopy_program([ "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}" ], self.get_kernel_scaling_assignments() @@ -448,13 +471,12 @@ def get_kernel(self): """ for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="nresult>=1", - default_offset=lp.auto, silenced_warnings="write_race(write_lpot*)", fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.add_dtypes( @@ -481,8 +503,9 @@ def get_optimized_kernel(self, loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl) return loopy_knl - def __call__(self, queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, **kwargs): + def __call__(self, actx: ArrayContext, + targets, sources, centers, expansion_radii, + tgtindices, srcindices, **kwargs): """Evaluate a subset of the QBX matrix interactions. :arg targets: target point coordinates, which can be an object @@ -504,18 +527,21 @@ def __call__(self, queue, targets, sources, centers, expansion_radii, in (*srcindices*, *tgtindices*) """ - knl = self.get_cached_kernel_executor( + knl = self.get_cached_kernel( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) - return knl(queue, - sources=sources, - targets=targets, - center=centers, - expansion_radii=expansion_radii, - tgtindices=tgtindices, - srcindices=srcindices, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + tgtindices=tgtindices, + srcindices=srcindices, **kwargs) + + return obj_array.new_1d([result[f"result_{i}"] for i in range(self.nresults)]) # }}} @@ -617,6 +643,7 @@ class _JumpTermSymbolicArgumentProvider: data was requested. This tracking allows assembling the argument list of the resulting computational kernel. """ + dim: int def __init__(self, data_args, dim, density_var_name, density_dtype, geometry_dtype): @@ -630,26 +657,29 @@ def __init__(self, data_args, dim, density_var_name, @property @memoize_method def density(self): + import pymbolic as prim self.arguments[self.density_var_name] = \ lp.GlobalArg(self.density_var_name, self.density_dtype, shape="ntargets", order="C") - return parse(f"{self.density_var_name}[itgt]") + return prim.parse(f"{self.density_var_name}[itgt]") @property @memoize_method def density_prime(self): + import pymbolic as prim prime_var_name = f"{self.density_var_name}_prime" self.arguments[prime_var_name] = ( lp.GlobalArg(prime_var_name, self.density_dtype, shape="ntargets", order="C")) - return parse(f"{prime_var_name}[itgt]") + return prim.parse(f"{prime_var_name}[itgt]") @property @memoize_method def side(self): + import pymbolic as prim self.arguments["side"] = ( lp.GlobalArg("side", self.geometry_dtype, shape="ntargets")) - return parse("side[itgt]") + return prim.parse("side[itgt]") @property @memoize_method @@ -674,11 +704,12 @@ def tangent(self): @property @memoize_method def mean_curvature(self): + import pymbolic as prim self.arguments["mean_curvature"] = ( lp.GlobalArg("mean_curvature", self.geometry_dtype, shape="ntargets", order="C")) - return parse("mean_curvature[itgt]") + return prim.parse("mean_curvature[itgt]") @property @memoize_method diff --git a/sumpy/test/test_distributed.py b/sumpy/test/test_distributed.py index f9bb00e4..29f339bb 100644 --- a/sumpy/test/test_distributed.py +++ b/sumpy/test/test_distributed.py @@ -23,13 +23,23 @@ THE SOFTWARE. """ +import logging import os from functools import partial import numpy as np import pytest -import pyopencl as cl +from arraycontext import pytest_generate_tests_for_array_contexts + +from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf + + +logger = logging.getLogger(__name__) + +pytest_generate_tests = pytest_generate_tests_for_array_contexts([ + PytestPyOpenCLArrayContextFactory, + ]) # Note: Do not import mpi4py.MPI object at the module level, because OpenMPI does not @@ -57,14 +67,13 @@ def _test_against_single_rank( set_cache_dir(mpi_rank) # Configure array context - cl_context = cl.create_some_context() - queue = cl.CommandQueue(cl_context) + actx = _acf() def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): return max(level, 3) from boxtree.traversal import FMMTraversalBuilder - traversal_builder = FMMTraversalBuilder(cl_context, well_sep_is_n_away=2) + traversal_builder = FMMTraversalBuilder(actx, well_sep_is_n_away=2) from sumpy.expansion import DefaultExpansionFactory from sumpy.kernel import LaplaceKernel @@ -78,34 +87,30 @@ def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): from sumpy.fmm import SumpyTreeIndependentDataForWrangler tree_indep = SumpyTreeIndependentDataForWrangler( - cl_context, multipole_expansion_factory, local_expansion_factory, [kernel]) + actx, multipole_expansion_factory, local_expansion_factory, [kernel]) global_tree_dev = None - sources_weights = cl.array.empty(queue, 0, dtype=dtype) + sources_weights = actx.np.zeros(0, dtype=dtype) if mpi_rank == 0: # Generate random particles and source weights from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(queue, nsources, dims, dtype, seed=15) - targets = p_normal(queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) # FIXME: Use arraycontext instead of raw PyOpenCL arrays - from pyopencl.clrandom import PhiloxGenerator - rng = PhiloxGenerator(cl_context, seed=20) - sources_weights = rng.uniform(queue, nsources, dtype=np.float64) - - rng = PhiloxGenerator(cl_context, seed=22) - target_radii = rng.uniform( - queue, ntargets, a=0, b=0.05, dtype=np.float64) + rng = np.random.default_rng(20) + sources_weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) + target_radii = actx.from_numpy(0.05 * rng.random(ntargets, dtype=np.float64)) # Build the tree and interaction lists from boxtree import TreeBuilder - tb = TreeBuilder(cl_context) + tb = TreeBuilder(actx) global_tree_dev, _ = tb( - queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.25, max_particles_in_box=30, debug=True) - global_trav_dev, _ = traversal_builder(queue, global_tree_dev, debug=True) + global_trav_dev, _ = traversal_builder(actx, global_tree_dev, debug=True) from sumpy.fmm import SumpyExpansionWrangler wrangler = SumpyExpansionWrangler(tree_indep, global_trav_dev, dtype, @@ -113,32 +118,29 @@ def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): # Compute FMM with one MPI rank from boxtree.fmm import drive_fmm - shmem_potential = drive_fmm(wrangler, [sources_weights]) + shmem_potential = drive_fmm(actx, wrangler, [sources_weights]) # Compute FMM using the distributed implementation def wrangler_factory(local_traversal, global_traversal): from sumpy.distributed import DistributedSumpyExpansionWrangler return DistributedSumpyExpansionWrangler( - cl_context, comm, tree_indep, local_traversal, global_traversal, dtype, + actx, comm, tree_indep, local_traversal, global_traversal, dtype, fmm_level_to_order, communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) from boxtree.distributed import DistributedFMMRunner distributed_fmm_info = DistributedFMMRunner( - queue, global_tree_dev, traversal_builder, wrangler_factory, comm=comm) + actx, global_tree_dev, traversal_builder, wrangler_factory, comm=comm) - timing_data = {} - distributed_potential = distributed_fmm_info.drive_dfmm( - [sources_weights], timing_data=timing_data) - assert timing_data + distributed_potential = distributed_fmm_info.drive_dfmm(actx, [sources_weights]) if mpi_rank == 0: assert shmem_potential.shape == (1,) assert distributed_potential.shape == (1,) - shmem_potential = shmem_potential[0].get() - distributed_potential = distributed_potential[0].get() + shmem_potential = actx.to_numpy(shmem_potential[0]) + distributed_potential = actx.to_numpy(distributed_potential[0]) error = (np.linalg.norm(distributed_potential - shmem_potential, ord=np.inf) / np.linalg.norm(shmem_potential, ord=np.inf)) diff --git a/sumpy/test/test_fmm.py b/sumpy/test/test_fmm.py index 0ddae297..5f54e327 100644 --- a/sumpy/test/test_fmm.py +++ b/sumpy/test/test_fmm.py @@ -34,7 +34,11 @@ import pytest import pytools.obj_array as obj_array -from arraycontext import ArrayContextFactory, pytest_generate_tests_for_array_contexts +from arraycontext import ( + ArrayContextFactory, + PyOpenCLArrayContext, + pytest_generate_tests_for_array_contexts, +) from sumpy.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401 from sumpy.expansion.local import ( @@ -146,17 +150,24 @@ def _test_sumpy_fmm( actx = actx_factory() + if fft_backend == "pyvkfft": + from pyopencl.characterize import get_pocl_version + if (isinstance(actx, PyOpenCLArrayContext) + and get_pocl_version(actx.queue.device.platform) >= (7,)): + pytest.skip("pocl 7 and pyvkfft don't get along: " + "https://github.com/pocl/pocl/issues/2069") + nsources = 1000 ntargets = 300 dtype = np.float64 from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) if 1: offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) del offset else: @@ -166,20 +177,19 @@ def _test_sumpy_fmm( targets = obj_array.new_1d([fp.points[i] for i in range(knl.dim)]) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) # {{{ plot tree if 0: - host_tree = tree.get(actx.queue) - host_trav = trav.get(actx.queue) + host_tree = actx.to_numpy(tree) + host_trav = actx.to_numpy(trav) if 0: logger.info("src_box: %s", host_tree.find_box_nr_for_source(403)) @@ -247,7 +257,7 @@ def _test_sumpy_fmm( local_expansion_factory = partial(local_expn_class, knl) tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), local_expansion_factory, target_kernels) @@ -266,12 +276,11 @@ def fmm_level_to_order(kernel, kernel_args, tree, lev): from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) from sumpy import P2P - p2p = P2P(actx.context, target_kernels, exclude_self=False) - _evt, (ref_pot,) = p2p(actx.queue, targets, sources, (weights,), - **extra_kwargs) + p2p = P2P(target_kernels, exclude_self=False) + ref_pot, = p2p(actx, targets, sources, (weights,), **extra_kwargs) pot = actx.to_numpy(pot) ref_pot = actx.to_numpy(ref_pot) @@ -305,21 +314,19 @@ def test_coeff_magnitude_rscale(actx_factory: ArrayContextFactory, knl): from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(31) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -338,7 +345,7 @@ def test_coeff_magnitude_rscale(actx_factory: ArrayContextFactory, knl): target_kernels = [knl] tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels) @@ -351,9 +358,10 @@ def fmm_level_to_order(kernel, kernel_args, tree, lev): kernel_extra_kwargs=extra_kwargs) weights = wrangler.reorder_sources(weights) - (weights,) = wrangler.distribute_source_weights((weights,), None) + (weights,) = wrangler.distribute_source_weights(actx, (weights,), None) - local_result, _ = wrangler.form_locals( + local_result = wrangler.form_locals( + actx, trav.level_start_target_or_target_parent_box_nrs, trav.target_or_target_parent_boxes, trav.from_sep_bigger_starts, @@ -393,22 +401,20 @@ def test_unified_single_and_double(actx_factory: ArrayContextFactory, visualize= from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) del offset from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = ( @@ -439,7 +445,7 @@ def test_unified_single_and_double(actx_factory: ArrayContextFactory, visualize= if deriv_knl in source_kernels: source_extra_kwargs["dir_vec"] = dir_vec tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=target_kernels, source_kernels=source_kernels, @@ -450,7 +456,7 @@ def test_unified_single_and_double(actx_factory: ArrayContextFactory, visualize= from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, weights) + pot = drive_fmm(actx, wrangler, weights) results.append(np.array([actx.to_numpy(pot[0]), actx.to_numpy(pot[1])])) ref_pot = results[0] + results[1] @@ -488,16 +494,15 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) mpole_expn_class = VolumeTaylorMultipoleExpansion order = 1 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -515,7 +520,7 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) knl, local_expn_class)() tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl, m2l_translation_override=m2l_translation), target_kernels) @@ -524,11 +529,7 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order) from boxtree.fmm import drive_fmm - timing_data = {} - _pot, = drive_fmm(wrangler, (weights,), timing_data=timing_data) - logger.info("timing_data:\n%s", timing_data) - - assert timing_data + _pot, = drive_fmm(actx, wrangler, (weights,)) def test_sumpy_fmm_exclude_self(actx_factory: ArrayContextFactory, visualize=False): @@ -547,16 +548,16 @@ def test_sumpy_fmm_exclude_self(actx_factory: ArrayContextFactory, visualize=Fal mpole_expn_class = VolumeTaylorMultipoleExpansion order = 10 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True) + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -567,7 +568,7 @@ def test_sumpy_fmm_exclude_self(actx_factory: ArrayContextFactory, visualize=Fal target_kernels = [knl] tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels, @@ -579,12 +580,11 @@ def test_sumpy_fmm_exclude_self(actx_factory: ArrayContextFactory, visualize=Fal from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) from sumpy import P2P - p2p = P2P(actx.context, target_kernels, exclude_self=True) - _evt, (ref_pot,) = p2p(actx.queue, sources, sources, (weights,), - **self_extra_kwargs) + p2p = P2P(target_kernels, exclude_self=True) + ref_pot, = p2p(actx, sources, sources, (weights,), **self_extra_kwargs) pot = actx.to_numpy(pot) ref_pot = actx.to_numpy(ref_pot) @@ -617,17 +617,15 @@ def test_sumpy_axis_source_derivative( mpole_expn_class = VolumeTaylorMultipoleExpansion order = 10 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(12) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -642,7 +640,7 @@ def test_sumpy_axis_source_derivative( (AxisTargetDerivative(0, knl), knl), (knl, AxisSourceDerivative(0, knl))]: tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=[tgt_knl], @@ -655,7 +653,7 @@ def test_sumpy_axis_source_derivative( from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) pots.append(actx.to_numpy(pot)) rel_err = la.norm(pots[0] + pots[1]) / la.norm(pots[0]) @@ -688,17 +686,17 @@ def test_sumpy_target_point_multiplier( mpole_expn_class = VolumeTaylorMultipoleExpansion order = 5 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(12) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -714,7 +712,7 @@ def test_sumpy_target_point_multiplier( tgt_knls[1] = AxisTargetDerivative(axis, tgt_knls[1]) tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=tgt_knls, @@ -727,7 +725,7 @@ def test_sumpy_target_point_multiplier( from boxtree.fmm import drive_fmm - pot0, pot1, pot2 = drive_fmm(wrangler, (weights,)) + pot0, pot1, pot2 = drive_fmm(actx, wrangler, (weights,)) pot0, pot1, pot2 = actx.to_numpy(pot0), actx.to_numpy(pot1), actx.to_numpy(pot2) if deriv_axes == (0,): ref_pot = pot1 * actx.to_numpy(sources[0]) + pot2 diff --git a/sumpy/test/test_kernels.py b/sumpy/test/test_kernels.py index b0553807..2167a56a 100644 --- a/sumpy/test/test_kernels.py +++ b/sumpy/test/test_kernels.py @@ -89,9 +89,7 @@ def test_p2p(actx_factory: ArrayContextFactory, exclude_self): from sumpy.p2p import P2P lknl = LaplaceKernel(dimensions) - knl = P2P(actx.context, - [lknl, AxisTargetDerivative(0, lknl)], - exclude_self=exclude_self) + knl = P2P([lknl, AxisTargetDerivative(0, lknl)], exclude_self=exclude_self) rng = np.random.default_rng(42) targets = rng.random(size=(dimensions, n)) @@ -104,8 +102,8 @@ def test_p2p(actx_factory: ArrayContextFactory, exclude_self): extra_kwargs["target_to_source"] = ( actx.from_numpy(np.arange(n, dtype=np.int32))) - _evt, (potential, _x_derivative) = knl( - actx.queue, + potential, _ = knl( + actx, actx.from_numpy(targets), actx.from_numpy(sources), [actx.from_numpy(strengths)], @@ -202,11 +200,11 @@ def test_p2e_multiple( rscale = 0.5 # pick something non-1 # apply p2e at the same time - p2e = P2EFromSingleBox(actx.context, expn, + p2e = P2EFromSingleBox(expn, kernels=source_kernels, strength_usage=[0, 1]) - _evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -218,7 +216,6 @@ def test_p2e_multiple( rscale=rscale, dir_vec=dir_vec, **extra_kwargs) - actual_result = actx.to_numpy(mpoles) # apply p2e separately @@ -228,10 +225,10 @@ def test_p2e_multiple( if isinstance(source_kernel, DirectionalSourceDerivative): extra_source_kwargs["dir_vec"] = dir_vec - p2e = P2EFromSingleBox(actx.context, expn, + p2e = P2EFromSingleBox(expn, kernels=[source_kernel], strength_usage=[i]) - _evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -318,9 +315,9 @@ def test_p2e2p( expn = expn_class(knl, order=order) from sumpy import P2P, E2PFromSingleBox, P2EFromSingleBox - p2e = P2EFromSingleBox(actx.context, expn, kernels=[knl]) - e2p = E2PFromSingleBox(actx.context, expn, kernels=target_kernels) - p2p = P2P(actx.context, target_kernels, exclude_self=False) + p2e = P2EFromSingleBox(expn, kernels=[knl]) + e2p = E2PFromSingleBox(expn, kernels=target_kernels) + p2p = P2P(target_kernels, exclude_self=False) from pytools.convergence import EOCRecorder eoc_rec_pot = EOCRecorder() @@ -372,7 +369,7 @@ def test_p2e2p( # {{{ apply p2e - _evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -394,8 +391,8 @@ def test_p2e2p( box_target_counts_nonchild = ( actx.from_numpy(np.array([ntargets], dtype=np.int32))) - _evt, (pot, grad_x, ) = e2p( - actx.queue, + pot, grad_x = e2p( + actx, src_expansions=mpoles, src_base_ibox=0, target_boxes=source_boxes, @@ -412,8 +409,8 @@ def test_p2e2p( # {{{ compute (direct) reference solution - _evt, (pot_direct, grad_x_direct, ) = p2p( - actx.queue, + pot_direct, grad_x_direct = p2p( + actx, targets, sources, (strengths,), **extra_source_kwargs) pot_direct = actx.to_numpy(pot_direct) @@ -594,7 +591,6 @@ def test_translations( m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)() toy_ctx = t.ToyContext( - actx.context, kernel=knl, local_expn_class=partial(local_expn_class, m2l_translation_override=m2l_translation), @@ -603,7 +599,7 @@ def test_translations( ) p = t.PointSources(toy_ctx, sources, weights=strengths) - p2p = p.eval(targets) + p2p = p.eval(actx, targets) m1_rscale = 0.5 m2_rscale = 0.25 @@ -611,27 +607,32 @@ def test_translations( l2_rscale = 0.25 for order in orders: - print(centers[:, 0].shape) - p2m = t.multipole_expand(p, centers[:, 0], order, m1_rscale) - p2m2p = p2m.eval(targets) + logger.info("Centers: %s", centers[:, 0].shape) + + p2m = t.multipole_expand(actx, p, centers[:, 0], + order=order, rscale=m1_rscale) + p2m2p = p2m.eval(actx, targets) err = la.norm((p2m2p - p2p) / res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2p.add_data_point(order, err) - p2m2m = t.multipole_expand(p2m, centers[:, 1], order, m2_rscale) - p2m2m2p = p2m2m.eval(targets) + p2m2m = t.multipole_expand(actx, p2m, centers[:, 1], + order=order, rscale=m2_rscale) + p2m2m2p = p2m2m.eval(actx, targets) err = la.norm((p2m2m2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2m2p.add_data_point(order, err) - p2m2m2l = t.local_expand(p2m2m, centers[:, 2], order, l1_rscale) - p2m2m2l2p = p2m2m2l.eval(targets) + p2m2m2l = t.local_expand(actx, p2m2m, centers[:, 2], + order=order, rscale=l1_rscale) + p2m2m2l2p = p2m2m2l.eval(actx, targets) err = la.norm((p2m2m2l2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2m2l2p.add_data_point(order, err) - p2m2m2l2l = t.local_expand(p2m2m2l, centers[:, 3], order, l2_rscale) - p2m2m2l2l2p = p2m2m2l2l.eval(targets) + p2m2m2l2l = t.local_expand(actx, p2m2m2l, centers[:, 3], + order=order, rscale=l2_rscale) + p2m2m2l2l2p = p2m2m2l2l.eval(actx, targets) err = la.norm((p2m2m2l2l2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_full.add_data_point(order, err) @@ -825,7 +826,6 @@ def test_m2m_compressed_error_helmholtz(actx_factory: ArrayContextFactory, dim, for i, (mpole_expn_class, local_expn_class) in \ enumerate(zip(mpole_expn_classes, local_expn_classes, strict=True)): tctx = toys.ToyContext( - actx.context, knl, extra_kernel_kwargs=extra_kernel_kwargs, local_expn_class=local_expn_class, @@ -837,15 +837,15 @@ def test_m2m_compressed_error_helmholtz(actx_factory: ArrayContextFactory, dim, np.ones(sources.shape[-1]) ) - mexp = toys.multipole_expand(pt_src, + mexp = toys.multipole_expand(actx, pt_src, center=mpole_center.reshape(dim), order=order, rscale=h) - mexp2 = toys.multipole_expand(mexp, + mexp2 = toys.multipole_expand(actx, mexp, center=second_center.reshape(dim), order=order, rscale=h) - m2m_vals[i] = mexp2.eval(targets) + m2m_vals[i] = mexp2.eval(actx, targets) err = np.linalg.norm(m2m_vals[1] - m2m_vals[0]) \ / np.linalg.norm(m2m_vals[1]) @@ -902,13 +902,13 @@ def test_jump( dlp_knl = DirectionalSourceDerivative(kernel) from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, + lpot = LayerPotential( expansion=LineTaylorLocalExpansion(kernel, order=order), source_kernels=(dlp_knl,), target_kernels=(kernel,), value_dtypes=np.complex128,) - _evt, (y,) = lpot(actx.queue, + y, = lpot(actx, actx.from_numpy(targets), actx.from_numpy(geo.nodes), actx.from_numpy(centers), diff --git a/sumpy/test/test_matrixgen.py b/sumpy/test/test_matrixgen.py index 2a9a1669..0bc4406a 100644 --- a/sumpy/test/test_matrixgen.py +++ b/sumpy/test/test_matrixgen.py @@ -75,9 +75,9 @@ def _build_subset_indices(actx, ntargets, nsources, factor): rng = np.random.default_rng() if abs(factor - 1.0) > 1.0e-14: tgtindices = rng.choice(tgtindices, - size=int(factor * ntargets), replace=False) + size=int(factor * ntargets), replace=False) srcindices = rng.choice(srcindices, - size=int(factor * nsources), replace=False) + size=int(factor * nsources), replace=False) else: rng.shuffle(tgtindices) rng.shuffle(srcindices) @@ -120,59 +120,61 @@ def test_qbx_direct( expn = LineTaylorLocalExpansion(knl, order) from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, expansion=expn, source_kernels=(knl,), - target_kernels=(base_knl,)) + lpot = LayerPotential( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) from sumpy.qbx import LayerPotentialMatrixGenerator - mat_gen = LayerPotentialMatrixGenerator(actx.context, - expansion=expn, - source_kernels=(knl,), - target_kernels=(base_knl,)) + mat_gen = LayerPotentialMatrixGenerator( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) from sumpy.qbx import LayerPotentialMatrixSubsetGenerator - blk_gen = LayerPotentialMatrixSubsetGenerator(actx.context, - expansion=expn, - source_kernels=(knl,), - target_kernels=(base_knl,)) + blk_gen = LayerPotentialMatrixSubsetGenerator( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) for n in [200, 300, 400]: - targets, sources, centers, expansion_radii, sigma = \ - _build_geometry(actx, n, n, mode_nr, target_radius=1.2) + targets, sources, centers, expansion_radii, sigma = ( + _build_geometry(actx, n, n, mode_nr, target_radius=1.2)) h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices, srcindices = _build_subset_indices(actx, - ntargets=n, nsources=n, factor=factor) + tgtindices, srcindices = ( + _build_subset_indices(actx, ntargets=n, nsources=n, factor=factor)) extra_kwargs = {} if lpot_id == 2: extra_kwargs["dsource_vec"] = ( - actx.from_numpy(obj_array.new_1d(np.ones((ndim, n)))) - ) + actx.from_numpy(obj_array.new_1d(np.ones((ndim, n)))) + ) - _, (result_lpot,) = lpot(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, - strengths=strengths, **extra_kwargs) + result_lpot, = lpot(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + strengths=strengths, **extra_kwargs) result_lpot = actx.to_numpy(result_lpot) - _, (mat,) = mat_gen(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, **extra_kwargs) + mat, = mat_gen(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, **extra_kwargs) mat = actx.to_numpy(mat) result_mat = mat @ actx.to_numpy(strengths[0]) - _, (blk,) = blk_gen(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, - tgtindices=tgtindices, - srcindices=srcindices, **extra_kwargs) + blk, = blk_gen(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + tgtindices=tgtindices, + srcindices=srcindices, **extra_kwargs) blk = actx.to_numpy(blk) tgtindices = actx.to_numpy(tgtindices) @@ -214,14 +216,15 @@ def test_p2p_direct( raise ValueError(f"unknown lpot_id: '{lpot_id}'") from sumpy.p2p import P2P - lpot = P2P(actx.context, [lknl], exclude_self=exclude_self) + lpot = P2P(target_kernels=[lknl], exclude_self=exclude_self) from sumpy.p2p import P2PMatrixGenerator - mat_gen = P2PMatrixGenerator(actx.context, [lknl], exclude_self=exclude_self) + mat_gen = P2PMatrixGenerator( + target_kernels=[lknl], exclude_self=exclude_self) from sumpy.p2p import P2PMatrixSubsetGenerator blk_gen = P2PMatrixSubsetGenerator( - actx.context, [lknl], exclude_self=exclude_self) + target_kernels=[lknl], exclude_self=exclude_self) for n in [200, 300, 400]: targets, sources, _, _, sigma = ( @@ -229,8 +232,8 @@ def test_p2p_direct( h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices, srcindices = _build_subset_indices(actx, - ntargets=n, nsources=n, factor=factor) + tgtindices, srcindices = ( + _build_subset_indices(actx, ntargets=n, nsources=n, factor=factor)) extra_kwargs = {} if exclude_self: @@ -239,25 +242,25 @@ def test_p2p_direct( ) if lpot_id == 2: extra_kwargs["dsource_vec"] = ( - actx.from_numpy(obj_array.new_1d(np.ones((ndim, n))))) + actx.from_numpy(obj_array.new_1d(np.ones((ndim, n))))) - _, (result_lpot,) = lpot(actx.queue, + result_lpot, = lpot(actx, targets=targets, sources=sources, strength=strengths, **extra_kwargs) result_lpot = actx.to_numpy(result_lpot) - _, (mat,) = mat_gen(actx.queue, - targets=targets, - sources=sources, **extra_kwargs) + mat, = mat_gen(actx, + targets=targets, + sources=sources, **extra_kwargs) mat = actx.to_numpy(mat) result_mat = mat @ actx.to_numpy(strengths[0]) - _, (blk,) = blk_gen(actx.queue, - targets=targets, - sources=sources, - tgtindices=tgtindices, - srcindices=srcindices, **extra_kwargs) + blk, = blk_gen(actx, + targets=targets, + sources=sources, + tgtindices=tgtindices, + srcindices=srcindices, **extra_kwargs) blk = actx.to_numpy(blk) tgtindices = actx.to_numpy(tgtindices) diff --git a/sumpy/test/test_misc.py b/sumpy/test/test_misc.py index 6cede60a..41bf03b2 100644 --- a/sumpy/test/test_misc.py +++ b/sumpy/test/test_misc.py @@ -135,7 +135,7 @@ def test_pde_check_kernels(actx_factory: ArrayContextFactory, knl_info, order=5) actx = actx_factory() dim = knl_info.kernel.dim - tctx = t.ToyContext(actx.context, knl_info.kernel, + tctx = t.ToyContext(knl_info.kernel, extra_source_kwargs=knl_info.extra_kwargs) rng = np.random.default_rng(42) @@ -151,7 +151,7 @@ def test_pde_check_kernels(actx_factory: ArrayContextFactory, knl_info, order=5) for h in [0.1, 0.05, 0.025]: cp = CalculusPatch(np.array([1, 0, 0])[:dim], h=h, order=order) - pot = pt_src.eval(cp.points) + pot = pt_src.eval(actx, cp.points) pde = knl_info.pde_func(cp, pot) @@ -334,7 +334,7 @@ def test_toy_p2e2e2p(actx_factory: ArrayContextFactory, case): from sumpy.expansion import VolumeTaylorExpansionFactory actx = actx_factory() - ctx = t.ToyContext(actx.context, + ctx = t.ToyContext( LaplaceKernel(dim), expansion_factory=VolumeTaylorExpansionFactory(), m2l_use_fft=case.m2l_use_fft) @@ -342,12 +342,12 @@ def test_toy_p2e2e2p(actx_factory: ArrayContextFactory, case): errors = [] src_pot = t.PointSources(ctx, src, weights=np.array([1.])) - pot_actual = src_pot.eval(tgt).item() + pot_actual = src_pot.eval(actx, tgt).item() for order in ORDERS_P2E2E2P: - expn = case.expansion1(src_pot, case.center1, order=order) - expn2 = case.expansion2(expn, case.center2, order=order) - pot_p2e2e2p = expn2.eval(tgt).item() + expn = case.expansion1(actx, src_pot, case.center1, order=order) + expn2 = case.expansion2(actx, expn, case.center2, order=order) + pot_p2e2e2p = expn2.eval(actx, tgt).item() errors.append(np.abs(pot_actual - pot_p2e2e2p)) conv_factor = approx_convergence_factor(1 + np.array(ORDERS_P2E2E2P), errors) diff --git a/sumpy/test/test_qbx.py b/sumpy/test/test_qbx.py index 4e9d8e71..7d7a6e6a 100644 --- a/sumpy/test/test_qbx.py +++ b/sumpy/test/test_qbx.py @@ -67,7 +67,7 @@ def test_direct_qbx_vs_eigval( from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, + lpot = LayerPotential( expansion=expn_class(lknl, order), target_kernels=(lknl,), source_kernels=(lknl,)) @@ -98,13 +98,14 @@ def test_direct_qbx_vs_eigval( expansion_radii = actx.from_numpy(radius * np.ones(n)) strengths = (actx.from_numpy(sigma * h),) - _evt, (result_qbx,) = lpot( - actx.queue, + result_qbx, = lpot( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) result_qbx = actx.to_numpy(result_qbx) - eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx))) + error = np.linalg.norm(result_ref - result_qbx, np.inf) + eocrec.add_data_point(h, error) logger.info("eoc:\n%s", eocrec) @@ -137,10 +138,14 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( from sumpy.qbx import LayerPotential - lpot_dx = LayerPotential(actx.context, expansion=expn_class(lknl, order), - target_kernels=(AxisTargetDerivative(0, lknl),), source_kernels=(lknl,)) - lpot_dy = LayerPotential(actx.context, expansion=expn_class(lknl, order), - target_kernels=(AxisTargetDerivative(1, lknl),), source_kernels=(lknl,)) + lpot_dx = LayerPotential( + expansion=expn_class(lknl, order), + target_kernels=(AxisTargetDerivative(0, lknl),), + source_kernels=(lknl,)) + lpot_dy = LayerPotential( + expansion=expn_class(lknl, order), + target_kernels=(AxisTargetDerivative(1, lknl),), + source_kernels=(lknl,)) mode_nr = 15 @@ -169,12 +174,12 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( expansion_radii = actx.from_numpy(radius * np.ones(n)) strengths = (actx.from_numpy(sigma * h),) - _evt, (result_qbx_dx,) = lpot_dx( - actx.queue, + result_qbx_dx, = lpot_dx( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) - _evt, (result_qbx_dy,) = lpot_dy( - actx.queue, + result_qbx_dy, = lpot_dy( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) @@ -184,7 +189,8 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( normals = unit_circle result_qbx = normals[0] * result_qbx_dx + normals[1] * result_qbx_dy - eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx))) + error = np.linalg.norm(result_ref - result_qbx, np.inf) + eocrec.add_data_point(h, error) if expn_class is not LineTaylorLocalExpansion: logger.info("eoc:\n%s", eocrec) diff --git a/sumpy/test/test_target_deriv.py b/sumpy/test/test_target_deriv.py index cff0b69d..dc03c431 100644 --- a/sumpy/test/test_target_deriv.py +++ b/sumpy/test/test_target_deriv.py @@ -75,13 +75,11 @@ def test_lpot_dx_jump_relation_convergence( from sumpy.qbx import LayerPotential expansion = LineTaylorLocalExpansion(knl, qbx_order) lplot_dx = LayerPotential( - actx.context, expansion=expansion, target_kernels=(AxisTargetDerivative(0, knl),), source_kernels=(knl,) ) lplot_dy = LayerPotential( - actx.context, expansion=expansion, target_kernels=(AxisTargetDerivative(1, knl),), source_kernels=(knl,) @@ -96,34 +94,35 @@ def test_lpot_dx_jump_relation_convergence( weights_nodes = actx.from_numpy(weights_nodes_h) expansion_radii_h = 4 * target_geo.area_elements / nsources + expansion_radii = actx.from_numpy(expansion_radii_h) centers_in = actx.from_numpy( targets_h - target_geo.normals * expansion_radii_h) centers_out = actx.from_numpy( targets_h + target_geo.normals * expansion_radii_h) strengths = (weights_nodes,) - _, (eval_in_dx,) = lplot_dx( - actx.queue, + (eval_in_dx,) = lplot_dx( + actx, targets, sources, centers_in, strengths, - expansion_radii=expansion_radii_h + expansion_radii=expansion_radii ) - _, (eval_in_dy,) = lplot_dy( - actx.queue, + (eval_in_dy,) = lplot_dy( + actx, targets, sources, centers_in, strengths, - expansion_radii=expansion_radii_h + expansion_radii=expansion_radii ) - _, (eval_out_dx,) = lplot_dx( - actx.queue, + (eval_out_dx,) = lplot_dx( + actx, targets, sources, centers_out, strengths, - expansion_radii=expansion_radii_h + expansion_radii=expansion_radii ) - _, (eval_out_dy,) = lplot_dy( - actx.queue, + (eval_out_dy,) = lplot_dy( + actx, targets, sources, centers_out, strengths, - expansion_radii=expansion_radii_h + expansion_radii=expansion_radii ) eval_in_dx = actx.to_numpy(eval_in_dx) diff --git a/sumpy/test/test_tools.py b/sumpy/test/test_tools.py index 89ec231d..9ba35f1e 100644 --- a/sumpy/test/test_tools.py +++ b/sumpy/test/test_tools.py @@ -98,7 +98,7 @@ def test_fft(actx_factory: ArrayContextFactory, size: int): n_batch_dims=len(inp.shape) - 1, inverse=False, complex_dtype=inp.dtype.type) - _evt, (out_dev,) = fft_func(actx.queue, y=inp_dev) + out_dev = actx.call_loopy(fft_func, y=inp_dev)["x"] assert np.allclose(actx.to_numpy(out_dev), out) diff --git a/sumpy/tools.py b/sumpy/tools.py index 036b745c..25dc4bdc 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -38,13 +38,13 @@ import numpy as np import loopy as lp -import pytools.obj_array as obj_array from pymbolic.mapper.dependency import DependencyMapper from pyopencl.characterize import get_pocl_version from pytools import memoize_method from pytools.tag import Tag, tag_dataclass import sumpy.symbolic as sym +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program if TYPE_CHECKING: @@ -53,7 +53,7 @@ from optype.numpy import Array2D import pyopencl - import pyopencl as cl + from arraycontext import ArrayContext from pymbolic.primitives import Variable from pymbolic.typing import Expression @@ -70,8 +70,6 @@ .. autofunction:: to_complex_dtype .. autofunction:: is_obj_array_like -.. autofunction:: vector_to_device -.. autofunction:: vector_from_device .. autoclass:: OrderedSet Multi-index Helpers @@ -232,27 +230,6 @@ def build_matrix(op, dtype=None, shape=None): return mat -def vector_to_device(queue, vec): - from pyopencl.array import to_device - - def to_dev(ary): - return to_device(queue, ary) - - return obj_array.vectorize(to_dev, vec) - - -def vector_from_device(queue, vec): - def from_dev(ary): - from numbers import Number - if isinstance(ary, np.number | Number): - # zero, most likely - return ary - - return ary.get(queue=queue) - - return obj_array.vectorize(from_dev, vec) - - def _merge_kernel_arguments( dictionary: dict[str, KernelArgument], arg: KernelArgument): @@ -306,13 +283,12 @@ class KernelComputation(ABC): .. automethod:: get_kernel """ - def __init__(self, ctx: cl.Context, - target_kernels: Sequence[Kernel], - source_kernels: Sequence[Kernel], - strength_usage: Sequence[int] | None = None, - value_dtypes: Sequence[numpy.dtype[Any]] | numpy.dtype[Any] | None = None, - name: str | None = None, - device: Any | None = None) -> None: + def __init__(self, + target_kernels: list[Kernel], + source_kernels: list[Kernel], + strength_usage: list[int] | None = None, + value_dtypes: list[numpy.dtype[Any]] | None = None, + name: str | None = None) -> None: """ :arg target_kernels: list of :class:`~sumpy.kernel.Kernel` instances, with :class:`sumpy.kernel.AxisTargetDerivative` as @@ -353,20 +329,18 @@ def __init__(self, ctx: cl.Context, # }}} - if device is None: - device = ctx.devices[0] - - self.context: cl.Context = ctx - self.device: cl.Device = device - - self.source_kernels: Sequence[Kernel] = tuple(source_kernels) - self.target_kernels: Sequence[Kernel] = tuple(target_kernels) - self.value_dtypes: Sequence[np.dtype[Any]] = value_dtypes - self.strength_usage: Sequence[int] = strength_usage - self.strength_count: int = strength_count + self.source_kernels = tuple(source_kernels) + self.target_kernels = tuple(target_kernels) + self.value_dtypes = value_dtypes + self.strength_usage = strength_usage + self.strength_count = strength_count self.name = name or self.default_name + @property + def nresults(self): + return len(self.target_kernels) + @property @abstractmethod def default_name(self) -> str: @@ -464,7 +438,6 @@ def __eq__(self, other): class KernelCacheMixin(ABC): - context: cl.Context name: str @abstractmethod @@ -472,15 +445,15 @@ def get_cache_key(self) -> tuple[Hashable, ...]: ... @abstractmethod - def get_kernel(self) -> lp.TranslationUnit: + def get_kernel(self, **kwargs: Any) -> lp.TranslationUnit: ... @abstractmethod - def get_optimized_kernel(self) -> lp.TranslationUnit: + def get_optimized_kernel(self, **kwargs: Any) -> lp.TranslationUnit: ... @memoize_method - def get_cached_kernel_executor(self, **kwargs) -> lp.ExecutorBase: + def get_cached_kernel(self, **kwargs) -> lp.TranslationUnit: from sumpy import CACHING_ENABLED, NO_CACHE_KERNELS, OPT_ENABLED, code_cache if CACHING_ENABLED and not ( @@ -498,7 +471,7 @@ def get_cached_kernel_executor(self, **kwargs) -> lp.ExecutorBase: try: result = code_cache[cache_key] logger.debug("%s: kernel cache hit [key=%s]", self.name, cache_key) - return result.executor(self.context) + return result except KeyError: pass @@ -519,7 +492,7 @@ def get_cached_kernel_executor(self, **kwargs) -> lp.ExecutorBase: NO_CACHE_KERNELS and self.name in NO_CACHE_KERNELS): code_cache.store_if_not_present(cache_key, knl) - return knl.executor(self.context) + return knl @staticmethod def _allow_redundant_execution_of_knl_scaling( @@ -928,12 +901,11 @@ def loopy_fft( if name is None: name = f"ifft_{n}" if inverse else f"fft_{n}" - knl = lp.make_kernel( + knl = make_loopy_program( domains, insns, kernel_data=kernel_data, name=name, fixed_parameters=fixed_parameters, - lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, index_dtype=index_dtype, ) @@ -1001,17 +973,18 @@ def _get_fft_backend(queue: pyopencl.CommandQueue) -> FFTBackend: def get_opencl_fft_app( - queue: pyopencl.CommandQueue, + actx: ArrayContext, shape: tuple[int, ...], dtype: numpy.dtype[Any], inverse: bool) -> Any: """Setup an object for out-of-place FFT on with given shape and dtype on given queue. """ + assert isinstance(actx, PyOpenCLArrayContext) assert dtype.type in (np.float32, np.float64, np.complex64, np.complex128) - backend = _get_fft_backend(queue) + backend = _get_fft_backend(actx.queue) if backend == FFTBackend.loopy: return loopy_fft( @@ -1021,15 +994,17 @@ def get_opencl_fft_app( complex_dtype=dtype.type), backend elif backend == FFTBackend.pyvkfft: from pyvkfft.opencl import VkFFTApp - app = VkFFTApp(shape=shape, dtype=dtype, queue=queue, ndim=1, inplace=False) + app = VkFFTApp( + shape=shape, dtype=dtype, # pyright: ignore[reportArgumentType] + queue=actx.queue, ndim=1, inplace=False) return app, backend else: raise RuntimeError(f"Unsupported FFT backend {backend}") def run_opencl_fft( + actx: ArrayContext, fft_app: tuple[Any, FFTBackend], - queue: pyopencl.CommandQueue, input_vec: Any, inverse: bool = False, wait_for: list[pyopencl.Event] | None = None @@ -1039,18 +1014,20 @@ def run_opencl_fft( vector. Only supports in-order queues. """ + assert isinstance(actx, PyOpenCLArrayContext) + app, backend = fft_app if backend == FFTBackend.loopy: - evt, (output_vec,) = app(queue, y=input_vec, wait_for=wait_for) - return (evt, output_vec) + evt, output_vec = app(actx.queue, y=input_vec, wait_for=wait_for) + return (evt, output_vec["x"]) elif backend == FFTBackend.pyvkfft: if wait_for is None: wait_for = [] import pyopencl as cl - import pyopencl.array as cla + queue = actx.queue if queue.device.platform.name == "NVIDIA CUDA": # NVIDIA OpenCL gives wrong event profile values with wait_for # Not passing wait_for will wait for all events queued before @@ -1066,7 +1043,8 @@ def run_opencl_fft( if app.inplace: raise RuntimeError("inplace fft is not supported") else: - output_vec = cla.empty_like(input_vec, queue) + # FIXME: All very imperative. FFT functionality should move into the actx? + output_vec = actx.np.empty_like(input_vec) # pyright: ignore[reportUnknownMemberType, reportAttributeAccessIssue] meth = app.ifft if inverse else app.fft @@ -1094,7 +1072,7 @@ def run_opencl_fft( } -def __getattr__(name): +def __getattr__(name: str): replacement_and_obj = _depr_name_to_replacement_and_obj.get(name) if replacement_and_obj is not None: replacement, obj, year = replacement_and_obj diff --git a/sumpy/toys.py b/sumpy/toys.py index e095a40a..c249ca1b 100644 --- a/sumpy/toys.py +++ b/sumpy/toys.py @@ -26,9 +26,13 @@ THE SOFTWARE. """ +import logging from functools import partial from numbers import Number -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Any + +import numpy as np +from typing_extensions import override import pytools.obj_array as obj_array from pytools import memoize_method @@ -39,7 +43,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - import pyopencl + from arraycontext import ArrayContext from sumpy.expansion import ( ExpansionFactoryBase, @@ -50,14 +54,6 @@ from sumpy.kernel import Kernel from sumpy.visualization import FieldPlotter -import logging - -import numpy as np - -import loopy as lp # noqa: F401 -import pyopencl as cl -import pyopencl.array as cl_array - logger = logging.getLogger(__name__) @@ -112,8 +108,6 @@ class ToyContext: .. automethod:: __init__ """ - cl_context: pyopencl.Context - queue: pyopencl.CommandQueue kernel: Kernel no_target_deriv_kernel: Kernel @@ -125,7 +119,6 @@ class ToyContext: extra_source_and_kernel_kwargs: Mapping[str, object] def __init__(self, - cl_context: pyopencl.Context, kernel: Kernel, mpole_expn_class: MultipoleExpansionFactory | None = None, local_expn_class: LocalExpansionFactory | None = None, @@ -133,18 +126,17 @@ def __init__(self, extra_source_kwargs: Mapping[str, object] | None = None, extra_kernel_kwargs: Mapping[str, object] | None = None, m2l_use_fft: bool | None = None): - self.cl_context = cl_context - self.queue = cl.CommandQueue(self.cl_context) self.kernel = kernel - self.no_target_deriv_kernel = TargetTransformationRemover()(kernel) if expansion_factory is None: from sumpy.expansion import DefaultExpansionFactory expansion_factory = DefaultExpansionFactory() + if mpole_expn_class is None: - mpole_expn_class = \ - expansion_factory.get_multipole_expansion_class(kernel) + mpole_expn_class = ( + expansion_factory.get_multipole_expansion_class(kernel)) + if local_expn_class is None: from sumpy.expansion.m2l import ( FFTM2LTranslationClassFactory, @@ -185,40 +177,40 @@ def __init__(self, @memoize_method def get_p2p(self): from sumpy.p2p import P2P - return P2P(self.cl_context, (self.kernel,), exclude_self=False) + return P2P((self.kernel,), exclude_self=False) @memoize_method def get_p2m(self, order: int): from sumpy import P2EFromSingleBox - return P2EFromSingleBox(self.cl_context, + return P2EFromSingleBox( self.mpole_expn_class(self.no_target_deriv_kernel, order), kernels=(self.kernel,)) @memoize_method def get_p2l(self, order: int): from sumpy import P2EFromSingleBox - return P2EFromSingleBox(self.cl_context, + return P2EFromSingleBox( self.local_expn_class(self.no_target_deriv_kernel, order), kernels=(self.kernel,)) @memoize_method def get_m2p(self, order: int): from sumpy import E2PFromSingleBox - return E2PFromSingleBox(self.cl_context, + return E2PFromSingleBox( self.mpole_expn_class(self.no_target_deriv_kernel, order), (self.kernel,)) @memoize_method def get_l2p(self, order: int): from sumpy import E2PFromSingleBox - return E2PFromSingleBox(self.cl_context, + return E2PFromSingleBox( self.local_expn_class(self.no_target_deriv_kernel, order), (self.kernel,)) @memoize_method def get_m2m(self, from_order: int, to_order: int): from sumpy import E2EFromCSR - return E2EFromCSR(self.cl_context, + return E2EFromCSR( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.mpole_expn_class(self.no_target_deriv_kernel, to_order)) @@ -239,7 +231,7 @@ def get_m2l(self, from_order: int, to_order: int): m2l_class = M2LUsingTranslationClassesDependentData else: m2l_class = E2EFromCSR - return m2l_class(self.cl_context, + return m2l_class( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @@ -248,7 +240,7 @@ def get_m2l_translation_class_dependent_data_kernel(self, from_order: int, to_order: int): from sumpy import M2LGenerateTranslationClassesDependentData - return M2LGenerateTranslationClassesDependentData(self.cl_context, + return M2LGenerateTranslationClassesDependentData( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @@ -261,21 +253,21 @@ def get_m2l_expansion_size(self, from_order: int, to_order: int): @memoize_method def get_m2l_preprocess_mpole_kernel(self, from_order: int, to_order: int): from sumpy import M2LPreprocessMultipole - return M2LPreprocessMultipole(self.cl_context, + return M2LPreprocessMultipole( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @memoize_method def get_m2l_postprocess_local_kernel(self, from_order: int, to_order: int): from sumpy import M2LPostprocessLocal - return M2LPostprocessLocal(self.cl_context, + return M2LPostprocessLocal( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @memoize_method def get_l2l(self, from_order: int, to_order: int): from sumpy import E2EFromCSR - return E2EFromCSR(self.cl_context, + return E2EFromCSR( self.local_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @@ -284,59 +276,56 @@ def get_l2l(self, from_order: int, to_order: int): # {{{ helpers -def _p2e(psource, center, rscale, order: int, p2e, expn_class, expn_kwargs): +def _p2e(actx, psource, center, rscale, order: int, p2e, expn_class, expn_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - source_boxes = cl_array.to_device( - queue, np.array([0], dtype=np.int32)) - box_source_starts = cl_array.to_device( - queue, np.array([0], dtype=np.int32)) - box_source_counts_nonchild = cl_array.to_device( - queue, np.array([psource.points.shape[-1]], dtype=np.int32)) + source_boxes = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_source_starts = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_source_counts_nonchild = actx.from_numpy( + np.array([psource.points.shape[-1]], dtype=np.int32)) center = np.asarray(center) - centers = cl_array.to_device( - queue, + centers = actx.from_numpy( np.array(center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1)) - _evt, (coeffs,) = p2e(toy_ctx.queue, + coeffs = p2e( + actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, centers=centers, - sources=cl_array.to_device(queue, psource.points), - strengths=(cl_array.to_device(queue, psource.weights),), + sources=actx.from_numpy(psource.points), + strengths=(actx.from_numpy(psource.weights),), rscale=rscale, nboxes=1, tgt_base_ibox=0, **toy_ctx.extra_source_and_kernel_kwargs) - return expn_class(toy_ctx, center, rscale, order, coeffs[0].get(queue), + return expn_class( + toy_ctx, center, rscale, order, + actx.to_numpy(coeffs[0]), derived_from=psource, **expn_kwargs) -def _e2p(psource, targets, e2p): +def _e2p(actx, psource, targets, e2p): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue ntargets = targets.shape[-1] - boxes = cl_array.to_device( - queue, np.array([0], dtype=np.int32)) - box_target_starts = cl_array.to_device( - queue, np.array([0], dtype=np.int32)) - box_target_counts_nonchild = cl_array.to_device( - queue, np.array([ntargets], dtype=np.int32)) - - centers = cl_array.to_device( - queue, + boxes = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_target_starts = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_target_counts_nonchild = actx.from_numpy( + np.array([ntargets], dtype=np.int32)) + + centers = actx.from_numpy( np.array(psource.center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1)) - from sumpy.tools import vector_to_device - - coeffs = cl_array.to_device(queue, np.array([psource.coeffs])) - _evt, (pot,) = e2p( - toy_ctx.queue, + coeffs = actx.from_numpy(np.array([psource.coeffs])) + pot, = e2p( + actx, src_expansions=coeffs, src_base_ibox=0, target_boxes=boxes, @@ -344,26 +333,25 @@ def _e2p(psource, targets, e2p): box_target_counts_nonchild=box_target_counts_nonchild, centers=centers, rscale=psource.rscale, - targets=vector_to_device(queue, obj_array.new_1d(targets)), + targets=actx.from_numpy(obj_array.new_1d(targets)), **toy_ctx.extra_kernel_kwargs) - return pot.get(queue) + return actx.to_numpy(pot) -def _e2e(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwargs, +def _e2e(actx: ArrayContext, + psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwargs, extra_kernel_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - target_boxes = cl_array.to_device( - queue, np.array([1], dtype=np.int32)) - src_box_starts = cl_array.to_device( - queue, np.array([0, 1], dtype=np.int32)) - src_box_lists = cl_array.to_device( - queue, np.array([0], dtype=np.int32)) + target_boxes = actx.from_numpy( + np.array([1], dtype=np.int32)) + src_box_starts = actx.from_numpy( + np.array([0, 1], dtype=np.int32)) + src_box_lists = actx.from_numpy( + np.array([0], dtype=np.int32)) - centers = cl_array.to_device( - queue, + centers = actx.from_numpy( np.array( [ # box 0: source @@ -375,9 +363,9 @@ def _e2e(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwa dtype=np.float64).T.copy() ) - coeffs = cl_array.to_device(queue, np.array([psource.coeffs])) + coeffs = actx.from_numpy(np.array([psource.coeffs])) args = { - "queue": toy_ctx.queue, + "actx": actx, "src_expansions": coeffs, "src_base_ibox": 0, "tgt_base_ibox": 0, @@ -392,20 +380,19 @@ def _e2e(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwa **toy_ctx.extra_kernel_kwargs, } - _evt, (to_coeffs,) = e2e(**args) - + to_coeffs = e2e(**args) return expn_class( - toy_ctx, to_center, to_rscale, to_order, to_coeffs[1].get(queue), + toy_ctx, to_center, to_rscale, to_order, + actx.to_numpy(to_coeffs[1]), derived_from=psource, **expn_kwargs) -def _m2l(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwargs, +def _m2l(actx: ArrayContext, + psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, translation_classes_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - - coeffs = cl_array.to_device(queue, np.array([psource.coeffs])) + coeffs = actx.from_numpy(np.array([psource.coeffs])) m2l_use_translation_classes_dependent_data = \ toy_ctx.use_translation_classes_dependent_data() @@ -416,36 +403,37 @@ def _m2l(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwa expn_size = translation_classes_kwargs["m2l_expn_size"] # Preprocess the mpole expansion - preprocessed_src_expansions = cl_array.zeros(queue, (1, expn_size), - dtype=np.complex128) - evt, _ = preprocess_kernel(queue, + preprocessed_src_expansions = actx.zeros((1, expn_size), dtype=np.complex128) + preprocess_kernel( + actx, src_expansions=coeffs, preprocessed_src_expansions=preprocessed_src_expansions, src_rscale=np.float64(psource.rscale), **toy_ctx.extra_kernel_kwargs) - from sumpy.tools import get_native_event, get_opencl_fft_app, run_opencl_fft + from sumpy.tools import get_opencl_fft_app, run_opencl_fft if toy_ctx.use_fft: - fft_app = get_opencl_fft_app(queue, (1, expn_size,), + + fft_app = get_opencl_fft_app(actx, (1, expn_size,), dtype=preprocessed_src_expansions.dtype, inverse=False) - ifft_app = get_opencl_fft_app(queue, (1, expn_size,), + ifft_app = get_opencl_fft_app(actx, (1, expn_size,), dtype=preprocessed_src_expansions.dtype, inverse=True) - evt, preprocessed_src_expansions = run_opencl_fft(fft_app, queue, - preprocessed_src_expansions, inverse=False, wait_for=[evt]) + _evt, preprocessed_src_expansions = run_opencl_fft(actx, fft_app, + preprocessed_src_expansions, inverse=False) # Compute translation classes data - m2l_translation_classes_lists = cl_array.to_device(queue, - np.array([0], dtype=np.int32)) + m2l_translation_classes_lists = ( + actx.from_numpy(np.array([0], dtype=np.int32))) dist = np.array(to_center - psource.center, dtype=np.float64) dim = toy_ctx.kernel.dim - m2l_translation_vectors = cl_array.to_device(queue, dist.reshape(dim, 1)) - m2l_translation_classes_dependent_data = cl_array.zeros(queue, - (1, expn_size), dtype=np.complex128) + m2l_translation_vectors = actx.from_numpy(dist.reshape(dim, 1)) + m2l_translation_classes_dependent_data = ( + actx.zeros((1, expn_size), dtype=np.complex128)) - evt, _ = data_kernel( - queue, + data_kernel( + actx, src_rscale=np.float64(psource.rscale), ntranslation_classes=1, translation_classes_level_start=0, @@ -453,15 +441,15 @@ def _m2l(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwa m2l_translation_classes_dependent_data=( m2l_translation_classes_dependent_data), ntranslation_vectors=1, - wait_for=[get_native_event(evt)], **toy_ctx.extra_kernel_kwargs) if toy_ctx.use_fft: - evt, m2l_translation_classes_dependent_data = run_opencl_fft(fft_app, - queue, m2l_translation_classes_dependent_data, inverse=False, - wait_for=[evt]) + _evt, m2l_translation_classes_dependent_data = run_opencl_fft( + actx, fft_app, + m2l_translation_classes_dependent_data, + inverse=False) - ret = _e2e(psource, to_center, to_rscale, to_order, + ret = _e2e(actx, psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, { "src_expansions": preprocessed_src_expansions, @@ -473,28 +461,32 @@ def _m2l(psource, to_center, to_rscale, to_order: int, e2e, expn_class, expn_kwa ) # Postprocess the local expansion - local_before = cl_array.to_device(queue, np.array([ret.coeffs])) - to_coeffs = cl_array.zeros(queue, (1, len(data_kernel.tgt_expansion)), - dtype=coeffs.dtype) + local_before = actx.from_numpy(np.array([ret.coeffs])) + to_coeffs = actx.zeros((1, len(data_kernel.tgt_expansion)), + dtype=coeffs.dtype) if toy_ctx.use_fft: - evt, local_before = run_opencl_fft(ifft_app, queue, - local_before, inverse=True, wait_for=[get_native_event(evt)]) + _evt, local_before = run_opencl_fft( + actx, ifft_app, + local_before, inverse=True) - evt, _ = postprocess_kernel(queue=queue, + postprocess_kernel( + actx, tgt_expansions_before_postprocessing=local_before, tgt_expansions=to_coeffs, src_rscale=np.float64(psource.rscale), tgt_rscale=np.float64(to_rscale), - wait_for=[get_native_event(evt)], **toy_ctx.extra_kernel_kwargs) return expn_class( - toy_ctx, to_center, to_rscale, to_order, to_coeffs.get(queue)[0], + toy_ctx, to_center, to_rscale, to_order, + actx.to_numpy(to_coeffs)[0], derived_from=psource, **expn_kwargs) else: - ret = _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, - expn_kwargs, {}) + ret = _e2e( + actx, + psource, to_center, to_rscale, to_order, e2e, expn_class, + expn_kwargs, {}) return ret @@ -526,7 +518,7 @@ class PotentialSource: def __init__(self, toy_ctx: ToyContext): self.toy_ctx = toy_ctx - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: """ :param targets: An array of shape ``(dim, ntargets)``. :returns: an array of shape ``(ntargets,)``. @@ -536,8 +528,7 @@ def eval(self, targets: np.ndarray) -> np.ndarray: def __neg__(self) -> PotentialSource: return -1*self - def __add__(self, other: Number_ish | PotentialSource - ) -> PotentialSource: + def __add__(self, other: Number_ish | PotentialSource) -> PotentialSource: if isinstance(other, Number | np.number): other = ConstantPotential(self.toy_ctx, other) elif not isinstance(other, PotentialSource): @@ -547,19 +538,15 @@ def __add__(self, other: Number_ish | PotentialSource __radd__ = __add__ - def __sub__(self, - other: Number_ish | PotentialSource) -> PotentialSource: + def __sub__(self, other: Number_ish | PotentialSource) -> PotentialSource: return self.__add__(-other) - # FIXME: mypy says " Forward operator "__sub__" is not callable" - # I don't know what it means. -AK, 2024-07-18 def __rsub__(self, # type:ignore[misc] other: Number_ish | PotentialSource - ) -> PotentialSource: + ) -> PotentialSource: return (-self).__add__(other) - def __mul__(self, - other: Number_ish | PotentialSource) -> PotentialSource: + def __mul__(self, other: Number_ish | PotentialSource) -> PotentialSource: if isinstance(other, Number | np.number): other = ConstantPotential(self.toy_ctx, other) elif not isinstance(other, PotentialSource): @@ -581,7 +568,7 @@ def __init__(self, toy_ctx: ToyContext, value): super().__init__(toy_ctx) self.value = np.array(value) - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: pot = np.empty(targets.shape[-1], dtype=self.value.dtype) pot.fill(self.value) return pot @@ -595,13 +582,15 @@ class OneOnBallPotential(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx: ToyContext, center: np.ndarray, radius: float) -> None: super().__init__(toy_ctx) self.center = np.asarray(center) self.radius = radius - def eval(self, targets: np.ndarray) -> np.ndarray: + @override + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: dist_vec = targets - self.center[:, np.newaxis] return (np.sum(dist_vec**2, axis=0) < self.radius**2).astype(np.float64) @@ -612,6 +601,7 @@ class HalfspaceOnePotential(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx: ToyContext, center: np.ndarray, axis: int, side: int = 1) -> None: super().__init__(toy_ctx) @@ -619,7 +609,7 @@ def __init__(self, toy_ctx: ToyContext, center: np.ndarray, self.axis = axis self.side = side - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: return ( (self.side*(targets[self.axis] - self.center[self.axis])) >= 0 ).astype(np.float64) @@ -645,16 +635,16 @@ def __init__(self, self.weights = weights self._center = center - def eval(self, targets: np.ndarray) -> np.ndarray: - queue = self.toy_ctx.queue - _evt, (potential,) = self.toy_ctx.get_p2p()( - queue, - cl_array.to_device(queue, targets), - cl_array.to_device(queue, self.points), - [cl_array.to_device(queue, self.weights)], + @override + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: + potential, = self.toy_ctx.get_p2p()( + actx, + actx.from_numpy(targets), + actx.from_numpy(self.points), + [actx.from_numpy(self.weights)], **self.toy_ctx.extra_source_and_kernel_kwargs) - return cast("cl_array.Array", potential).get(queue) + return actx.to_numpy(potential) @property def center(self): @@ -680,6 +670,7 @@ class ExpansionPotentialSource(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx, center, rscale, order: int, coeffs, derived_from, radius=None, expn_style=None, text_kwargs=None): super().__init__(toy_ctx) @@ -704,8 +695,8 @@ class MultipoleExpansion(ExpansionPotentialSource): Inherits from :class:`ExpansionPotentialSource`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: - return _e2p(self, targets, self.toy_ctx.get_m2p(self.order)) + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: + return _e2p(actx, self, targets, self.toy_ctx.get_m2p(self.order)) class LocalExpansion(ExpansionPotentialSource): @@ -713,8 +704,8 @@ class LocalExpansion(ExpansionPotentialSource): Inherits from :class:`ExpansionPotentialSource`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: - return _e2p(self, targets, self.toy_ctx.get_l2p(self.order)) + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: + return _e2p(actx, self, targets, self.toy_ctx.get_l2p(self.order)) class PotentialExpressionNode(PotentialSource): @@ -747,10 +738,11 @@ class Sum(PotentialExpressionNode): Inherits from :class:`PotentialExpressionNode`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: + @override + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: result = np.zeros(targets.shape[1]) for psource in self.psources: - result = result + psource.eval(targets) + result = result + psource.eval(actx, targets) return result @@ -760,31 +752,36 @@ class Product(PotentialExpressionNode): Inherits from :class:`PotentialExpressionNode`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: - result = np.zeros(targets.shape[1]) + @override + def eval(self, actx: ArrayContext, targets: np.ndarray) -> np.ndarray: + result = np.ones(targets.shape[1]) for psource in self.psources: - result = result * psource.eval(targets) + result = result * psource.eval(actx, targets) return result - # }}} -def multipole_expand(psource: PotentialSource, - center: np.ndarray, order: int | None = None, - rscale: float = 1, **expn_kwargs) -> MultipoleExpansion: +def multipole_expand( + actx: ArrayContext, + psource: PotentialSource, + center: np.ndarray, *, + order: int | None = None, + rscale: float = 1, + **expn_kwargs: Any) -> MultipoleExpansion: if isinstance(psource, PointSources): if order is None: raise ValueError("order may not be None") - return _p2e(psource, center, rscale, order, psource.toy_ctx.get_p2m(order), + return _p2e(actx, + psource, center, rscale, order, psource.toy_ctx.get_p2m(order), MultipoleExpansion, expn_kwargs) elif isinstance(psource, MultipoleExpansion): if order is None: order = psource.order - return _e2e(psource, center, rscale, order, + return _e2e(actx, psource, center, rscale, order, psource.toy_ctx.get_m2m(psource.order, order), MultipoleExpansion, expn_kwargs, {}) @@ -793,14 +790,18 @@ def multipole_expand(psource: PotentialSource, def local_expand( + actx: ArrayContext, psource: PotentialSource, - center: np.ndarray, order: int | None = None, rscale: Number_ish = 1, - **expn_kwargs) -> LocalExpansion: + center: np.ndarray, *, + order: int | None = None, + rscale: float = 1, + **expn_kwargs: Any) -> LocalExpansion: if isinstance(psource, PointSources): if order is None: raise ValueError("order may not be None") - return _p2e(psource, center, rscale, order, psource.toy_ctx.get_p2l(order), + return _p2e(actx, + psource, center, rscale, order, psource.toy_ctx.get_p2l(order), LocalExpansion, expn_kwargs) elif isinstance(psource, MultipoleExpansion): @@ -825,7 +826,7 @@ def local_expand( translation_classes_kwargs["m2l_expn_size"] = \ toy_ctx.get_m2l_expansion_size(psource.order, order) - return _m2l(psource, center, rscale, order, + return _m2l(actx, psource, center, rscale, order, toy_ctx.get_m2l(psource.order, order), LocalExpansion, expn_kwargs, translation_classes_kwargs) @@ -834,7 +835,7 @@ def local_expand( if order is None: order = psource.order - return _e2e(psource, center, rscale, order, + return _e2e(actx, psource, center, rscale, order, psource.toy_ctx.get_l2l(psource.order, order), LocalExpansion, expn_kwargs, {}) @@ -842,9 +843,12 @@ def local_expand( raise TypeError(f"do not know how to expand '{type(psource).__name__}'") -def logplot(fp: FieldPlotter, psource: PotentialSource, **kwargs) -> None: +def logplot( + actx: ArrayContext, + fp: FieldPlotter, + psource: PotentialSource, **kwargs) -> None: fp.show_scalar_in_matplotlib( - np.log10(np.abs(psource.eval(fp.points) + 1e-15)), **kwargs) + np.log10(np.abs(psource.eval(actx, fp.points) + 1e-15)), **kwargs) def combine_inner_outer( @@ -897,7 +901,7 @@ def combine_halfspace_and_outer( psource_outer, radius, center) -def l_inf(psource: PotentialSource, radius: float, +def l_inf(actx: ArrayContext, psource: PotentialSource, radius: float, center: np.ndarray | None = None, npoints: int = 100, debug: bool = False) -> np.number: if center is None: @@ -908,7 +912,7 @@ def l_inf(psource: PotentialSource, radius: float, from sumpy.visualization import FieldPlotter fp = FieldPlotter(center, extent=2*radius, npoints=npoints) - z = restr.eval(fp.points) + z = restr.eval(actx, fp.points) if debug: fp.show_scalar_in_matplotlib(