diff --git a/Project.toml b/Project.toml index 5e60efb..b6671ca 100644 --- a/Project.toml +++ b/Project.toml @@ -4,10 +4,26 @@ authors = ["Michael Schlottke-Lakemper "] version = "0.1.0" [deps] +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +[extras] +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + +[weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" + +[extensions] +CodexMachCUDAExt = "CUDA" +CodexMachMetalExt = "Metal" + [compat] +KernelAbstractions = "0.9.38" +CUDA = "5" +Metal = "1" Plots = "1.41.1" TimerOutputs = "0.5.29" julia = "1.11" diff --git a/README.md b/README.md index a4526c2..31fd0aa 100644 --- a/README.md +++ b/README.md @@ -27,21 +27,28 @@ The repository ships with a bare `docs/` folder ready to host Documenter.jl-base ## Example simulation -The `examples/linear_advection_demo.jl` script wires together mesh generation, -problem setup, RK2 time integration, and the high-level driver. Run it from the -repository root: +Run the linear advection demo on the CPU: ```bash -julia --project=. examples/linear_advection_demo.jl +julia --project=run examples/linear_advection_demo.jl ``` -It prints periodic RMS diagnostics along with the CFL number used for the run. -Tune parameters by editing the keyword arguments in `run_linear_advection_demo`. +Run the same demo on a Metal-capable GPU: + +```bash +julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' +``` + +The script wires together mesh generation, problem setup, RK2 time integration, +and the high-level driver. It prints periodic RMS diagnostics along with the CFL +number used for the run. Override the precision explicitly with +`state_eltype=Float64` if you want to run the GPU case in double precision (CUDA) +or stay on `Float32` for Metal. To capture diagnostics, pass output paths (created if missing): ```bash -julia --project=. examples/linear_advection_demo.jl diagnostics.csv final_state.csv +julia --project=run examples/linear_advection_demo.jl diagnostics.csv final_state.csv ``` The first file lists sampled step/time/RMS/CFL data, and the second stores the @@ -53,7 +60,7 @@ To render the sampled diagnostics (and optionally the final field), use the helper script: ```bash -julia --project=. examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png +julia --project=run examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png ``` Install `Plots.jl` in your environment first (`import Pkg; Pkg.add("Plots")`). @@ -61,10 +68,16 @@ The script falls back to a readable error if the dependency is missing. ## Convergence study -To verify spatial accuracy, run the periodic convergence sweep: +CPU run: ```bash -julia --project=. examples/convergence_linear_advection.jl +julia --project=run examples/convergence_linear_advection.jl +``` + +Metal GPU run: + +```bash +julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' ``` The driver evolves a sinusoidal field across five nested grids, prints the L₂ @@ -75,21 +88,39 @@ or refinement levels. ## Kelvin-Helmholtz instability -The compressible Euler path is exercised by -`examples/kelvin_helmholtz_euler.jl`, which seeds a periodic Kelvin-Helmholtz -roll-up on a square box: +CPU run: + +```bash +julia --project=run examples/kelvin_helmholtz_euler.jl +``` + +Metal GPU run: ```bash -julia --project=. examples/kelvin_helmholtz_euler.jl +julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' ``` By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL -control, and prints periodic log messages. Pass a file path via -`diagnostics_path` to capture per-step CFL and kinetic-energy measurements. The -routine returns the final state so you can post-process density, vorticity, or -other derived fields. Supply `pdf_path` to snapshot the terminal density field, -and `animation_path` (MP4 or GIF) plus `animation_every`/`animation_fps` to -produce a time-resolved movie. +control, and prints periodic log messages. Set `backend=:metal` (or `:cuda`) +to run the scenario on a GPU; the helper automatically falls back to `Float32` +storage for KernelAbstractions backends while keeping `Float64` for the serial +path. Pass a file path via `diagnostics_path` to capture per-step CFL and +kinetic-energy measurements. The routine returns the final state so you can +post-process density, vorticity, or other derived fields. Supply `pdf_path` to +snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus +`animation_every`/`animation_fps` to produce a time-resolved movie. + +## Backend profiling + +Compare the serial and GPU paths with the profiling helper: + +```bash +julia --project=run examples/profile_backends.jl +``` + +It benchmarks the RK2 loops for linear advection and compressible Euler across +the serial and any registered KernelAbstractions backends (CPU, Metal, CUDA), +forcing `Float32` to keep Metal hardware supported. ## Maintainer diff --git a/docs/src/api.md b/docs/src/api.md index 197d5ce..b6d32b7 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -48,6 +48,7 @@ CompressibleEulerProblem setup_compressible_euler_problem CompressibleEulerState primitive_variables +_primitive_variables_cpu ``` ## Time Integration @@ -62,6 +63,20 @@ stable_timestep cfl_number ``` +## Cell Fields + +```@docs +CellField +allocate_cellfield +allocate_like +map_components! +cell_components +component +ncomponents +spatial_size +backend +``` + ## Simulation Drivers ```@docs diff --git a/docs/src/examples.md b/docs/src/examples.md index 2e8d9b1..5b75e0a 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -5,13 +5,21 @@ CurrentModule = CodexMach ``` The `examples/` directory contains reproducible scripts that exercise the core -numerics. You can run each script directly with Julia once the project -environment has been instantiated (`julia --project=.`). +numerics. You can run each script directly with Julia once the `run/` project +environment has been instantiated (`julia --project=run`). ## Linear Advection Demo +CPU: + +``` +julia --project=run examples/linear_advection_demo.jl +``` + +Metal GPU: + ``` -julia --project=. examples/linear_advection_demo.jl +julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)' ``` This script transports a smooth sine blob around a rectangular periodic domain @@ -24,37 +32,56 @@ until it returns to its starting position. By default it: The function returns a named tuple with the step size, final time, and diagnostic history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV mirroring the on-screen log, and `state_path="state.csv"` captures the final -solution for later visualisation. +solution for later visualisation. Override the precision explicitly with +`state_eltype=Float64` (for CUDA) or `state_eltype=Float32` if you want to minimise +device memory traffic. To plot the diagnostics or the final field, call ``` -julia --project=. examples/plot_linear_advection.jl diagnostics.csv state.csv +julia --project=run examples/plot_linear_advection.jl diagnostics.csv state.csv ``` which generates a PDF overlaying the analytic and numerical solutions. ## Linear Advection Convergence Study +CPU: + +``` +julia --project=run examples/convergence_linear_advection.jl +``` + +Metal GPU: + ``` -julia --project=. examples/convergence_linear_advection.jl +julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)' ``` The convergence driver repeats the sinusoidal advection problem on a hierarchy of meshes. It prints the L² error for each resolution along with the observed order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once -the mesh is sufficiently fine. +the mesh is sufficiently fine. The serial path remains the default, so simple +invocations continue to run on the CPU without any extra keywords. ## Kelvin–Helmholtz Instability (Compressible Euler) +CPU: + +``` +julia --project=run examples/kelvin_helmholtz_euler.jl +``` + +Metal GPU: + ``` -julia --project=. examples/kelvin_helmholtz_euler.jl +julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)' ``` This example launches a Kelvin–Helmholtz roll-up on a periodic square domain using the compressible Euler equations and the slope-limited Rusanov fluxes. The script logs CFL numbers, tracks the volume-averaged kinetic energy, and can -produce publication-ready figures: +produce publication-ready figures. Further output customisation includes: - set `diagnostics_path="diagnostics.csv"` to capture the per-step log; - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`; @@ -65,6 +92,17 @@ A stable run with the default settings reaches `t = 1.5` in roughly a thousand steps with CFL ≈ 0.4–0.45. The density field develops rolled vortices that match the reference figures checked into the repository (`khi_*.pdf/mp4`). +## Backend Profiling + +``` +julia --project=run examples/profile_backends.jl +``` + +This utility benchmarks the serial driver against available KernelAbstractions +backends for both linear advection and compressible Euler. It uses `Float32` +storage so Metal-backed GPUs remain supported and prints per-backend wall-clock +timings for quick regressions after kernel changes. + ## Tips - All drivers expose keyword arguments for mesh size, timestep control, and file diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl index 8c15736..f970647 100644 --- a/examples/convergence_linear_advection.jl +++ b/examples/convergence_linear_advection.jl @@ -5,7 +5,8 @@ using Printf """ run_convergence_study(; base_resolution=(16, 16), levels=5, lengths=(1.0, 1.0), velocity=(1.0, 0.5), - final_time=0.5, cfl=0.45) + final_time=0.5, cfl=0.45, + backend=default_backend(), state_eltype=nothing) Execute a grid refinement study for periodic linear advection with a sinusoidal initial condition on a square domain. Prints the L₂ error for each resolution, @@ -16,7 +17,9 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), lengths::Tuple{<:Real,<:Real} = (1.0, 1.0), velocity::Tuple{<:Real,<:Real} = (1.0, 0.5), final_time::Real = 0.5, - cfl::Real = 0.45) + cfl::Real = 0.45, + backend::Union{ExecutionBackend,Symbol} = default_backend(), + state_eltype::Union{Nothing,Type} = nothing) @assert levels >= 2 "Need at least two levels to measure convergence" Lx, Ly = float(lengths[1]), float(lengths[2]) @@ -31,6 +34,8 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), println(" level nx ny dt L2 error EOC") init = _sinusoidal_initializer(Lx, Ly) + backend_obj = _resolve_example_backend(backend) + state_T = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype for level in 0:levels-1 nx = nx0 * 2^level @@ -38,7 +43,7 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), problem = setup_linear_advection_problem(nx, ny; lengths = (Lx, Ly), velocity = vel) - state = LinearAdvectionState(problem; init = init) + state = LinearAdvectionState(problem; init = init, T = state_T, backend = backend_obj) dt_stable = stable_timestep(problem; cfl = cfl) steps = max(1, ceil(Int, final_time / dt_stable)) @@ -46,7 +51,9 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), run_linear_advection!(state, problem; steps = steps, dt = dt) - u_num = solution(state) + u_field = solution(state) + u_num_backend = scalar_component(u_field) + u_num = Array(u_num_backend) # copy to host for error evaluation u_exact = _exact_solution(problem, vel, final_time, u_num) err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num)) @@ -74,6 +81,22 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16), errors = errors, eocs = eocs) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _sinusoidal_initializer(Lx::Float64, Ly::Float64) function init(x, y) return sin(2pi * x / Lx) * sin(2pi * y / Ly) diff --git a/examples/kelvin_helmholtz_euler.jl b/examples/kelvin_helmholtz_euler.jl index cf2ac25..9f038a9 100644 --- a/examples/kelvin_helmholtz_euler.jl +++ b/examples/kelvin_helmholtz_euler.jl @@ -5,7 +5,7 @@ using Plots """ run_kelvin_helmholtz(; nx=256, ny=256, gamma=1.4, final_time=1.5, - cfl=0.45, T=Float32, log_every=25, + cfl=0.45, T=Float32, backend=default_backend(), log_every=25, diagnostics_path=nothing, pdf_path=nothing, animation_path=nothing, @@ -22,7 +22,8 @@ function run_kelvin_helmholtz(; nx::Int = 256, gamma::Real = 1.4, final_time::Real = 1.5, cfl::Real = 0.45, - T::Type = Float32, + T::Union{Type,Nothing} = Float32, + backend::Union{ExecutionBackend,Symbol} = default_backend(), log_every::Integer = 25, diagnostics_path::Union{Nothing,AbstractString} = nothing, pdf_path::Union{Nothing,AbstractString} = nothing, @@ -42,14 +43,18 @@ function run_kelvin_helmholtz(; nx::Int = 256, gamma = gamma, boundary_conditions = PeriodicBoundaryConditions()) - init = _kelvin_helmholtz_initializer(T) - state = CompressibleEulerState(problem; T = T, init = init) + backend_obj = _resolve_example_backend(backend) + state_T = isnothing(T) ? _default_state_eltype(backend_obj) : T + + init = _kelvin_helmholtz_initializer(state_T) + state = CompressibleEulerState(problem; T = state_T, init = init, backend = backend_obj) sim_time = 0.0 step = 0 last_cfl = NaN records = diagnostics_path === nothing ? nothing : Vector{NamedTuple{(:step, :time, :cfl, :kinetic_energy),NTuple{4,Float64}}}() - prim_buffers = primitive_variables(problem, solution(state)) + prim_buffers = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state)) last_progress = time() centers_x, centers_y = cell_centers(mesh(problem)) animation_obj = animation_path === nothing ? nothing : Animation() @@ -112,6 +117,22 @@ function run_kelvin_helmholtz(; nx::Int = 256, animation_path = animation_path) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _kelvin_helmholtz_initializer(::Type{T}) where {T} slope = T(15) offset = T(7.5) @@ -132,19 +153,31 @@ function _volume_average_kinetic_energy(state::CompressibleEulerState, problem::CompressibleEulerProblem, buffers) prim = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state), rho_out = buffers.rho, u_out = buffers.u, v_out = buffers.v, p_out = buffers.p) - nx, ny = size(prim.rho) - total = zero(eltype(prim.rho)) - half = convert(eltype(prim.rho), 0.5) - @inbounds for j in 1:ny, i in 1:nx - vel2 = prim.u[i, j]^2 + prim.v[i, j]^2 - total += half * prim.rho[i, j] * vel2 + ρ = prim.rho + u = prim.u + v = prim.v + nx, ny = size(ρ) + if ρ isa Array + total = zero(eltype(ρ)) + half = convert(eltype(ρ), 0.5) + @inbounds for j in 1:ny, i in 1:nx + vel2 = u[i, j]^2 + v[i, j]^2 + total += half * ρ[i, j] * vel2 + end + return float(total) / (nx * ny), prim + else + T = eltype(ρ) + half = inv(convert(T, 2)) + vel2 = u .* u .+ v .* v + total = sum(ρ .* vel2 .* half) + return float(total) / (nx * ny), prim end - return float(total) / (nx * ny), prim end function _write_khi_diagnostics(path::AbstractString, @@ -165,6 +198,7 @@ function _write_khi_pdf(path::AbstractString, centers_x, centers_y) prim = primitive_variables(problem, solution(state); + backend = CodexMach.backend(state), rho_out = buffers.rho, u_out = buffers.u, v_out = buffers.v, @@ -176,9 +210,10 @@ function _write_khi_pdf(path::AbstractString, end function _density_plot(centers_x, centers_y, density; title::AbstractString) + dens = density isa Array ? density : Array(density) heatmap(centers_x, centers_y, - density'; + dens'; xlabel = "x", ylabel = "y", title = title, diff --git a/examples/linear_advection_demo.jl b/examples/linear_advection_demo.jl index 09913e7..1ed2348 100644 --- a/examples/linear_advection_demo.jl +++ b/examples/linear_advection_demo.jl @@ -4,6 +4,7 @@ using CodexMach """ run_linear_advection_demo(; nx=256, ny=128, lengths=(2.0, 1.0), velocity=(1.0, 0.0), cfl=0.4, steps=100, + backend=default_backend(), state_eltype=nothing, sample_every=nothing, diagnostics_path=nothing, state_path=nothing, match_return_period=true) @@ -22,6 +23,8 @@ function run_linear_advection_demo(; nx::Int = 256, velocity::Tuple{<:Real,<:Real} = (1.0, 0.0), cfl::Real = 0.4, steps::Int = 100, + backend::Union{ExecutionBackend,Symbol} = default_backend(), + state_eltype::Union{Nothing,Type} = nothing, sample_every::Union{Nothing,Int} = nothing, diagnostics_path::Union{Nothing,AbstractString} = nothing, state_path::Union{Nothing,AbstractString} = nothing, @@ -37,8 +40,14 @@ function run_linear_advection_demo(; nx::Int = 256, problem = setup_linear_advection_problem(nx_cells, ny_cells; velocity = velocity, lengths = lengths_tuple) + backend_obj = _resolve_example_backend(backend) + Tstate = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype + init_field = _sine_blob_initializer(lengths_tuple) - state = LinearAdvectionState(problem; init = init_field) + state = LinearAdvectionState(problem; + init = init_field, + T = Tstate, + backend = backend_obj) dt_cfl = stable_timestep(problem; cfl = cfl) target_time = _return_period(lengths_tuple, velocity) @@ -78,7 +87,8 @@ function run_linear_advection_demo(; nx::Int = 256, _write_diagnostics_csv(diagnostics_path, records) end - final_state = copy(solution(state)) + final_field = copy(solution(state)) + final_state = scalar_component(final_field) if state_path !== nothing _write_state_csv(state_path, problem, final_state) end @@ -91,6 +101,22 @@ function run_linear_advection_demo(; nx::Int = 256, target_time = target_time)) end +function _resolve_example_backend(spec::ExecutionBackend) + return spec +end + +function _resolve_example_backend(spec::Symbol) + return KernelAbstractionsBackend(spec) +end + +function _default_state_eltype(::ExecutionBackend) + return Float64 +end + +function _default_state_eltype(::KernelAbstractionsBackend) + return Float32 +end + function _write_diagnostics_csv(path::AbstractString, records::Vector{NamedTuple{(:step, :time, :rms, :cfl), NTuple{4,Float64}}}) open(path, "w") do io diff --git a/examples/profile_backends.jl b/examples/profile_backends.jl new file mode 100644 index 0000000..f82012f --- /dev/null +++ b/examples/profile_backends.jl @@ -0,0 +1,56 @@ +using CodexMach +try + using Metal + CodexMach.register_backend!(:metal, () -> Metal.MetalBackend()) + println("Metal backend registered") +catch err + println("Metal unavailable: ", err) +end + +function profile(label, setup) + println("\n== $label ==") + for (name, state, prob, dt, steps) in setup() + GC.gc() + elapsed = @elapsed for _ in 1:steps + rk2_step!(state, prob, dt) + end + println(rpad(name, 10), ": ", round(elapsed, digits=3), " s") + end +end + +nx = 1024 +ny = 1024 +steps = 100 +prob_la = setup_linear_advection_problem(nx, ny; velocity=(1.0, 0.5)) +dt_la = 1e-3 +profile_T = Float32 + +function advection_cases() + cases = Tuple{String,Any,Any,Float64,Int}[] + push!(cases, ("Serial", LinearAdvectionState(prob_la; init=1.0, T=profile_T), prob_la, dt_la, steps)) + push!(cases, ("KA CPU", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_la, dt_la, steps)) + if :metal in available_backends() + push!(cases, ("Metal", LinearAdvectionState(prob_la; init=1.0, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_la, dt_la, steps)) + end + return cases +end + +profile("Linear Advection $(nx)x$(ny), steps=$(steps)", advection_cases) + +prob_eu = setup_compressible_euler_problem(nx÷2, ny÷2; gamma=1.4) +init(x,y) = (; rho = 1.0 + 0.1 * sinpi(x), v1 = 0.05*cospi(y), v2 = 0.02*sinpi(x+y), p = 1.0 + 0.05*cospi(x)) +serial_state = CompressibleEulerState(prob_eu; init=init) +dt_eu = stable_timestep(prob_eu, serial_state; cfl=0.3) +steps_eu = 100 + +function euler_cases() + cases = Tuple{String,Any,Any,Float64,Int}[] + push!(cases, ("Serial", CompressibleEulerState(prob_eu; init=init, T=profile_T), prob_eu, dt_eu, steps_eu)) + push!(cases, ("KA CPU", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:cpu)), prob_eu, dt_eu, steps_eu)) + if :metal in available_backends() + push!(cases, ("Metal", CompressibleEulerState(prob_eu; init=init, T=profile_T, backend=KernelAbstractionsBackend(:metal)), prob_eu, dt_eu, steps_eu)) + end + return cases +end + +profile("Compressible Euler $(nx÷2)x$(ny÷2), steps=$(steps_eu)", euler_cases) diff --git a/ext/CodexMachCUDAExt.jl b/ext/CodexMachCUDAExt.jl new file mode 100644 index 0000000..2b47f55 --- /dev/null +++ b/ext/CodexMachCUDAExt.jl @@ -0,0 +1,13 @@ +module CodexMachCUDAExt + +using CodexMach +using KernelAbstractions +using CUDA + +function __init__() + CodexMach.register_backend!(:cuda, () -> KernelAbstractions.CUDADevice()) +end + +CodexMach._default_array_type_from_symbol(::Val{:cuda}) = CUDA.CuArray + +end # module diff --git a/ext/CodexMachMetalExt.jl b/ext/CodexMachMetalExt.jl new file mode 100644 index 0000000..2fa8915 --- /dev/null +++ b/ext/CodexMachMetalExt.jl @@ -0,0 +1,14 @@ +module CodexMachMetalExt + +using CodexMach +using KernelAbstractions +using Metal + +function __init__() + CodexMach.register_backend!(:metal, () -> Metal.MetalBackend()) +end + +CodexMach._default_array_type_from_symbol(::Val{:metal}) = Metal.MtlArray +CodexMach._default_array_type_from_spec(::Metal.MetalBackend) = Metal.MtlArray + +end # module diff --git a/src/CodexMach.jl b/src/CodexMach.jl index 90d9995..fd9675d 100644 --- a/src/CodexMach.jl +++ b/src/CodexMach.jl @@ -8,6 +8,9 @@ simulation_timers() = _SIMULATION_TIMERS include("mesh.jl") include("boundary_conditions.jl") +include("fields.jl") +include("backends.jl") +include("kernel_abstractions_support.jl") include("equations.jl") include("time_integration.jl") include("simulation.jl") @@ -20,6 +23,22 @@ export greet, PeriodicBoundaryConditions, is_periodic, periodic_axes, + CellField, + cell_components, + ncomponents, + spatial_size, + component, + scalar_component, + allocate_cellfield, + allocate_like, + map_components!, + register_backend!, + available_backends, + ExecutionBackend, + SerialBackend, + KernelAbstractionsBackend, + default_backend, + describe, LinearAdvection, LinearAdvectionProblem, velocity, @@ -31,6 +50,7 @@ export greet, RK2Workspace, solution, workspace, + backend, compute_rhs!, rk2_step!, cfl_number, diff --git a/src/backends.jl b/src/backends.jl new file mode 100644 index 0000000..d699814 --- /dev/null +++ b/src/backends.jl @@ -0,0 +1,40 @@ +abstract type ExecutionBackend end + +struct SerialBackend <: ExecutionBackend +end + +struct KernelAbstractionsBackend{D,WS} <: ExecutionBackend + device::D + workgroupsize::WS +end + +function KernelAbstractionsBackend(device; workgroupsize = nothing) + return KernelAbstractionsBackend{typeof(device), typeof(workgroupsize)}(device, workgroupsize) +end + +KernelAbstractionsBackend(; device = :cpu, workgroupsize = nothing) = + KernelAbstractionsBackend(device; workgroupsize = workgroupsize) + +default_backend() = SerialBackend() + +function describe(backend::ExecutionBackend) + backend isa SerialBackend && return "SerialBackend" + backend isa KernelAbstractionsBackend && return "KernelAbstractionsBackend($(backend.device))" + return string(typeof(backend)) +end + +function default_array_type(::SerialBackend) + return Array +end + +function default_array_type(backend::KernelAbstractionsBackend) + return _default_array_type_from_spec(backend.device) +end + +_default_array_type_from_spec(::Any) = Array + +function _default_array_type_from_spec(spec::Symbol) + return _default_array_type_from_symbol(Val(spec)) +end + +_default_array_type_from_symbol(::Val) = Array diff --git a/src/equations.jl b/src/equations.jl index a307275..563f300 100644 --- a/src/equations.jl +++ b/src/equations.jl @@ -201,22 +201,69 @@ Convert a conserved-field array with layout `(4, nx, ny)` to primitive variables. Optionally provide preallocated output arrays via the keyword arguments. Returns a named tuple `(rho, u, v, p)`. """ -function primitive_variables(eq::CompressibleEuler, - conserved::AbstractArray{T,3}; - rho_out = nothing, - u_out = nothing, - v_out = nothing, - p_out = nothing) where {T} +function _primitive_variables_cpu(eq::CompressibleEuler, + conserved::CellField; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + nx, ny = spatial_size(conserved) + + T = component_eltype(conserved) + out_T = float(T) + + ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out + u = u_out === nothing ? similar(ρ) : u_out + v = v_out === nothing ? similar(ρ) : v_out + p = p_out === nothing ? similar(ρ) : p_out + + size(ρ) == (nx, ny) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) + + ρc = component(conserved, 1) + rhouc = component(conserved, 2) + rhovc = component(conserved, 3) + Ec = component(conserved, 4) + + @inbounds for j in 1:ny, i in 1:nx + prim = primitive_variables(eq, + ρc[i, j], + rhouc[i, j], + rhovc[i, j], + Ec[i, j]) + ρ[i, j] = prim.ρ + u[i, j] = prim.u + v[i, j] = prim.v + p[i, j] = prim.p + end + + return (; rho = ρ, u = u, v = v, p = p) +end + +function _primitive_variables_cpu(eq::CompressibleEuler, + conserved::AbstractArray{T,3}; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) where {T} size(conserved, 1) == 4 || throw(ArgumentError("Conserved field must have first dimension of length 4")) nx, ny = size(conserved, 2), size(conserved, 3) + out_T = float(T) - ρ = rho_out === nothing ? Array{float(T)}(undef, nx, ny) : rho_out + ρ = rho_out === nothing ? Array{out_T}(undef, nx, ny) : rho_out u = u_out === nothing ? similar(ρ) : u_out v = v_out === nothing ? similar(ρ) : v_out p = p_out === nothing ? similar(ρ) : p_out + size(ρ) == (nx, ny) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == (nx, ny) && size(v) == (nx, ny) && size(p) == (nx, ny)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) + @inbounds for j in 1:ny, i in 1:nx prim = primitive_variables(eq, conserved[1, i, j], @@ -232,10 +279,100 @@ function primitive_variables(eq::CompressibleEuler, return (; rho = ρ, u = u, v = v, p = p) end +function _primitive_variables_gpu(eq::CompressibleEuler, + conserved::CellField, + backend::KernelAbstractionsBackend; + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + size(conserved, 1) == 4 || + throw(ArgumentError("Conserved field must have first dimension of length 4")) + + ρc = component(conserved, 1) + rhouc = component(conserved, 2) + rhovc = component(conserved, 3) + Ec = component(conserved, 4) + + ρ = rho_out === nothing ? similar(ρc) : rho_out + u = u_out === nothing ? similar(ρc) : u_out + v = v_out === nothing ? similar(ρc) : v_out + p = p_out === nothing ? similar(ρc) : p_out + + size(ρ) == size(ρc) || + throw(ArgumentError("rho_out size mismatch")) + (size(u) == size(ρc) && size(v) == size(ρc) && size(p) == size(ρc)) || + throw(ArgumentError("Primitive output buffers must match conserved field shape")) + + T = eltype(ρc) + γT = convert(T, gamma(eq)) + γm1 = γT - one(γT) + epsT = eps(T) + + primitive_variables_kernel!(backend, + ρ, u, v, p, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + + return (; rho = ρ, u = u, v = v, p = p) +end + +function primitive_variables(eq::CompressibleEuler, + conserved::CellField; + backend::ExecutionBackend = SerialBackend(), + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) + if backend isa KernelAbstractionsBackend + return _primitive_variables_gpu(eq, conserved, backend; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) + else + return _primitive_variables_cpu(eq, conserved; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) + end +end + +function primitive_variables(eq::CompressibleEuler, + conserved::AbstractArray{T,3}; + backend::ExecutionBackend = SerialBackend(), + rho_out = nothing, + u_out = nothing, + v_out = nothing, + p_out = nothing) where {T} + backend isa KernelAbstractionsBackend && + throw(ArgumentError("primitive_variables does not support generic AbstractArray inputs on GPU")) + return _primitive_variables_cpu(eq, conserved; + rho_out = rho_out, + u_out = u_out, + v_out = v_out, + p_out = p_out) +end + +primitive_variables(problem::CompressibleEulerProblem, + state; + backend::ExecutionBackend = backend(state), kwargs...) = + primitive_variables(pde(problem), + solution(state); + backend = backend, + kwargs...) + primitive_variables(problem::CompressibleEulerProblem, - state; kwargs...) = - primitive_variables(pde(problem), solution(state); kwargs...) + conserved::AbstractArray{T,3}; + backend::ExecutionBackend = SerialBackend(), kwargs...) where {T} = + primitive_variables(pde(problem), conserved; + backend = backend, + kwargs...) primitive_variables(problem::CompressibleEulerProblem, - conserved::AbstractArray{T,3}; kwargs...) where {T} = - primitive_variables(pde(problem), conserved; kwargs...) + conserved::CellField; + backend::ExecutionBackend = SerialBackend(), kwargs...) = + primitive_variables(pde(problem), conserved; + backend = backend, + kwargs...) diff --git a/src/fields.jl b/src/fields.jl new file mode 100644 index 0000000..4dca5fb --- /dev/null +++ b/src/fields.jl @@ -0,0 +1,200 @@ +""" + CellField(components...) + +Bundle one or more cell-centered arrays that share a common `(nx, ny)` shape. +Components are stored in a structure-of-arrays layout, which keeps per-variable +data contiguous for GPU and cache-friendly execution. `CellField` behaves like a +3D `AbstractArray` whose first dimension indexes the component. +""" +struct CellField{T,N,A<:NTuple{N,AbstractArray{T,2}}} <: AbstractArray{T,3} + components::A + function CellField(components::A) where {T,N,A<:NTuple{N,AbstractArray{T,2}}} + N > 0 || throw(ArgumentError("CellField must contain at least one component")) + dims = size(components[1]) + for (idx, comp) in pairs(components) + ndims(comp) == 2 || + throw(ArgumentError("Component $(idx) must be two-dimensional")) + size(comp) == dims || + throw(ArgumentError("Component $(idx) does not match CellField shape")) + end + return new{T,N,A}(components) + end +end + +CellField(component::AbstractArray{T,2}) where {T} = + CellField{T,1,Tuple{typeof(component)}}((component,)) + +""" + cell_components(field) + +Return the tuple of component arrays stored within `field`. +""" +cell_components(field::CellField) = field.components + +""" + ncomponents(field) + +Return the number of component arrays bundled inside `field`. +""" +ncomponents(::CellField{T,N}) where {T,N} = N + +""" + spatial_size(field) + +Return the `(nx, ny)` logical dimensions shared by all components. +""" +spatial_size(field::CellField) = size(cell_components(field)[1]) + +""" + component(field, i) + +Access the `i`-th component array stored inside `field`. +""" +component(field::CellField, i::Integer) = cell_components(field)[i] + +component(field::CellField, ::Val{i}) where {i} = component(field, i) + +component_eltype(::CellField{T}) where {T} = T + +scalar_component(field::CellField) = component(field, 1) + +Base.eltype(field::CellField) = component_eltype(field) + +Base.size(field::CellField) = (ncomponents(field), spatial_size(field)...) + +Base.axes(field::CellField) = (Base.OneTo(ncomponents(field)), + axes(component(field, 1))...) + +Base.IndexStyle(::Type{<:CellField}) = IndexCartesian() + +@inline function Base.getindex(field::CellField, i, j, k) + return component(field, i)[j, k] +end + +@inline function Base.getindex(field::CellField{T,1}, i, j) where {T} + return component(field, 1)[i, j] +end + +@inline function Base.getindex(field::CellField{T,1}, ::Colon, ::Colon) where {T} + return component(field, 1)[:, :] +end + +@inline function Base.getindex(field::CellField, I::CartesianIndex{3}) + return field[I[1], I[2], I[3]] +end + +@inline function Base.getindex(field::CellField, i, ::Colon, ::Colon) + return component(field, i)[:, :] +end + +@inline function Base.getindex(field::CellField, i, j, ::Colon) + return component(field, i)[j, :] +end + +@inline function Base.getindex(field::CellField, i, ::Colon, k) + return component(field, i)[:, k] +end + +@inline function Base.setindex!(field::CellField, value, i, j, k) + component(field, i)[j, k] = value + return field +end + +@inline function Base.setindex!(field::CellField{T,1}, value, i, j) where {T} + component(field, 1)[i, j] = value + return field +end + +@inline function Base.setindex!(field::CellField, value, I::CartesianIndex{3}) + field[I[1], I[2], I[3]] = value + return field +end + +""" + allocate_cellfield(array_type, T, dims, n) + +Allocate a `CellField` with `n` components, each created via +`array_type{T}(undef, dims...)`. +""" +function allocate_cellfield(array_type, + ::Type{T}, + dims::NTuple{2,<:Integer}, + n::Integer) where {T} + n > 0 || throw(ArgumentError("Number of components must be positive")) + dimsT = (Int(dims[1]), Int(dims[2])) + components = ntuple(_ -> array_type{T}(undef, dimsT...), n) + return CellField(components) +end + +""" + allocate_like(field) + +Allocate a new `CellField` whose components mirror the array types and shapes of +`field`. +""" +function allocate_like(field::CellField) + comps = ntuple(i -> similar(component(field, i)), ncomponents(field)) + return CellField(comps) +end + +""" + allocate_like(field, T) + +Allocate a new `CellField` with components similar to those in `field` but whose +entries have element type `T`. +""" +function allocate_like(field::CellField, ::Type{T}) where {T} + comps = ntuple(i -> similar(component(field, i), T, size(component(field, i))), + ncomponents(field)) + return CellField(comps) +end + +Base.fill!(field::CellField, value::Number) = begin + for comp in cell_components(field) + fill!(comp, value) + end + return field +end + +function Base.fill!(field::CellField, values::Union{Tuple,NTuple}) + length(values) == ncomponents(field) || + throw(ArgumentError("Tuple length must match number of components")) + for (comp, val) in zip(cell_components(field), values) + fill!(comp, val) + end + return field +end + +function Base.copyto!(dest::CellField, src::CellField) + ncomponents(dest) == ncomponents(src) || + throw(ArgumentError("Component count mismatch in copy")) + for i in 1:ncomponents(dest) + copyto!(component(dest, i), component(src, i)) + end + return dest +end + +Base.similar(field::CellField) = allocate_like(field) + +Base.similar(field::CellField, ::Type{T}) where {T} = allocate_like(field, T) + +""" + map_components!(f, dest, inputs...) + +Apply `f` to each component array in `dest`, passing the corresponding +components drawn from each `inputs...` collection. All inputs must either be +`CellField`s with matching component counts or scalar arrays reused for every +component. +""" +function map_components!(f, dest::CellField, inputs...) + n = ncomponents(dest) + nin = length(inputs) + for i in 1:n + args = ntuple(j -> begin + input = inputs[j] + input isa CellField ? component(input, i) : input + end, nin) + f(component(dest, i), args...) + end + return dest +end diff --git a/src/kernel_abstractions_support.jl b/src/kernel_abstractions_support.jl new file mode 100644 index 0000000..ad3c2e5 --- /dev/null +++ b/src/kernel_abstractions_support.jl @@ -0,0 +1,401 @@ +using KernelAbstractions + +const _DEVICE_REGISTRY = Dict{Symbol,Function}() + +function register_backend!(name::Symbol, factory::Function) + _DEVICE_REGISTRY[name] = factory + return name +end + +available_backends() = collect(keys(_DEVICE_REGISTRY)) + +function _resolve_ka_device(spec) + if spec isa Symbol + factory = get(_DEVICE_REGISTRY, spec) do + throw(ArgumentError("Unsupported KernelAbstractions device spec $(spec); supported values: $(join(sort!(collect(keys(_DEVICE_REGISTRY))), ", "))")) + end + return factory() + elseif spec isa KernelAbstractions.AbstractDevice + return spec + else + throw(ArgumentError("KernelAbstractionsBackend device must be a Symbol or device, got $(typeof(spec))")) + end +end + +@inline function _ka_minmod(a, b) + S = promote_type(typeof(a), typeof(b)) + if a * b <= 0 + return zero(S) + end + return S(copysign(min(abs(a), abs(b)), a)) +end + +@inline function _ka_velocity_pressure(γ, ρ, rhou, rhov, E) + invρ = one(ρ) / ρ + ux = rhou * invρ + uy = rhov * invρ + kinetic = (one(ρ) / 2) * ρ * (ux^2 + uy^2) + p = (γ - one(γ)) * (E - kinetic) + return ux, uy, p +end + +@inline function _ka_rusanov_flux_x(γ, ρL, rhouL, rhovL, EL, + ρR, rhouR, rhovR, ER) + T = promote_type(typeof(ρL), typeof(ρR)) + half = inv(T(2)) + uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) + uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) + cL = sqrt(abs(γ * pL / ρL)) + cR = sqrt(abs(γ * pR / ρR)) + smax = max(abs(uxL) + cL, abs(uxR) + cR) + + FL1 = rhouL + FL2 = rhouL * uxL + pL + FL3 = rhouL * uyL + FL4 = (EL + pL) * uxL + + FR1 = rhouR + FR2 = rhouR * uxR + pR + FR3 = rhouR * uyR + FR4 = (ER + pR) * uxR + + flux1 = half * (FL1 + FR1) - half * smax * (ρR - ρL) + flux2 = half * (FL2 + FR2) - half * smax * (rhouR - rhouL) + flux3 = half * (FL3 + FR3) - half * smax * (rhovR - rhovL) + flux4 = half * (FL4 + FR4) - half * smax * (ER - EL) + + return flux1, flux2, flux3, flux4 +end + +@inline function _ka_rusanov_flux_y(γ, ρL, rhouL, rhovL, EL, + ρR, rhouR, rhovR, ER) + T = promote_type(typeof(ρL), typeof(ρR)) + half = inv(T(2)) + uxL, uyL, pL = _ka_velocity_pressure(γ, ρL, rhouL, rhovL, EL) + uxR, uyR, pR = _ka_velocity_pressure(γ, ρR, rhouR, rhovR, ER) + cL = sqrt(abs(γ * pL / ρL)) + cR = sqrt(abs(γ * pR / ρR)) + smax = max(abs(uyL) + cL, abs(uyR) + cR) + + GL1 = rhovL + GL2 = rhovL * uxL + GL3 = rhovL * uyL + pL + GL4 = (EL + pL) * uyL + + GR1 = rhovR + GR2 = rhovR * uxR + GR3 = rhovR * uyR + pR + GR4 = (ER + pR) * uyR + + flux1 = half * (GL1 + GR1) - half * smax * (ρR - ρL) + flux2 = half * (GL2 + GR2) - half * smax * (rhouR - rhouL) + flux3 = half * (GL3 + GR3) - half * smax * (rhovR - rhovL) + flux4 = half * (GL4 + GR4) - half * smax * (ER - EL) + + return flux1, flux2, flux3, flux4 +end + +@kernel function _linear_advection_rhs_kernel!(du, u, ax, ay, inv2dx, inv2dy) + i, j = @index(Global, NTuple) + nx, ny = size(u) + if i <= nx && j <= ny + im1 = i == 1 ? nx : i - 1 + im2 = im1 == 1 ? nx : im1 - 1 + ip1 = i == nx ? 1 : i + 1 + ip2 = ip1 == nx ? 1 : ip1 + 1 + + jm1 = j == 1 ? ny : j - 1 + jm2 = jm1 == 1 ? ny : jm1 - 1 + jp1 = j == ny ? 1 : j + 1 + jp2 = jp1 == ny ? 1 : jp1 + 1 + + zero_ax = zero(ax) + dudx = zero_ax + if ax > zero_ax + dudx = (3 * u[i, j] - 4 * u[im1, j] + u[im2, j]) * inv2dx + elseif ax < zero_ax + dudx = (-3 * u[i, j] + 4 * u[ip1, j] - u[ip2, j]) * inv2dx + end + + zero_ay = zero(ay) + dudy = zero_ay + if ay > zero_ay + dudy = (3 * u[i, j] - 4 * u[i, jm1] + u[i, jm2]) * inv2dy + elseif ay < zero_ay + dudy = (-3 * u[i, j] + 4 * u[i, jp1] - u[i, jp2]) * inv2dy + end + + du[i, j] = -(ax * dudx + ay * dudy) + end +end + +@kernel function _rk2_stage_kernel!(stage, u, rhs, dt) + I = @index(Global) + if I <= length(stage) + stage[I] = u[I] + dt * rhs[I] + end +end + +@kernel function _rk2_update_kernel!(u, k1, k2, factor) + I = @index(Global) + if I <= length(u) + u[I] = u[I] + factor * (k1[I] + k2[I]) + end +end + +@kernel function _primitive_variables_kernel!(ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + i, j = @index(Global, NTuple) + nx, ny = size(ρc) + if i <= nx && j <= ny + ρval = ρc[i, j] + ρval = ρval < epsT ? epsT : ρval + invρ = one(ρval) / ρval + ux = rhouc[i, j] * invρ + uy = rhovc[i, j] * invρ + half = inv(convert(typeof(ρval), 2)) + kinetic = half * ρval * (ux^2 + uy^2) + internal = Ec[i, j] - kinetic + internal = internal < epsT ? epsT : internal + p = γm1 * internal + p = p < epsT ? epsT : p + + ρ_out[i, j] = ρval + u_out[i, j] = ux + v_out[i, j] = uy + p_out[i, j] = p + end +end + +function linear_advection_rhs_kernel!(backend::KernelAbstractionsBackend, + du, u, + nx::Int, ny::Int, + ax, ay, + inv2dx, inv2dy) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _linear_advection_rhs_kernel!(device) : + _linear_advection_rhs_kernel!(device, backend.workgroupsize) + kernel(du, u, ax, ay, inv2dx, inv2dy; ndrange = (nx, ny)) + KernelAbstractions.synchronize(device) + return du +end + +function rk2_stage_kernel!(backend::KernelAbstractionsBackend, + stage, u, rhs, dt) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _rk2_stage_kernel!(device) : + _rk2_stage_kernel!(device, backend.workgroupsize) + kernel(stage, u, rhs, dt; ndrange = length(stage)) + KernelAbstractions.synchronize(device) + return stage +end + +function rk2_update_kernel!(backend::KernelAbstractionsBackend, + u, k1, k2, factor) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _rk2_update_kernel!(device) : + _rk2_update_kernel!(device, backend.workgroupsize) + kernel(u, k1, k2, factor; ndrange = length(u)) + KernelAbstractions.synchronize(device) + return u +end + +function primitive_variables_kernel!(backend::KernelAbstractionsBackend, + ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT) + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _primitive_variables_kernel!(device) : + _primitive_variables_kernel!(device, backend.workgroupsize) + kernel(ρ_out, u_out, v_out, p_out, + ρc, rhouc, rhovc, Ec, + γm1, epsT; ndrange = size(ρc)) + KernelAbstractions.synchronize(device) + return ρ_out, u_out, v_out, p_out +end + +@kernel function _compressible_euler_rhs_kernel!(dρ, drhou, drhov, dE, + ρ, rhou, rhov, E, + γ, inv_dx, inv_dy) + i, j = @index(Global, NTuple) + nx, ny = size(ρ) + if i <= nx && j <= ny + ip = i == nx ? 1 : i + 1 + ip2 = ip == nx ? 1 : ip + 1 + im = i == 1 ? nx : i - 1 + im2 = im == 1 ? nx : im - 1 + + jp = j == ny ? 1 : j + 1 + jp2 = jp == ny ? 1 : jp + 1 + jm = j == 1 ? ny : j - 1 + jm2 = jm == 1 ? ny : jm - 1 + + half = typeof(ρ[i, j])(0.5) + + # Slopes for cell i + ΔLρ = ρ[i, j] - ρ[im, j] + ΔRρ = ρ[ip, j] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[im, j] + ΔRrhox = rhou[ip, j] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[im, j] + ΔRrhoy = rhov[ip, j] - rhov[i, j] + ΔLE = E[i, j] - E[im, j] + ΔRE = E[ip, j] - E[i, j] + + sρ = _ka_minmod(ΔLρ, ΔRρ) + srhox = _ka_minmod(ΔLrhox, ΔRrhox) + srhoy = _ka_minmod(ΔLrhoy, ΔRrhoy) + sE = _ka_minmod(ΔLE, ΔRE) + + ρL_plus = ρ[i, j] + half * sρ + rhouL_plus = rhou[i, j] + half * srhox + rhovL_plus = rhov[i, j] + half * srhoy + EL_plus = E[i, j] + half * sE + + # Slopes for cell ip + ΔLρ_ip = ρ[ip, j] - ρ[i, j] + ΔRρ_ip = ρ[ip2, j] - ρ[ip, j] + ΔLrhox_ip = rhou[ip, j] - rhou[i, j] + ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j] + ΔLrhoy_ip = rhov[ip, j] - rhov[i, j] + ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j] + ΔLE_ip = E[ip, j] - E[i, j] + ΔRE_ip = E[ip2, j] - E[ip, j] + + sρ_ip = _ka_minmod(ΔLρ_ip, ΔRρ_ip) + srhox_ip = _ka_minmod(ΔLrhox_ip, ΔRrhox_ip) + srhoy_ip = _ka_minmod(ΔLrhoy_ip, ΔRrhoy_ip) + sE_ip = _ka_minmod(ΔLE_ip, ΔRE_ip) + + ρR_plus = ρ[ip, j] - half * sρ_ip + rhouR_plus = rhou[ip, j] - half * srhox_ip + rhovR_plus = rhov[ip, j] - half * srhoy_ip + ER_plus = E[ip, j] - half * sE_ip + + flux1_plus, flux2_plus, flux3_plus, flux4_plus = + _ka_rusanov_flux_x(γ, ρL_plus, rhouL_plus, rhovL_plus, EL_plus, + ρR_plus, rhouR_plus, rhovR_plus, ER_plus) + + # Interface i-1/2 + ΔLρ_im = ρ[im, j] - ρ[im2, j] + ΔRρ_im = ρ[i, j] - ρ[im, j] + ΔLrhox_im = rhou[im, j] - rhou[im2, j] + ΔRrhox_im = rhou[i, j] - rhou[im, j] + ΔLrhoy_im = rhov[im, j] - rhov[im2, j] + ΔRrhoy_im = rhov[i, j] - rhov[im, j] + ΔLE_im = E[im, j] - E[im2, j] + ΔRE_im = E[i, j] - E[im, j] + + sρ_im = _ka_minmod(ΔLρ_im, ΔRρ_im) + srhox_im = _ka_minmod(ΔLrhox_im, ΔRrhox_im) + srhoy_im = _ka_minmod(ΔLrhoy_im, ΔRrhoy_im) + sE_im = _ka_minmod(ΔLE_im, ΔRE_im) + + ρL_minus = ρ[im, j] + half * sρ_im + rhouL_minus = rhou[im, j] + half * srhox_im + rhovL_minus = rhov[im, j] + half * srhoy_im + EL_minus = E[im, j] + half * sE_im + + ρR_minus = ρ[i, j] - half * sρ + rhouR_minus = rhou[i, j] - half * srhox + rhovR_minus = rhov[i, j] - half * srhoy + ER_minus = E[i, j] - half * sE + + flux1_minus, flux2_minus, flux3_minus, flux4_minus = + _ka_rusanov_flux_x(γ, ρL_minus, rhouL_minus, rhovL_minus, EL_minus, + ρR_minus, rhouR_minus, rhovR_minus, ER_minus) + + # Y-direction interface j+1/2 + ΔLρ_y = ρ[i, j] - ρ[i, jm] + ΔRρ_y = ρ[i, jp] - ρ[i, j] + ΔLrhox_y = rhou[i, j] - rhou[i, jm] + ΔRrhox_y = rhou[i, jp] - rhou[i, j] + ΔLrhoy_y = rhov[i, j] - rhov[i, jm] + ΔRrhoy_y = rhov[i, jp] - rhov[i, j] + ΔLE_y = E[i, j] - E[i, jm] + ΔRE_y = E[i, jp] - E[i, j] + + sρ_y = _ka_minmod(ΔLρ_y, ΔRρ_y) + srhox_y = _ka_minmod(ΔLrhox_y, ΔRrhox_y) + srhoy_y = _ka_minmod(ΔLrhoy_y, ΔRrhoy_y) + sE_y = _ka_minmod(ΔLE_y, ΔRE_y) + + ρL_plus_y = ρ[i, j] + half * sρ_y + rhouL_plus_y = rhou[i, j] + half * srhox_y + rhovL_plus_y = rhov[i, j] + half * srhoy_y + EL_plus_y = E[i, j] + half * sE_y + + ΔLρ_jp = ρ[i, jp] - ρ[i, j] + ΔRρ_jp = ρ[i, jp2] - ρ[i, jp] + ΔLrhox_jp = rhou[i, jp] - rhou[i, j] + ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp] + ΔLrhoy_jp = rhov[i, jp] - rhov[i, j] + ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp] + ΔLE_jp = E[i, jp] - E[i, j] + ΔRE_jp = E[i, jp2] - E[i, jp] + + sρ_jp = _ka_minmod(ΔLρ_jp, ΔRρ_jp) + srhox_jp = _ka_minmod(ΔLrhox_jp, ΔRrhox_jp) + srhoy_jp = _ka_minmod(ΔLrhoy_jp, ΔRrhoy_jp) + sE_jp = _ka_minmod(ΔLE_jp, ΔRE_jp) + + ρR_plus_y = ρ[i, jp] - half * sρ_jp + rhouR_plus_y = rhou[i, jp] - half * srhox_jp + rhovR_plus_y = rhov[i, jp] - half * srhoy_jp + ER_plus_y = E[i, jp] - half * sE_jp + + flux1_plus_y, flux2_plus_y, flux3_plus_y, flux4_plus_y = + _ka_rusanov_flux_y(γ, ρL_plus_y, rhouL_plus_y, rhovL_plus_y, EL_plus_y, + ρR_plus_y, rhouR_plus_y, rhovR_plus_y, ER_plus_y) + + # Interface j-1/2 + ΔLρ_jm = ρ[i, jm] - ρ[i, jm2] + ΔRρ_jm = ρ[i, j] - ρ[i, jm] + ΔLrhox_jm = rhou[i, jm] - rhou[i, jm2] + ΔRrhox_jm = rhou[i, j] - rhou[i, jm] + ΔLrhoy_jm = rhov[i, jm] - rhov[i, jm2] + ΔRrhoy_jm = rhov[i, j] - rhov[i, jm] + ΔLE_jm = E[i, jm] - E[i, jm2] + ΔRE_jm = E[i, j] - E[i, jm] + + sρ_jm = _ka_minmod(ΔLρ_jm, ΔRρ_jm) + srhox_jm = _ka_minmod(ΔLrhox_jm, ΔRrhox_jm) + srhoy_jm = _ka_minmod(ΔLrhoy_jm, ΔRrhoy_jm) + sE_jm = _ka_minmod(ΔLE_jm, ΔRE_jm) + + ρL_minus_y = ρ[i, jm] + half * sρ_jm + rhouL_minus_y = rhou[i, jm] + half * srhox_jm + rhovL_minus_y = rhov[i, jm] + half * srhoy_jm + EL_minus_y = E[i, jm] + half * sE_jm + + ρR_minus_y = ρ[i, j] - half * sρ_y + rhouR_minus_y = rhou[i, j] - half * srhox_y + rhovR_minus_y = rhov[i, j] - half * srhoy_y + ER_minus_y = E[i, j] - half * sE_y + + flux1_minus_y, flux2_minus_y, flux3_minus_y, flux4_minus_y = + _ka_rusanov_flux_y(γ, ρL_minus_y, rhouL_minus_y, rhovL_minus_y, EL_minus_y, + ρR_minus_y, rhouR_minus_y, rhovR_minus_y, ER_minus_y) + + dρ_val = -(flux1_plus - flux1_minus) * inv_dx - + (flux1_plus_y - flux1_minus_y) * inv_dy + drhou_val = -(flux2_plus - flux2_minus) * inv_dx - + (flux2_plus_y - flux2_minus_y) * inv_dy + drhov_val = -(flux3_plus - flux3_minus) * inv_dx - + (flux3_plus_y - flux3_minus_y) * inv_dy + dE_val = -(flux4_plus - flux4_minus) * inv_dx - + (flux4_plus_y - flux4_minus_y) * inv_dy + + dρ[i, j] = dρ_val + drhou[i, j] = drhou_val + drhov[i, j] = drhov_val + dE[i, j] = dE_val + end +end + +register_backend!(:cpu, () -> KernelAbstractions.CPU()) diff --git a/src/time_integration.jl b/src/time_integration.jl index 1ab79f7..cc42eb8 100644 --- a/src/time_integration.jl +++ b/src/time_integration.jl @@ -16,9 +16,10 @@ end Hold the cell-centered solution field alongside scratch buffers required by the RK2 integrator. """ -struct LinearAdvectionState{A,W} +struct LinearAdvectionState{A,W,B} solution::A workspace::W + backend::B end """ @@ -35,15 +36,23 @@ Access the Runge-Kutta scratch buffers bundled with `state`. """ workspace(state::LinearAdvectionState) = state.workspace +""" + backend(state) + +Return the execution backend associated with `state`. +""" +backend(state::LinearAdvectionState) = state.backend + """ CompressibleEulerState(solution, workspace) Hold the conserved variables for the compressible Euler system along with RK2 scratch storage. """ -struct CompressibleEulerState{A,W} +struct CompressibleEulerState{A,W,B} solution::A workspace::W + backend::B end """ @@ -60,79 +69,89 @@ Access the Runge-Kutta scratch buffers bundled with `state`. """ workspace(state::CompressibleEulerState) = state.workspace -const _DefaultArrayType = Array +backend(state::CompressibleEulerState) = state.backend function LinearAdvectionState(problem::LinearAdvectionProblem; T::Type = Float64, - array_type = _DefaultArrayType, - init = nothing) + array_type = nothing, + init = nothing, + backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) - field = _allocate_field(array_type, T, dims) - _initialize_field!(field, init, mesh_obj, T) + array_type_val = array_type === nothing ? default_array_type(backend) : array_type + field = allocate_cellfield(array_type_val, T, dims, 1) + _initialize_scalar_field!(field, init, mesh_obj, T) - k1 = _allocate_field(array_type, T, dims) - k2 = _allocate_field(array_type, T, dims) - stage = _allocate_field(array_type, T, dims) + k1 = allocate_like(field) + k2 = allocate_like(field) + stage = allocate_like(field) fill!(k1, zero(T)) fill!(k2, zero(T)) fill!(stage, zero(T)) - return LinearAdvectionState(field, RK2Workspace(k1, k2, stage)) -end - -function _allocate_field(array_type, ::Type{T}, dims::Tuple{Vararg{Int}}) where {T} - return array_type{T}(undef, dims...) + return LinearAdvectionState(field, RK2Workspace(k1, k2, stage), backend) end -function _initialize_field!(field, init, mesh::StructuredMesh, ::Type{T}) where {T} +function _initialize_scalar_field!(field::CellField, init, mesh::StructuredMesh, ::Type{T}) where {T} + data = component(field, 1) if init === nothing - fill!(field, zero(T)) - return field + fill!(data, zero(T)) elseif init isa Number - fill!(field, T(init)) - return field + fill!(data, T(init)) elseif init isa AbstractArray - size(init) == size(field) || + size(init) == size(data) || throw(ArgumentError("Initializer array must match mesh dimensions")) - field .= T.(init) - return field + data .= T.(init) elseif init isa Function centers_x, centers_y = cell_centers(mesh) nx, ny = size(mesh) - @inbounds for j in 1:ny, i in 1:nx - field[i, j] = T(init(centers_x[i], centers_y[j])) + if data isa Array + @inbounds for j in 1:ny, i in 1:nx + data[i, j] = T(init(centers_x[i], centers_y[j])) + end + else + host = Array{T}(undef, nx, ny) + @inbounds for j in 1:ny, i in 1:nx + host[i, j] = T(init(centers_x[i], centers_y[j])) + end + copyto!(data, host) end - return field else throw(ArgumentError("Unsupported initializer type $(typeof(init))")) end + return field end const _EulerVarCount = 4 function CompressibleEulerState(problem::CompressibleEulerProblem; T::Type = Float64, - array_type = _DefaultArrayType, - init = nothing) + array_type = nothing, + init = nothing, + backend::ExecutionBackend = default_backend()) mesh_obj = mesh(problem) dims = size(mesh_obj) eq = pde(problem) - field = _allocate_field(array_type, T, ( _EulerVarCount, dims[1], dims[2])) + array_type_val = array_type === nothing ? default_array_type(backend) : array_type + field = allocate_cellfield(array_type_val, T, dims, _EulerVarCount) _initialize_euler_field!(field, init, mesh_obj, eq, T) - k1 = _allocate_field(array_type, T, size(field)) - k2 = _allocate_field(array_type, T, size(field)) - stage = _allocate_field(array_type, T, size(field)) + k1 = allocate_like(field) + k2 = allocate_like(field) + stage = allocate_like(field) fill!(k1, zero(T)) fill!(k2, zero(T)) fill!(stage, zero(T)) - return CompressibleEulerState(field, RK2Workspace(k1, k2, stage)) + return CompressibleEulerState(field, RK2Workspace(k1, k2, stage), backend) end -function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::CompressibleEuler, ::Type{T}) where {T} +function _initialize_euler_field!(field::CellField, + init, + mesh::StructuredMesh, + equation::CompressibleEuler, + ::Type{T}) where {T} if init === nothing fill!(field, zero(T)) return field @@ -142,15 +161,27 @@ function _initialize_euler_field!(field, init, mesh::StructuredMesh, equation::C elseif init isa AbstractArray size(init) == size(field) || throw(ArgumentError("Initializer array must match (4, nx, ny) layout")) - field .= T.(init) + for comp in 1:ncomponents(field) + component(field, comp) .= T.(view(init, comp, :, :)) + end return field elseif init isa Function centers_x, centers_y = cell_centers(mesh) nx, ny = size(mesh) γ = gamma(equation) - @inbounds for j in 1:ny, i in 1:nx - ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) - _store_conserved!(field, i, j, ρ, u, v, p, γ, T) + density = component(field, 1) + if density isa Array + @inbounds for j in 1:ny, i in 1:nx + ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) + _store_conserved!(field, i, j, ρ, u, v, p, γ, T) + end + else + host_field = allocate_cellfield(Array, T, (nx, ny), _EulerVarCount) + @inbounds for j in 1:ny, i in 1:nx + ρ, u, v, p = _extract_primitive(init, centers_x[i], centers_y[j]) + _store_conserved!(host_field, i, j, ρ, u, v, p, γ, T) + end + copyto!(field, host_field) end return field else @@ -201,16 +232,30 @@ Runge-Kutta step of size `dt`. function rk2_step!(state::LinearAdvectionState, problem::LinearAdvectionProblem, dt::Real) - return _rk2_step!(state, problem, dt) + backend_obj = backend(state) + if backend_obj isa SerialBackend + return _rk2_step_serial!(state, problem, dt) + elseif backend_obj isa KernelAbstractionsBackend + return _rk2_step_linear_advection_ka!(state, problem, dt, backend_obj) + else + throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for linear advection")) + end end function rk2_step!(state::CompressibleEulerState, problem::CompressibleEulerProblem, dt::Real) - return _rk2_step!(state, problem, dt) + backend_obj = backend(state) + if backend_obj isa SerialBackend + return _rk2_step_serial!(state, problem, dt) + elseif backend_obj isa KernelAbstractionsBackend + return _rk2_step_euler_ka!(state, problem, dt, backend_obj) + else + throw(ArgumentError("Unsupported execution backend $(describe(backend_obj)) for compressible Euler")) + end end -function _rk2_step!(state, problem, dt::Real) +function _rk2_step_serial!(state, problem, dt::Real) u = solution(state) ws = workspace(state) Tsol = eltype(u) @@ -220,18 +265,181 @@ function _rk2_step!(state, problem, dt::Real) @timeit timer "RK2 step" begin @timeit timer "RHS compute" compute_rhs!(ws.k1, u, problem) + @timeit timer "Stage predictor" _rk2_stage_predict!(ws.stage, u, ws.k1, dtT) + @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem) + @timeit timer "Solution update" _rk2_solution_update!(u, ws.k1, ws.k2, dtT * half) + end + + return state +end + +function _rk2_step_linear_advection_ka!(state::LinearAdvectionState, + problem::LinearAdvectionProblem, + dt::Real, + backend_obj::KernelAbstractionsBackend) + u_field = solution(state) + ws = workspace(state) + mesh_obj = mesh(problem) + dims = size(mesh_obj) + nx, ny = dims + + u = scalar_component(u_field) + k1 = scalar_component(ws.k1) + k2 = scalar_component(ws.k2) + stage = scalar_component(ws.stage) + + Tsol = eltype(u) + dtT = convert(Tsol, dt) + half = convert(Tsol, 0.5) + + eq = pde(problem) + bc = boundary_conditions(problem) + axes = periodic_axes(bc) + vel = velocity(eq) + ax_raw, ay_raw = vel + ax = convert(Tsol, ax_raw) + ay = convert(Tsol, ay_raw) + dx, dy = spacing(mesh_obj) + inv2dx = ax == zero(ax) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dx)) + inv2dy = ay == zero(ay) ? zero(Tsol) : Tsol(1) / (Tsol(2) * Tsol(dy)) + + if ax != zero(ax) && !axes[1] + throw(ArgumentError("Periodic boundary conditions required along x for advection")) + end + if ay != zero(ay) && !axes[2] + throw(ArgumentError("Periodic boundary conditions required along y for advection")) + end + if ax != zero(ax) && nx < 3 + throw(ArgumentError("At least three cells along x are required for 2nd-order advection")) + end + if ay != zero(ay) && ny < 3 + throw(ArgumentError("At least three cells along y are required for 2nd-order advection")) + end + + timer = simulation_timers() + + @timeit timer "RK2 step" begin + @timeit timer "RHS compute" begin + linear_advection_rhs_kernel!(backend_obj, k1, u, nx, ny, ax, ay, inv2dx, inv2dy) + end @timeit timer "Stage predictor" begin - @. ws.stage = u + dtT * ws.k1 + rk2_stage_kernel!(backend_obj, stage, u, k1, dtT) + end + @timeit timer "RHS compute" begin + linear_advection_rhs_kernel!(backend_obj, k2, stage, nx, ny, ax, ay, inv2dx, inv2dy) end - @timeit timer "RHS compute" compute_rhs!(ws.k2, ws.stage, problem) @timeit timer "Solution update" begin - @. u = u + (dtT * half) * (ws.k1 + ws.k2) + rk2_update_kernel!(backend_obj, u, k1, k2, dtT * half) end end return state end +function _rk2_step_euler_ka!(state::CompressibleEulerState, + problem::CompressibleEulerProblem, + dt::Real, + backend_obj::KernelAbstractionsBackend) + u_field = solution(state) + ws = workspace(state) + Tsol = eltype(u_field) + dtT = convert(Tsol, dt) + half = convert(Tsol, 0.5) + + timer = simulation_timers() + + @timeit timer "RK2 step" begin + @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k1, u_field, problem) + @timeit timer "Stage predictor" begin + for comp in 1:ncomponents(u_field) + rk2_stage_kernel!(backend_obj, + component(ws.stage, comp), + component(u_field, comp), + component(ws.k1, comp), + dtT) + end + end + @timeit timer "RHS compute" _compressible_euler_rhs_ka!(backend_obj, ws.k2, ws.stage, problem) + @timeit timer "Solution update" begin + for comp in 1:ncomponents(u_field) + rk2_update_kernel!(backend_obj, + component(u_field, comp), + component(ws.k1, comp), + component(ws.k2, comp), + dtT * half) + end + end + end + + return state +end + +function _compressible_euler_rhs_ka!(backend::KernelAbstractionsBackend, + du::CellField, + u::CellField, + problem::CompressibleEulerProblem) + mesh_obj = mesh(problem) + nx, ny = size(mesh_obj) + eq = pde(problem) + T = component_eltype(u) + dx, dy = spacing(mesh_obj) + inv_dx = T(1) / T(dx) + inv_dy = T(1) / T(dy) + γ = T(gamma(eq)) + + for comp in 1:ncomponents(du) + fill!(component(du, comp), zero(T)) + end + + dρ = component(du, 1) + drhou = component(du, 2) + drhov = component(du, 3) + dE = component(du, 4) + + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + + device = _resolve_ka_device(backend.device) + kernel = backend.workgroupsize === nothing ? + _compressible_euler_rhs_kernel!(device) : + _compressible_euler_rhs_kernel!(device, backend.workgroupsize) + kernel(dρ, drhou, drhov, dE, + ρ, rhou, rhov, E, + γ, inv_dx, inv_dy; ndrange = (nx, ny)) + KernelAbstractions.synchronize(device) + return du +end + +@inline function _rk2_stage_predict!(stage::CellField, + u::CellField, + rhs::CellField, + α) + n = ncomponents(stage) + @inbounds for comp in 1:n + stage_comp = component(stage, comp) + u_comp = component(u, comp) + rhs_comp = component(rhs, comp) + @. stage_comp = u_comp + α * rhs_comp + end + return stage +end + +@inline function _rk2_solution_update!(u::CellField, + k1::CellField, + k2::CellField, + α) + n = ncomponents(u) + @inbounds for comp in 1:n + u_comp = component(u, comp) + k1_comp = component(k1, comp) + k2_comp = component(k2, comp) + @. u_comp = u_comp + α * (k1_comp + k2_comp) + end + return u +end + """ compute_rhs!(du, u, problem) @@ -239,9 +447,21 @@ Populate `du` with the spatial derivative of `u` for the PDE described by `problem`. Linear advection problems use a second-order upwind stencil, while compressible Euler problems employ slope-limited Rusanov fluxes. """ +function compute_rhs!(du::CellField, + u::CellField, + problem::LinearAdvectionProblem) + return _linear_advection_rhs!(component(du, 1), component(u, 1), problem) +end + function compute_rhs!(du::AbstractArray{T,2}, u::AbstractArray{T,2}, problem::LinearAdvectionProblem) where {T} + return _linear_advection_rhs!(du, u, problem) +end + +function _linear_advection_rhs!(du::AbstractArray{T,2}, + u::AbstractArray{T,2}, + problem::LinearAdvectionProblem) where {T} mesh_obj = mesh(problem) bc = boundary_conditions(problem) eq = pde(problem) @@ -297,9 +517,24 @@ function compute_rhs!(du::AbstractArray{T,2}, return du end +function compute_rhs!(du::CellField, + u::CellField, + problem::CompressibleEulerProblem) + return _compressible_euler_rhs!(du, u, problem) +end + function compute_rhs!(du::AbstractArray{T,3}, u::AbstractArray{T,3}, problem::CompressibleEulerProblem) where {T} + du_field = CellField(ntuple(i -> view(du, i, :, :), Val(_EulerVarCount))) + u_field = CellField(ntuple(i -> view(u, i, :, :), Val(_EulerVarCount))) + _compressible_euler_rhs!(du_field, u_field, problem) + return du +end + +function _compressible_euler_rhs!(du::CellField, + u::CellField, + problem::CompressibleEulerProblem) mesh_obj = mesh(problem) bc = boundary_conditions(problem) eq = pde(problem) @@ -313,13 +548,23 @@ function compute_rhs!(du::AbstractArray{T,3}, nx >= 3 || throw(ArgumentError("At least three cells required along x")) ny >= 3 || throw(ArgumentError("At least three cells required along y")) + T = component_eltype(u) dx, dy = spacing(mesh_obj) inv_dx = one(T) / T(dx) inv_dy = one(T) / T(dy) γ = gamma(eq) - fill!(du, zero(T)) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + + dρ = component(du, 1) + drhou = component(du, 2) + drhov = component(du, 3) + dE = component(du, 4) + # x-direction fluxes @inbounds for j in 1:ny jm1 = j == 1 ? ny : j - 1 @@ -330,57 +575,57 @@ function compute_rhs!(du::AbstractArray{T,3}, im = i == 1 ? nx : i - 1 # Slopes for cell i - ΔLρ = u[1, i, j] - u[1, im, j] - ΔRρ = u[1, ip, j] - u[1, i, j] - ΔLrhox = u[2, i, j] - u[2, im, j] - ΔRrhox = u[2, ip, j] - u[2, i, j] - ΔLrhoy = u[3, i, j] - u[3, im, j] - ΔRrhoy = u[3, ip, j] - u[3, i, j] - ΔLE = u[4, i, j] - u[4, im, j] - ΔRE = u[4, ip, j] - u[4, i, j] + ΔLρ = ρ[i, j] - ρ[im, j] + ΔRρ = ρ[ip, j] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[im, j] + ΔRrhox = rhou[ip, j] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[im, j] + ΔRrhoy = rhov[ip, j] - rhov[i, j] + ΔLE = E[i, j] - E[im, j] + ΔRE = E[ip, j] - E[i, j] sρ = _minmod(ΔLρ, ΔRρ) srhox = _minmod(ΔLrhox, ΔRrhox) srhoy = _minmod(ΔLrhoy, ΔRrhoy) sE = _minmod(ΔLE, ΔRE) - ρL = u[1, i, j] + T(0.5) * sρ - rhouL = u[2, i, j] + T(0.5) * srhox - rhovL = u[3, i, j] + T(0.5) * srhoy - EL = u[4, i, j] + T(0.5) * sE + ρL = ρ[i, j] + T(0.5) * sρ + rhouL = rhou[i, j] + T(0.5) * srhox + rhovL = rhov[i, j] + T(0.5) * srhoy + EL = E[i, j] + T(0.5) * sE # Slopes for cell ip - ΔLρ_ip = u[1, ip, j] - u[1, i, j] - ΔRρ_ip = u[1, ip2, j] - u[1, ip, j] - ΔLrhox_ip = u[2, ip, j] - u[2, i, j] - ΔRrhox_ip = u[2, ip2, j] - u[2, ip, j] - ΔLrhoy_ip = u[3, ip, j] - u[3, i, j] - ΔRrhoy_ip = u[3, ip2, j] - u[3, ip, j] - ΔLE_ip = u[4, ip, j] - u[4, i, j] - ΔRE_ip = u[4, ip2, j] - u[4, ip, j] + ΔLρ_ip = ρ[ip, j] - ρ[i, j] + ΔRρ_ip = ρ[ip2, j] - ρ[ip, j] + ΔLrhox_ip = rhou[ip, j] - rhou[i, j] + ΔRrhox_ip = rhou[ip2, j] - rhou[ip, j] + ΔLrhoy_ip = rhov[ip, j] - rhov[i, j] + ΔRrhoy_ip = rhov[ip2, j] - rhov[ip, j] + ΔLE_ip = E[ip, j] - E[i, j] + ΔRE_ip = E[ip2, j] - E[ip, j] sρ_ip = _minmod(ΔLρ_ip, ΔRρ_ip) srhox_ip = _minmod(ΔLrhox_ip, ΔRrhox_ip) srhoy_ip = _minmod(ΔLrhoy_ip, ΔRrhoy_ip) sE_ip = _minmod(ΔLE_ip, ΔRE_ip) - ρR = u[1, ip, j] - T(0.5) * sρ_ip - rhouR = u[2, ip, j] - T(0.5) * srhox_ip - rhovR = u[3, ip, j] - T(0.5) * srhoy_ip - ER = u[4, ip, j] - T(0.5) * sE_ip + ρR = ρ[ip, j] - T(0.5) * sρ_ip + rhouR = rhou[ip, j] - T(0.5) * srhox_ip + rhovR = rhov[ip, j] - T(0.5) * srhoy_ip + ER = E[ip, j] - T(0.5) * sE_ip flux1, flux2, flux3, flux4 = _rusanov_flux_x(eq, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) - du[1, i, j] -= flux1 * inv_dx - du[2, i, j] -= flux2 * inv_dx - du[3, i, j] -= flux3 * inv_dx - du[4, i, j] -= flux4 * inv_dx + dρ[i, j] -= flux1 * inv_dx + drhou[i, j] -= flux2 * inv_dx + drhov[i, j] -= flux3 * inv_dx + dE[i, j] -= flux4 * inv_dx - du[1, ip, j] += flux1 * inv_dx - du[2, ip, j] += flux2 * inv_dx - du[3, ip, j] += flux3 * inv_dx - du[4, ip, j] += flux4 * inv_dx + dρ[ip, j] += flux1 * inv_dx + drhou[ip, j] += flux2 * inv_dx + drhov[ip, j] += flux3 * inv_dx + dE[ip, j] += flux4 * inv_dx end end @@ -390,56 +635,56 @@ function compute_rhs!(du::AbstractArray{T,3}, jp2 = jp == ny ? 1 : jp + 1 jm = j == 1 ? ny : j - 1 for i in 1:nx - ΔLρ = u[1, i, j] - u[1, i, jm] - ΔRρ = u[1, i, jp] - u[1, i, j] - ΔLrhox = u[2, i, j] - u[2, i, jm] - ΔRrhox = u[2, i, jp] - u[2, i, j] - ΔLrhoy = u[3, i, j] - u[3, i, jm] - ΔRrhoy = u[3, i, jp] - u[3, i, j] - ΔLE = u[4, i, j] - u[4, i, jm] - ΔRE = u[4, i, jp] - u[4, i, j] + ΔLρ = ρ[i, j] - ρ[i, jm] + ΔRρ = ρ[i, jp] - ρ[i, j] + ΔLrhox = rhou[i, j] - rhou[i, jm] + ΔRrhox = rhou[i, jp] - rhou[i, j] + ΔLrhoy = rhov[i, j] - rhov[i, jm] + ΔRrhoy = rhov[i, jp] - rhov[i, j] + ΔLE = E[i, j] - E[i, jm] + ΔRE = E[i, jp] - E[i, j] sρ = _minmod(ΔLρ, ΔRρ) srhox = _minmod(ΔLrhox, ΔRrhox) srhoy = _minmod(ΔLrhoy, ΔRrhoy) sE = _minmod(ΔLE, ΔRE) - ρL = u[1, i, j] + T(0.5) * sρ - rhouL = u[2, i, j] + T(0.5) * srhox - rhovL = u[3, i, j] + T(0.5) * srhoy - EL = u[4, i, j] + T(0.5) * sE + ρL = ρ[i, j] + T(0.5) * sρ + rhouL = rhou[i, j] + T(0.5) * srhox + rhovL = rhov[i, j] + T(0.5) * srhoy + EL = E[i, j] + T(0.5) * sE - ΔLρ_jp = u[1, i, jp] - u[1, i, j] - ΔRρ_jp = u[1, i, jp2] - u[1, i, jp] - ΔLrhox_jp = u[2, i, jp] - u[2, i, j] - ΔRrhox_jp = u[2, i, jp2] - u[2, i, jp] - ΔLrhoy_jp = u[3, i, jp] - u[3, i, j] - ΔRrhoy_jp = u[3, i, jp2] - u[3, i, jp] - ΔLE_jp = u[4, i, jp] - u[4, i, j] - ΔRE_jp = u[4, i, jp2] - u[4, i, jp] + ΔLρ_jp = ρ[i, jp] - ρ[i, j] + ΔRρ_jp = ρ[i, jp2] - ρ[i, jp] + ΔLrhox_jp = rhou[i, jp] - rhou[i, j] + ΔRrhox_jp = rhou[i, jp2] - rhou[i, jp] + ΔLrhoy_jp = rhov[i, jp] - rhov[i, j] + ΔRrhoy_jp = rhov[i, jp2] - rhov[i, jp] + ΔLE_jp = E[i, jp] - E[i, j] + ΔRE_jp = E[i, jp2] - E[i, jp] sρ_jp = _minmod(ΔLρ_jp, ΔRρ_jp) srhox_jp = _minmod(ΔLrhox_jp, ΔRrhox_jp) srhoy_jp = _minmod(ΔLrhoy_jp, ΔRrhoy_jp) sE_jp = _minmod(ΔLE_jp, ΔRE_jp) - ρR = u[1, i, jp] - T(0.5) * sρ_jp - rhouR = u[2, i, jp] - T(0.5) * srhox_jp - rhovR = u[3, i, jp] - T(0.5) * srhoy_jp - ER = u[4, i, jp] - T(0.5) * sE_jp + ρR = ρ[i, jp] - T(0.5) * sρ_jp + rhouR = rhou[i, jp] - T(0.5) * srhox_jp + rhovR = rhov[i, jp] - T(0.5) * srhoy_jp + ER = E[i, jp] - T(0.5) * sE_jp flux1, flux2, flux3, flux4 = _rusanov_flux_y(eq, ρL, rhouL, rhovL, EL, ρR, rhouR, rhovR, ER) - du[1, i, j] -= flux1 * inv_dy - du[2, i, j] -= flux2 * inv_dy - du[3, i, j] -= flux3 * inv_dy - du[4, i, j] -= flux4 * inv_dy + dρ[i, j] -= flux1 * inv_dy + drhou[i, j] -= flux2 * inv_dy + drhov[i, j] -= flux3 * inv_dy + dE[i, j] -= flux4 * inv_dy - du[1, i, jp] += flux1 * inv_dy - du[2, i, jp] += flux2 * inv_dy - du[3, i, jp] += flux3 * inv_dy - du[4, i, jp] += flux4 * inv_dy + dρ[i, jp] += flux1 * inv_dy + drhou[i, jp] += flux2 * inv_dy + drhov[i, jp] += flux3 * inv_dy + dE[i, jp] += flux4 * inv_dy end end @@ -504,26 +749,8 @@ function cfl_number(problem::CompressibleEulerProblem, mesh_obj = mesh(problem) dx, dy = spacing(mesh_obj) γ = gamma(pde(problem)) - u = solution(state) - nx, ny = size(mesh_obj) - - max_ax = zero(eltype(u)) - max_ay = zero(eltype(u)) - - @inbounds for j in 1:ny, i in 1:nx - ρ = u[1, i, j] - rhou = u[2, i, j] - rhov = u[3, i, j] - E = u[4, i, j] - invρ = one(ρ) / ρ - ux = rhou * invρ - uy = rhov * invρ - kinetic = 0.5 * ρ * (ux^2 + uy^2) - p = (γ - one(γ)) * (E - kinetic) - c = sqrt(abs(γ * p * invρ)) - max_ax = max(max_ax, abs(ux) + c) - max_ay = max(max_ay, abs(uy) + c) - end + backend_obj = backend(state) + max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj) dtT = float(dt) return dtT * (max_ax / dx + max_ay / dy) @@ -540,36 +767,97 @@ function stable_timestep(problem::CompressibleEulerProblem, cfl::Real = 0.45) cfl > 0 || throw(ArgumentError("CFL target must be positive")) + backend_obj = backend(state) + if backend_obj isa KernelAbstractionsBackend + return _stable_timestep_gpu(problem, state, cfl, backend_obj) + else + return _stable_timestep_cpu(problem, state, cfl) + end +end + +function _stable_timestep_cpu(problem::CompressibleEulerProblem, + state::CompressibleEulerState, + cfl::Real) mesh_obj = mesh(problem) dx, dy = spacing(mesh_obj) γ = gamma(pde(problem)) - u = solution(state) - nx, ny = size(mesh_obj) + max_ax, max_ay = _euler_characteristics(solution(state), γ, SerialBackend()) + denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + + (max_ay == 0 ? zero(Float64) : max_ay / dy) + + iszero(denom) && return Inf + + return float(cfl) / denom +end + +function _stable_timestep_gpu(problem::CompressibleEulerProblem, + state::CompressibleEulerState, + cfl::Real, + backend_obj::KernelAbstractionsBackend) + mesh_obj = mesh(problem) + dx, dy = spacing(mesh_obj) + γ = gamma(pde(problem)) + max_ax, max_ay = _euler_characteristics(solution(state), γ, backend_obj) + + denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + + (max_ay == 0 ? zero(Float64) : max_ay / dy) + + iszero(denom) && return Inf + + return float(cfl) / denom +end + +function _euler_characteristics(u, γ, ::ExecutionBackend) + nx, ny = size(component(u, 1)) max_ax = zero(eltype(u)) max_ay = zero(eltype(u)) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + @inbounds for j in 1:ny, i in 1:nx - ρ = u[1, i, j] - rhou = u[2, i, j] - rhov = u[3, i, j] - E = u[4, i, j] - invρ = one(ρ) / ρ - ux = rhou * invρ - uy = rhov * invρ - kinetic = 0.5 * ρ * (ux^2 + uy^2) - p = (γ - one(γ)) * (E - kinetic) + ρval = ρ[i, j] + rhouval = rhou[i, j] + rhovval = rhov[i, j] + Eval = E[i, j] + invρ = one(ρval) / ρval + ux = rhouval * invρ + uy = rhovval * invρ + kinetic = 0.5 * ρval * (ux^2 + uy^2) + p = (γ - one(γ)) * (Eval - kinetic) c = sqrt(abs(γ * p * invρ)) max_ax = max(max_ax, abs(ux) + c) max_ay = max(max_ay, abs(uy) + c) end - denom = (max_ax == 0 ? zero(Float64) : max_ax / dx) + - (max_ay == 0 ? zero(Float64) : max_ay / dy) - - iszero(denom) && return Inf + return float(max_ax), float(max_ay) +end - return float(cfl) / denom +function _euler_characteristics(u, γ, backend::KernelAbstractionsBackend) + ρ = component(u, 1) + rhou = component(u, 2) + rhov = component(u, 3) + E = component(u, 4) + + T = eltype(ρ) + γT = convert(T, γ) + half = inv(convert(T, 2)) + epsT = eps(T) + + ux = rhou ./ ρ + uy = rhov ./ ρ + vel2 = ux .* ux .+ uy .* uy + kinetic = half .* ρ .* vel2 + internal = max.(E .- kinetic, epsT) + p = max.((γT - one(γT)) .* internal, epsT) + c = sqrt.(abs.(γT .* p ./ ρ)) + + max_ax = float(maximum(abs.(ux) .+ c)) + max_ay = float(maximum(abs.(uy) .+ c)) + return max_ax, max_ay end # Utility micro-kernels for Euler RHS diff --git a/test/runtests.jl b/test/runtests.jl index 61c5bc8..6791eed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,27 +47,69 @@ end @testset "LinearAdvection state" begin problem = setup_linear_advection_problem(8, 4; velocity = (1.0, 0.0)) state = LinearAdvectionState(problem; init = 2.0) - u = solution(state) + u_field = solution(state) + u = scalar_component(u_field) ws = workspace(state) + @test backend(state) isa SerialBackend + + @test size(u_field) == (1, 8, 4) @test size(u) == (8, 4) @test all(u .== 2.0) - @test size(ws.k1) == size(u) - @test size(ws.k2) == size(u) - @test size(ws.stage) == size(u) + @test size(ws.k1) == size(u_field) + @test size(ws.k2) == size(u_field) + @test size(ws.stage) == size(u_field) mesh = CodexMach.mesh(problem) init_fun(x, y) = x + y state_fun = LinearAdvectionState(problem; init = init_fun) - u_fun = solution(state_fun) + u_fun = scalar_component(solution(state_fun)) centers_x, centers_y = CodexMach.cell_centers(mesh) @inbounds for j in 1:size(u_fun, 2), i in 1:size(u_fun, 1) @test isapprox(u_fun[i, j], init_fun(centers_x[i], centers_y[j]); atol = 1e-12) end - fill!(u, 1.0) - compute_rhs!(ws.k1, u, problem) - @test all(isapprox.(ws.k1, 0.0; atol = 1e-12)) + fill!(u_field, 1.0) + compute_rhs!(ws.k1, u_field, problem) + @test all(isapprox.(scalar_component(ws.k1), 0.0; atol = 1e-12)) +end + +@testset "Backends" begin + problem = setup_linear_advection_problem(8, 4; velocity = (0.5, 0.0)) + ka_backend = KernelAbstractionsBackend(:cpu) + state = LinearAdvectionState(problem; init = 1.0, backend = ka_backend) + @test backend(state) === ka_backend + + dt = 0.1 + steps = 5 + serial_state = LinearAdvectionState(problem; init = 1.0) + for _ in 1:steps + rk2_step!(serial_state, problem, dt) + rk2_step!(state, problem, dt) + end + serial_u = scalar_component(solution(serial_state)) + ka_u = scalar_component(solution(state)) + @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) + + if :metal in available_backends() + metal_state = LinearAdvectionState(problem; init = 1.0, + backend = KernelAbstractionsBackend(:metal)) + for _ in 1:steps + rk2_step!(metal_state, problem, dt) + end + metal_u = scalar_component(solution(metal_state)) + @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) + end + + if :cuda in available_backends() + cuda_state = LinearAdvectionState(problem; init = 1.0, + backend = KernelAbstractionsBackend(:cuda)) + for _ in 1:steps + rk2_step!(cuda_state, problem, dt) + end + cuda_u = scalar_component(solution(cuda_state)) + @test all(isapprox.(cuda_u, serial_u; atol = 1e-10)) + end end @testset "LinearAdvection RK2" begin @@ -83,7 +125,7 @@ end rk2_step!(state, problem, dt) expected = [sin(2pi * (x - dt)) for x in centers_x] - u = solution(state) + u = scalar_component(solution(state)) @inbounds for i in 1:nx @test isapprox(u[i, 1], expected[i]; atol = 1e-3) end @@ -115,7 +157,7 @@ end state = LinearAdvectionState(problem; init = init_fun) result = run_linear_advection!(state, problem; steps = 4, dt = dt, record_cfl = true) expected = [sin(2pi * (x - 4 * dt)) for x in centers_x] - u = solution(state) + u = scalar_component(solution(state)) @inbounds for i in 1:nx @test isapprox(u[i, 1], expected[i]; atol = 1e-3) end diff --git a/test/test_compressible_euler.jl b/test/test_compressible_euler.jl index f408a20..5717eb6 100644 --- a/test/test_compressible_euler.jl +++ b/test/test_compressible_euler.jl @@ -66,3 +66,51 @@ using CodexMach @test all(isapprox.(prim_buf.v, prim.v; atol = 1e-12)) @test all(isapprox.(prim_buf.p, prim.p; atol = 1e-12)) end + +@testset "Compressible Euler KA backend" begin + nx, ny = 16, 16 + prob = setup_compressible_euler_problem(nx, ny; + lengths = (1.0, 1.0), + origin = (0.0, 0.0), + gamma = 1.4) + + init(x, y) = (; rho = 1.0 + 0.05 * sinpi(x), + v1 = 0.02 * cospi(y), + v2 = 0.01 * sinpi(x + y), + p = 1.0 + 0.02 * cospi(x)) + + serial_state = CompressibleEulerState(prob; init = init, T = Float64) + ka_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:cpu)) + + steps = 3 + dt = stable_timestep(prob, serial_state; cfl = 0.4) + for _ in 1:steps + rk2_step!(serial_state, prob, dt) + rk2_step!(ka_state, prob, dt) + end + + serial_u = solution(serial_state) + ka_u = solution(ka_state) + @test all(isapprox.(ka_u, serial_u; atol = 1e-10)) + + if :metal in available_backends() + metal_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:metal)) + for _ in 1:steps + rk2_step!(metal_state, prob, dt) + end + metal_u = solution(metal_state) + @test all(isapprox.(metal_u, serial_u; atol = 1e-10)) + end + + if :cuda in available_backends() + cuda_state = CompressibleEulerState(prob; init = init, T = Float64, + backend = KernelAbstractionsBackend(:cuda)) + for _ in 1:steps + rk2_step!(cuda_state, prob, dt) + end + cuda_u = solution(cuda_state) + @test all(isapprox.(cuda_u, serial_u; atol = 1e-10)) + end +end