hpsc-lab · sloede · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
diff --git a/Project.toml b/Project.toml
@@ -4,10 +4,26 @@ authors = ["Michael Schlottke-Lakemper <michael.schlottke-lakemper@uni-a.de>"]
 version = "0.1.0"
 
 [deps]
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
+[extras]
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+
+[extensions]
+CodexMachCUDAExt = "CUDA"
+CodexMachMetalExt = "Metal"
+
 [compat]
+KernelAbstractions = "0.9.38"
+CUDA = "5"
+Metal = "1"
 Plots = "1.41.1"
 TimerOutputs = "0.5.29"
 julia = "1.11"
diff --git a/README.md b/README.md
@@ -27,21 +27,28 @@ The repository ships with a bare `docs/` folder ready to host Documenter.jl-base
 
 ## Example simulation
 
-The `examples/linear_advection_demo.jl` script wires together mesh generation,
-problem setup, RK2 time integration, and the high-level driver. Run it from the
-repository root:
+Run the linear advection demo on the CPU:
 
 ```bash
-julia --project=. examples/linear_advection_demo.jl
+julia --project=run examples/linear_advection_demo.jl
 ```
 
-It prints periodic RMS diagnostics along with the CFL number used for the run.
-Tune parameters by editing the keyword arguments in `run_linear_advection_demo`.
+Run the same demo on a Metal-capable GPU:
+
+```bash
+julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
+```
+
+The script wires together mesh generation, problem setup, RK2 time integration,
+and the high-level driver. It prints periodic RMS diagnostics along with the CFL
+number used for the run. Override the precision explicitly with
+`state_eltype=Float64` if you want to run the GPU case in double precision (CUDA)
+or stay on `Float32` for Metal.
 
 To capture diagnostics, pass output paths (created if missing):
 
 ```bash
-julia --project=. examples/linear_advection_demo.jl diagnostics.csv final_state.csv
+julia --project=run examples/linear_advection_demo.jl diagnostics.csv final_state.csv
 ```
 
 The first file lists sampled step/time/RMS/CFL data, and the second stores the
@@ -53,18 +60,24 @@ To render the sampled diagnostics (and optionally the final field), use the
 helper script:
 
 ```bash
-julia --project=. examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png
+julia --project=run examples/plot_linear_advection.jl diagnostics.csv final_state.csv plot.png
 ```
 
 Install `Plots.jl` in your environment first (`import Pkg; Pkg.add("Plots")`).
 The script falls back to a readable error if the dependency is missing.
 
 ## Convergence study
 
-To verify spatial accuracy, run the periodic convergence sweep:
+CPU run:
 
 ```bash
-julia --project=. examples/convergence_linear_advection.jl
+julia --project=run examples/convergence_linear_advection.jl
+```
+
+Metal GPU run:
+
+```bash
+julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
 ```
 
 The driver evolves a sinusoidal field across five nested grids, prints the L₂
@@ -75,21 +88,39 @@ or refinement levels.
 
 ## Kelvin-Helmholtz instability
 
-The compressible Euler path is exercised by
-`examples/kelvin_helmholtz_euler.jl`, which seeds a periodic Kelvin-Helmholtz
-roll-up on a square box:
+CPU run:
+
+```bash
+julia --project=run examples/kelvin_helmholtz_euler.jl
+```
+
+Metal GPU run:
 
 ```bash
-julia --project=. examples/kelvin_helmholtz_euler.jl
+julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
 ```
 
 By default the driver uses a 256×256 mesh, RK2 time stepping with adaptive CFL
-control, and prints periodic log messages. Pass a file path via
-`diagnostics_path` to capture per-step CFL and kinetic-energy measurements. The
-routine returns the final state so you can post-process density, vorticity, or
-other derived fields. Supply `pdf_path` to snapshot the terminal density field,
-and `animation_path` (MP4 or GIF) plus `animation_every`/`animation_fps` to
-produce a time-resolved movie.
+control, and prints periodic log messages. Set `backend=:metal` (or `:cuda`)
+to run the scenario on a GPU; the helper automatically falls back to `Float32`
+storage for KernelAbstractions backends while keeping `Float64` for the serial
+path. Pass a file path via `diagnostics_path` to capture per-step CFL and
+kinetic-energy measurements. The routine returns the final state so you can
+post-process density, vorticity, or other derived fields. Supply `pdf_path` to
+snapshot the terminal density field, and `animation_path` (MP4 or GIF) plus
+`animation_every`/`animation_fps` to produce a time-resolved movie.
+
+## Backend profiling
+
+Compare the serial and GPU paths with the profiling helper:
+
+```bash
+julia --project=run examples/profile_backends.jl
+```
+
+It benchmarks the RK2 loops for linear advection and compressible Euler across
+the serial and any registered KernelAbstractions backends (CPU, Metal, CUDA),
+forcing `Float32` to keep Metal hardware supported.
 
 ## Maintainer
 

diff --git a/docs/src/api.md b/docs/src/api.md
@@ -48,6 +48,7 @@ CompressibleEulerProblem
 setup_compressible_euler_problem
 CompressibleEulerState
 primitive_variables
+_primitive_variables_cpu
 ```
 
 ## Time Integration
@@ -62,6 +63,20 @@ stable_timestep
 cfl_number
 ```
 
+## Cell Fields
+
+```@docs
+CellField
+allocate_cellfield
+allocate_like
+map_components!
+cell_components
+component
+ncomponents
+spatial_size
+backend
+```
+
 ## Simulation Drivers
 
 ```@docs

diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -5,13 +5,21 @@ CurrentModule = CodexMach
 ```
 
 The `examples/` directory contains reproducible scripts that exercise the core
-numerics. You can run each script directly with Julia once the project
-environment has been instantiated (`julia --project=.`).
+numerics. You can run each script directly with Julia once the `run/` project
+environment has been instantiated (`julia --project=run`).
 
 ## Linear Advection Demo
 
+CPU:
+
+```
+julia --project=run examples/linear_advection_demo.jl
+```
+
+Metal GPU:
+
 ```
-julia --project=. examples/linear_advection_demo.jl
+julia --project=run -e 'using Metal; include("examples/linear_advection_demo.jl"); run_linear_advection_demo(backend=:metal)'
 ```
 
 This script transports a smooth sine blob around a rectangular periodic domain
@@ -24,37 +32,56 @@ until it returns to its starting position. By default it:
 The function returns a named tuple with the step size, final time, and diagnostic
 history. When you pass `diagnostics_path="diagnostics.csv"` it writes a CSV
 mirroring the on-screen log, and `state_path="state.csv"` captures the final
-solution for later visualisation.
+solution for later visualisation. Override the precision explicitly with
+`state_eltype=Float64` (for CUDA) or `state_eltype=Float32` if you want to minimise
+device memory traffic.
 
 To plot the diagnostics or the final field, call
 
 ```
-julia --project=. examples/plot_linear_advection.jl diagnostics.csv state.csv
+julia --project=run examples/plot_linear_advection.jl diagnostics.csv state.csv
 ```
 
 which generates a PDF overlaying the analytic and numerical solutions.
 
 ## Linear Advection Convergence Study
 
+CPU:
+
+```
+julia --project=run examples/convergence_linear_advection.jl
+```
+
+Metal GPU:
+
 ```
-julia --project=. examples/convergence_linear_advection.jl
+julia --project=run -e 'using Metal; include("examples/convergence_linear_advection.jl"); run_convergence_study(backend=:metal, levels=4)'
 ```
 
 The convergence driver repeats the sinusoidal advection problem on a hierarchy
 of meshes. It prints the L² error for each resolution along with the observed
 order of accuracy. You should expect second-order convergence (EOC ≈ 2.0) once
-the mesh is sufficiently fine.
+the mesh is sufficiently fine. The serial path remains the default, so simple
+invocations continue to run on the CPU without any extra keywords.
 
 ## Kelvin–Helmholtz Instability (Compressible Euler)
 
+CPU:
+
+```
+julia --project=run examples/kelvin_helmholtz_euler.jl
+```
+
+Metal GPU:
+
 ```
-julia --project=. examples/kelvin_helmholtz_euler.jl
+julia --project=run -e 'using Metal; include("examples/kelvin_helmholtz_euler.jl"); run_kelvin_helmholtz(backend=:metal, final_time=1.0)'
 ```
 
 This example launches a Kelvin–Helmholtz roll-up on a periodic square domain
 using the compressible Euler equations and the slope-limited Rusanov fluxes. The
 script logs CFL numbers, tracks the volume-averaged kinetic energy, and can
-produce publication-ready figures:
+produce publication-ready figures. Further output customisation includes:
 
 - set `diagnostics_path="diagnostics.csv"` to capture the per-step log;
 - set `pdf_path="khi_density.pdf"` for a static density snapshot at `t = final_time`;
@@ -65,6 +92,17 @@ A stable run with the default settings reaches `t = 1.5` in roughly a thousand
 steps with CFL ≈ 0.4–0.45. The density field develops rolled vortices that match
 the reference figures checked into the repository (`khi_*.pdf/mp4`).
 
+## Backend Profiling
+
+```
+julia --project=run examples/profile_backends.jl
+```
+
+This utility benchmarks the serial driver against available KernelAbstractions
+backends for both linear advection and compressible Euler. It uses `Float32`
+storage so Metal-backed GPUs remain supported and prints per-backend wall-clock
+timings for quick regressions after kernel changes.
+
 ## Tips
 
 - All drivers expose keyword arguments for mesh size, timestep control, and file

diff --git a/examples/convergence_linear_advection.jl b/examples/convergence_linear_advection.jl
@@ -5,7 +5,8 @@ using Printf
 """
     run_convergence_study(; base_resolution=(16, 16), levels=5,
                              lengths=(1.0, 1.0), velocity=(1.0, 0.5),
-                             final_time=0.5, cfl=0.45)
+                             final_time=0.5, cfl=0.45,
+                             backend=default_backend(), state_eltype=nothing)
 
 Execute a grid refinement study for periodic linear advection with a sinusoidal
 initial condition on a square domain. Prints the L₂ error for each resolution,
@@ -16,7 +17,9 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
                                 lengths::Tuple{<:Real,<:Real} = (1.0, 1.0),
                                 velocity::Tuple{<:Real,<:Real} = (1.0, 0.5),
                                 final_time::Real = 0.5,
-                                cfl::Real = 0.45)
+                                cfl::Real = 0.45,
+                                backend::Union{ExecutionBackend,Symbol} = default_backend(),
+                                state_eltype::Union{Nothing,Type} = nothing)
     @assert levels >= 2 "Need at least two levels to measure convergence"
 
     Lx, Ly = float(lengths[1]), float(lengths[2])
@@ -31,22 +34,26 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
     println(" level    nx    ny        dt          L2 error        EOC")
 
     init = _sinusoidal_initializer(Lx, Ly)
+    backend_obj = _resolve_example_backend(backend)
+    state_T = isnothing(state_eltype) ? _default_state_eltype(backend_obj) : state_eltype
 
     for level in 0:levels-1
         nx = nx0 * 2^level
         ny = ny0 * 2^level
         problem = setup_linear_advection_problem(nx, ny;
                                                  lengths = (Lx, Ly),
                                                  velocity = vel)
-        state = LinearAdvectionState(problem; init = init)
+        state = LinearAdvectionState(problem; init = init, T = state_T, backend = backend_obj)
 
         dt_stable = stable_timestep(problem; cfl = cfl)
         steps = max(1, ceil(Int, final_time / dt_stable))
         dt = final_time / steps
 
         run_linear_advection!(state, problem; steps = steps, dt = dt)
 
-        u_num = solution(state)
+        u_field = solution(state)
+        u_num_backend = scalar_component(u_field)
+        u_num = Array(u_num_backend)  # copy to host for error evaluation
         u_exact = _exact_solution(problem, vel, final_time, u_num)
 
         err = sqrt(sum(abs2, u_num .- u_exact) / length(u_num))
@@ -74,6 +81,22 @@ function run_convergence_study(; base_resolution::Tuple{Int,Int} = (16, 16),
             errors = errors, eocs = eocs)
 end
 
+function _resolve_example_backend(spec::ExecutionBackend)
+    return spec
+end
+
+function _resolve_example_backend(spec::Symbol)
+    return KernelAbstractionsBackend(spec)
+end
+
+function _default_state_eltype(::ExecutionBackend)
+    return Float64
+end
+
+function _default_state_eltype(::KernelAbstractionsBackend)
+    return Float32
+end
+
 function _sinusoidal_initializer(Lx::Float64, Ly::Float64)
     function init(x, y)
         return sin(2pi * x / Lx) * sin(2pi * y / Ly)