diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..f2667b6
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,48 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+    name: Julia ${{ matrix.julia-version }} on ${{ matrix.os }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 10
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+        julia-version:
+          - '1.12'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Julia
+        uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.julia-version }}
+
+      - name: Cache Julia artifacts
+        uses: julia-actions/cache@v2
+
+      - name: Instantiate project
+        shell: julia --color=yes --project=. {0}
+        run: |
+          using Pkg
+          Pkg.instantiate()
+          Pkg.precompile()
+
+      - name: Run tests
+        shell: julia --color=yes --project=. {0}
+        run: |
+          using Pkg
+          Pkg.test()
diff --git a/.gitignore b/.gitignore
index 9dd64dd..e4c76d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,5 @@ docs/site/
 # environment.
 Manifest.toml
 
+# Benchmark outputs
+benchmark/results/
diff --git a/Project.toml b/Project.toml
index 5212bb6..c25e9fb 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,8 +1,23 @@
 name = "FFTRegGPU"
 uuid = "86d2b4ce-0b7d-4e3e-98ec-23077478530b"
-authors = ["Jungsoo Kim <jungsoo@mit.edu>"]
-version = "0.1.0"
+authors = ["urlicht <urlicht@users.noreply.github.com>"]
+version = "1.0.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+
+[weakdeps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+
+[extensions]
+FFTRegGPUCUDAExt = "CUDA"
+FFTRegGPUCPUExt = "FFTW"
+
+[extras]
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "FFTW", "Random"]
diff --git a/README.md b/README.md
index ededdfd..4264fd4 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,28 @@
 # FFTRegGPU.jl
-Fast FFT GPU registrtion using [phase correlation](https://en.wikipedia.org/wiki/Phase_correlation). This GPU version is based on [SubpixelRegistration.jl](https://github.com/romainFr/SubpixelRegistration.jl)  
+[![CI](https://github.com/urlicht/FFTRegGPU.jl/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/urlicht/FFTRegGPU.jl/actions/workflows/ci.yml)
+
+Fast FFT GPU registration using [phase correlation](https://en.wikipedia.org/wiki/Phase_correlation). This GPU version is based on [SubpixelRegistration.jl](https://github.com/romainFr/SubpixelRegistration.jl)  
 Currently it only supports translation.
 
 
 ## Usage and example  
+![before vs after registration showing a slice (xy)](docs/img/before_after_xy_slice.png)
+![before vs after registration showing xz MIP for a volume](docs/img/before_after_xz_MIP.png)
+
+### Backend setup
+Load FFTRegGPU with the backend package you want to use:
+
+```julia
+using FFTRegGPU
+using FFTW    # CPU backend
+# using CUDA  # CUDA backend
+```
+
+Backend selection is determined by array type at call time:
+- `CuArray` inputs use the CUDA extension (CUFFT/CUDA kernels).
+- `Array`/`StridedArray` inputs use the CPU extension (FFTW).
+- You can load both `CUDA` and `FFTW`; dispatch still follows the input array types.
+
 ### Registering a set of 2D images
 ```julia
 # allocate GPU memory
@@ -13,26 +32,27 @@ img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 CC_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 N_g = CuArray{Float32}(undef, size_x, size_y)
+img2_reg_g = CuArray{Float32}(undef, size_x, size_y)
 
 # copy to GPU
 copyto!(img1_g, Float32.(img1))
 copyto!(img2_g, Float32.(img2))
 
-# perform FFT using CUFFT
-img1_f_g .= CUFFT.fft(img1_g)
-img2_f_g .= CUFFT.fft(img2_g)
+# perform FFT using the active backend
+img1_f_g .= fft(img1_g)
+img2_f_g .= fft(img2_g)
 
 # register (find the optimal translation)
-error, shift, diffphase = dftreg_gpu!(img1_f_g, img2_f_g, CC_g)
+error, shift, diffphase = dftreg!(img1_f_g, img2_f_g, CC_g)
 
-# resample the moving image
-img2_reg_g = dftreg_resample_gpu!(img1_f_g, N_g, shift, diffphase)
+# resample the moving image (in-place)
+dftreg_resample!(img2_reg_g, img2_f_g, N_g, shift, diffphase)
 
 # copy to CPU
 Array(img2_reg_g)
 ```
 ### Registering a set of 2D images (subpixel registration)
-Use the function `dftreg_subpix_gpu!`. For the argument `CC2x_g`, the array size should be 2x of the image size.
+Use the function `dftreg_subpix!`. For the argument `CC2x_g`, the array size should be 2x of the image size.
 ```julia
 # allocate GPU memory
 img1_g = CuArray{Float32}(undef, size_x, size_y)
@@ -41,20 +61,21 @@ img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 CC2x_g = CuArray{Complex{Float32}}(undef, 2 * size_x, 2 * size_y)
 N_g = CuArray{Float32}(undef, size_x, size_y)
+img2_reg_g = CuArray{Float32}(undef, size_x, size_y)
 
 # copy to GPU
 copyto!(img1_g, Float32.(img1))
 copyto!(img2_g, Float32.(img2))
 
-# perform FFT using CUFFT
-img1_f_g .= CUFFT.fft(img1_g)
-img2_f_g .= CUFFT.fft(img2_g)
+# perform FFT using the active backend
+img1_f_g .= fft(img1_g)
+img2_f_g .= fft(img2_g)
 
 # register (find the optimal translation)
-error, shift, diffphase = dftreg_subpix_gpu!(img1_f_g, img2_f_g, CC2x_g)
+error, shift, diffphase = dftreg_subpix!(img1_f_g, img2_f_g, CC2x_g)
 
-# resample the moving image
-img2_reg_g = dftreg_resample_gpu!(img1_f_g, N_g, shift, diffphase)
+# resample the moving image (in-place)
+dftreg_resample!(img2_reg_g, img2_f_g, N_g, shift, diffphase)
 
 # copy to CPU
 Array(img2_reg_g)
@@ -63,9 +84,11 @@ Array(img2_reg_g)
 ### Registering z-stack
 - Moving targets on the stage can cause shearing in z-stack. To correct this, the images within the stack are registered together.
 - `reg_stack_translate!` is a memory-efficient and convenient function to register the frames in each z-stack. Here in the example, the script loads the z-stack at each time point, registers it, and then saves the registered z-stack.  
+- To ensure this runs on CUDA: make all working arrays `CuArray`, check `CUDA.functional()`, and optionally set `CUDA.allowscalar(false)` to catch accidental scalar fallback.
+- For repeated large host<->device transfers, use a long-lived pinned host buffer (`CUDA.pin`) for better copy throughput.
 ```julia
 size_x, size_y, size_z = 256, 256, 94
-img_stack_reg = zeros(Float32, size_x, size_y, size_z)
+img_stack_h = CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z))
 img_stack_reg_g = CuArray{Float32}(undef, size_x, size_y, size_z)
 img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
 img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y)
@@ -73,22 +96,136 @@ CC2x_g = CuArray{Complex{Float32}}(undef, 2 * size_x, 2 * size_y)
 N_g = CuArray{Float32}(undef, size_x, size_y)
 
 @showprogress for t = 1:100
-    copyto!(img_stack_reg_g, get_zstack(t)) # copy data to GPU
+    get_zstack!(img_stack_h, t) # load data to pinned host buffer
+    copyto!(img_stack_reg_g, img_stack_h) # copy data to GPU
     reg_stack_translate!(img_stack_reg_g, img1_f_g, img2_f_g, CC2x_g, N_g) # register
-    copyto!(img_stack_reg, img_stack_reg_g) # copy result to GPU
-    save_zstack(t, img_stack_reg)
+    copyto!(img_stack_h, img_stack_reg_g) # copy result to CPU
+    save_zstack(t, img_stack_h)
+end
+```
+
+### Registering z-stack in batches (CUDA, higher throughput)
+If you have many stacks (`t = 1:nt`), process them in micro-batches to improve GPU utilization and overlap CPU I/O with GPU work.
+
+```julia
+using FFTRegGPU
+using CUDA
+using ProgressMeter
+using Base.Threads
+
+CUDA.functional() || error("CUDA is not functional")
+CUDA.allowscalar(false)
+
+size_x, size_y, size_z = 256, 256, 94
+nt = 100
+B = 4  # micro-batch size; tune 2, 4, 8...
+
+# One independent workspace per batch slot.
+h_in = [CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z)) for _ in 1:B]
+h_out = [CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z)) for _ in 1:B]
+d_stack = [CuArray{Float32}(undef, size_x, size_y, size_z) for _ in 1:B]
+img1_f = [CuArray{ComplexF32}(undef, size_x, size_y) for _ in 1:B]
+img2_f = [CuArray{ComplexF32}(undef, size_x, size_y) for _ in 1:B]
+CC2x = [CuArray{ComplexF32}(undef, 2 * size_x, 2 * size_y) for _ in 1:B]
+N = [CuArray{Float32}(undef, size_x, size_y) for _ in 1:B]
+reg_param = [Dict{Int,Any}() for _ in 1:B]
+
+@showprogress for t0 in 1:B:nt
+    k = min(B, nt - t0 + 1)
+
+    # Load stacks on CPU.
+    @threads for i in 1:k
+        get_zstack!(h_in[i], t0 + i - 1)
+    end
+
+    # Process stacks concurrently on GPU.
+    @sync for i in 1:k
+        @spawn begin
+            empty!(reg_param[i])
+            CUDA.@sync begin
+                copyto!(d_stack[i], h_in[i]) # H2D
+                reg_stack_translate!(d_stack[i], img1_f[i], img2_f[i], CC2x[i], N[i]; reg_param=reg_param[i])
+                copyto!(h_out[i], d_stack[i]) # D2H
+            end
+        end
+    end
+
+    # Save stacks on CPU.
+    @threads for i in 1:k
+        save_zstack(t0 + i - 1, h_out[i])
+    end
 end
 ```
 
+Notes:
+- Run Julia with multiple threads (e.g. `julia -t auto`) to benefit from `@threads`/`@spawn`.
+- `B` is the main tuning parameter; increase it until throughput stops improving or GPU memory becomes limiting.
+
 ## Performance
-Measured time to call `reg_stack_translate!`  
-Hardware: GTX 1080  
-Julia v1.5.3 and CUDA.jl v.2.4.1
-
-| size (x,y,z) | mean time (ms) |
-| - | - |
-| 32,32,32 | 43.817 ms |
-| 64,64,64 | 91.215 ms |
-| 128,128,64 | 103.022 ms |
-| 256,256,64 | 131.046 ms |
-| 256,256,256 | 525.257 ms |
+### Benchmark command
+```bash
+julia --project=benchmark benchmark/run_benchmarks.jl --backend=both --samples=20 --sizes=64x64,128x128,256x256,512x512,2048x2048 --stack=128x128x128,256x256x256 --output=benchmark/results/both.csv
+```
+
+### Benchmark results
+Time in ms
+
+| backend | case | dims | median | mean | min | memory | allocs |
+| :--- | :--- | :--- | ---: | ---: | ---: | ---: | ---: |
+| cpu | dftreg | 64x64 | 0.060 | 0.066 | 0.059 | 400 | 7 |
+| cpu | dftreg_subpix | 64x64 | 0.410 | 0.408 | 0.306 | 295408 | 247 |
+| cpu | dftreg | 128x128 | 0.204 | 0.214 | 0.203 | 400 | 7 |
+| cpu | dftreg_subpix | 128x128 | 3.687 | 3.661 | 3.331 | 968880 | 247 |
+| cpu | dftreg | 256x256 | 2.970 | 2.978 | 2.960 | 400 | 7 |
+| cpu | dftreg_subpix | 256x256 | 99.947 | 89.491 | 18.744 | 3493856 | 269 |
+| cpu | dftreg | 512x512 | 17.416 | 17.440 | 17.333 | 400 | 7 |
+| cpu | dftreg_subpix | 512x512 | 196.452 | 190.043 | 69.564 | 13262576 | 279 |
+| cpu | dftreg | 2048x2048 | 258.009 | 258.358 | 256.690 | 400 | 7 |
+| cpu | dftreg_subpix | 2048x2048 | 1790.361 | 1768.056 | 1710.282 | 204021056 | 279 |
+| cpu | reg_stack_translate | 128x128x128 | 498.022 | 503.165 | 493.216 | 124292056 | 36460 |
+| cpu | reg_stack_translate | 256x256x256 | 28044.507 | 28044.507 | 28044.507 | 895572952 | 79316 |
+| cuda | dftreg | 64x64 | 0.683 | 0.760 | 0.586 | 15104 | 487 |
+| cuda | dftreg_subpix | 64x64 | 1.412 | 1.409 | 0.986 | 118320 | 2051 |
+| cuda | dftreg | 128x128 | 0.215 | 0.235 | 0.211 | 15008 | 474 |
+| cuda | dftreg_subpix | 128x128 | 1.439 | 1.457 | 1.081 | 166896 | 2051 |
+| cuda | dftreg | 256x256 | 0.220 | 0.241 | 0.215 | 15072 | 483 |
+| cuda | dftreg_subpix | 256x256 | 1.495 | 1.510 | 1.065 | 263472 | 2076 |
+| cuda | dftreg | 512x512 | 0.232 | 0.255 | 0.228 | 15152 | 492 |
+| cuda | dftreg_subpix | 512x512 | 1.927 | 3.169 | 1.334 | 457184 | 2186 |
+| cuda | dftreg | 2048x2048 | 1.845 | 1.849 | 1.589 | 24432 | 1028 |
+| cuda | dftreg_subpix | 2048x2048 | 11.011 | 10.937 | 9.759 | 1622016 | 2752 |
+| cuda | reg_stack_translate | 128x128x128 | 182.471 | 212.267 | 165.538 | 24102736 | 324324 |
+| cuda | reg_stack_translate | 256x256x256 | 382.678 | 389.114 | 370.962 | 74343728 | 659790 |
+
+### Configuration
+```
+CUDA toolchain: 
+- runtime 13.2, artifact installation
+- driver 570.181.0 for 13.2
+- compiler 13.2
+
+CUDA libraries: 
+- CUBLAS: 13.3.0
+- CURAND: 10.4.2
+- CUFFT: 12.2.0
+- CUSOLVER: 12.1.0
+- CUSPARSE: 12.7.9
+- CUPTI: 2026.1.0 (API 13.2.0)
+- NVML: 12.0.0+570.181
+
+Julia packages: 
+- CUDA: 5.11.0
+- GPUArrays: 11.4.1
+- GPUCompiler: 1.8.2
+- KernelAbstractions: 0.9.40
+- CUDA_Driver_jll: 13.2.0+0
+- CUDA_Compiler_jll: 0.4.2+0
+- CUDA_Runtime_jll: 0.21.0+0
+
+Toolchain:
+- Julia: 1.12.5
+- LLVM: 18.1.7
+
+1 device:
+  0: NVIDIA GeForce RTX 4060 Ti (sm_89, 15.581 GiB / 15.996 GiB available)
+```
diff --git a/benchmark/Project.toml b/benchmark/Project.toml
new file mode 100644
index 0000000..bbedb82
--- /dev/null
+++ b/benchmark/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+FFTRegGPU = "86d2b4ce-0b7d-4e3e-98ec-23077478530b"
+
+[sources]
+FFTRegGPU = {path = ".."}
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000..e2f336c
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,69 @@
+# Benchmarking FFTRegGPU
+
+This benchmark suite is deterministic (fixed synthetic data seed) and reusable across commits.
+
+## Setup
+
+```bash
+julia --project=benchmark -e 'using Pkg; Pkg.instantiate()'
+```
+
+## Run
+
+From the repository root:
+
+```bash
+julia --project=benchmark benchmark/run_benchmarks.jl --backend=cpu --samples=30 --output=benchmark/results/cpu.csv
+```
+
+To run CUDA too:
+
+```bash
+julia --project=benchmark -e 'using Pkg; Pkg.add("CUDA")'
+julia --project=benchmark benchmark/run_benchmarks.jl --backend=both --samples=20 --output=benchmark/results/both.csv
+```
+
+If CUDA is not functional on the machine, CUDA cases are skipped automatically.
+
+## Compare `reg_stack_translate!` across two commits
+
+Run this standalone commit comparison benchmark (CUDA only):
+
+```bash
+julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl \
+  --current-ref=HEAD \
+  --previous-ref=HEAD~1 \
+  --stack=256x256x64,256x256x256 \
+  --samples=20
+```
+
+You can manually provide any two refs (branches, tags, or SHAs):
+
+```bash
+julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl \
+  --current-ref=3a1b2c4 \
+  --previous-ref=f09d8e7
+```
+
+The script prints:
+- Median timing comparison (`current_ms`, `prev_ms`, ratio, percent delta)
+- Output equivalence (`isapprox` + max absolute difference) per stack size
+
+## Useful options
+
+- `--backend=cpu|cuda|both`
+- `--samples=N`
+- `--evals=N`
+- `--seed=N`
+- `--sizes=128x128,256x256`
+- `--stack=256x256x64`
+- `--noise=0.02`
+- `--output=PATH`
+- `--current-ref=REF`
+- `--previous-ref=REF`
+
+Use `--help` for all options:
+
+```bash
+julia --project=benchmark benchmark/run_benchmarks.jl --help
+```
diff --git a/benchmark/compare_reg_stack_translate_commits.jl b/benchmark/compare_reg_stack_translate_commits.jl
new file mode 100644
index 0000000..02c1b4b
--- /dev/null
+++ b/benchmark/compare_reg_stack_translate_commits.jl
@@ -0,0 +1,406 @@
+using Printf
+using Serialization
+
+const DEFAULT_SEED = 20260313
+
+struct CompareConfig
+    repo_path::String
+    current_ref::String
+    previous_ref::String
+    stack_sizes::Vector{NTuple{3,Int}}
+    samples::Int
+    evals::Int
+    seed::Int
+    noise_sigma::Float32
+end
+
+function parse_dim3(token::AbstractString)
+    m = match(r"^(\d+)x(\d+)x(\d+)$", lowercase(strip(token)))
+    m === nothing && error("Invalid 3D size '$token'. Use NxMxZ (e.g. 256x256x64).")
+    parse(Int, m.captures[1]), parse(Int, m.captures[2]), parse(Int, m.captures[3])
+end
+
+function parse_stack_sizes(token::AbstractString)
+    [parse_dim3(s) for s in split(token, ",")]
+end
+
+function parse_args(args::Vector{String})
+    cfg = Dict(
+        :repo_path => abspath(joinpath(@__DIR__, "..")),
+        :current_ref => "HEAD",
+        :previous_ref => "HEAD~1",
+        :stack_sizes => [(256, 256, 64), (256, 256, 256)],
+        :samples => 20,
+        :evals => 1,
+        :seed => DEFAULT_SEED,
+        :noise_sigma => 0.02f0,
+    )
+
+    for arg in args
+        if arg == "--help" || arg == "-h"
+            println("""
+Usage:
+  julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl [options]
+
+Options:
+  --repo=PATH                     Path to FFTRegGPU repository. Default: repo root
+  --current-ref=REF               Current/reference A git ref. Default: HEAD
+  --previous-ref=REF              Baseline/reference B git ref. Default: HEAD~1
+  --stack=256x256x64,256x256x256  Stack sizes to benchmark. Default: 256x256x64,256x256x256
+  --samples=N                     BenchmarkTools samples per case. Default: 20
+  --evals=N                       BenchmarkTools evals per sample. Default: 1
+  --seed=N                        RNG seed for synthetic data. Default: $(DEFAULT_SEED)
+  --noise=0.02                    Added Gaussian noise sigma. Default: 0.02
+""")
+            exit(0)
+        elseif startswith(arg, "--repo=")
+            cfg[:repo_path] = abspath(split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--current-ref=")
+            cfg[:current_ref] = split(arg, "=", limit=2)[2]
+        elseif startswith(arg, "--previous-ref=")
+            cfg[:previous_ref] = split(arg, "=", limit=2)[2]
+        elseif startswith(arg, "--stack=")
+            cfg[:stack_sizes] = parse_stack_sizes(split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--samples=")
+            cfg[:samples] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--evals=")
+            cfg[:evals] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--seed=")
+            cfg[:seed] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--noise=")
+            cfg[:noise_sigma] = Float32(parse(Float64, split(arg, "=", limit=2)[2]))
+        else
+            error("Unknown option: $arg")
+        end
+    end
+
+    isdir(cfg[:repo_path]) || error("Repo path does not exist: $(cfg[:repo_path])")
+    cfg[:samples] > 0 || error("samples must be positive")
+    cfg[:evals] > 0 || error("evals must be positive")
+    isempty(cfg[:current_ref]) && error("current-ref must not be empty")
+    isempty(cfg[:previous_ref]) && error("previous-ref must not be empty")
+    isempty(cfg[:stack_sizes]) && error("stack must include at least one size")
+
+    CompareConfig(
+        cfg[:repo_path],
+        cfg[:current_ref],
+        cfg[:previous_ref],
+        cfg[:stack_sizes],
+        cfg[:samples],
+        cfg[:evals],
+        cfg[:seed],
+        cfg[:noise_sigma],
+    )
+end
+
+const CHILD_CODE = raw"""
+using Pkg
+using Random
+using Serialization
+
+repo_checkout = ARGS[1]
+result_file = ARGS[2]
+stack_arg = ARGS[3]
+samples = parse(Int, ARGS[4])
+evals = parse(Int, ARGS[5])
+seed = parse(Int, ARGS[6])
+noise_sigma = Float32(parse(Float64, ARGS[7]))
+
+function parse_stack_sizes(token::AbstractString)
+    out = NTuple{3,Int}[]
+    for s in split(token, ';')
+        isempty(s) && continue
+        p = split(s, 'x')
+        length(p) == 3 || error("Invalid stack size: $s (expected NxMxZ)")
+        push!(out, (parse(Int, p[1]), parse(Int, p[2]), parse(Int, p[3])))
+    end
+    out
+end
+
+stack_sizes = parse_stack_sizes(stack_arg)
+isempty(stack_sizes) && error("No stack sizes provided")
+
+env_dir = mktempdir()
+Pkg.activate(env_dir)
+Pkg.develop(PackageSpec(path=repo_checkout))
+Pkg.instantiate()
+Pkg.add(["BenchmarkTools", "FFTW", "CUDA"])
+
+using BenchmarkTools
+using FFTRegGPU
+using FFTW
+using CUDA
+
+CUDA.functional() || error("CUDA is not functional on this machine")
+
+_safe_str(f, default="unknown") = try
+    string(f())
+catch
+    default
+end
+
+function _cuda_info()
+    dev = CUDA.device()
+    (
+        functional = true,
+        device_name = _safe_str(() -> CUDA.name(dev), string(dev)),
+        device_index = _safe_str(() -> CUDA.deviceid(dev)),
+        driver_version = _safe_str(CUDA.driver_version),
+        runtime_version = _safe_str(CUDA.runtime_version),
+    )
+end
+
+function make_base_image(nx::Int, ny::Int, rng::AbstractRNG)
+    x = reshape(collect(LinRange(0f0, 1f0, nx)), nx, 1)
+    y = reshape(collect(LinRange(0f0, 1f0, ny)), 1, ny)
+    img = @. 0.55f0 * sin(2f0 * π * (3f0 * x + 5f0 * y)) + 0.35f0 * cos(2f0 * π * (7f0 * x - 2f0 * y))
+    img .+= 0.10f0 .* randn(rng, Float32, nx, ny)
+    img .-= minimum(img)
+    img ./= maximum(img)
+    img
+end
+
+function make_stack(nx::Int, ny::Int, nz::Int, rng::AbstractRNG, noise_sigma::Float32)
+    stack = zeros(Float32, nx, ny, nz)
+    stack[:, :, 1] .= make_base_image(nx, ny, rng)
+    for z in 2:nz
+        stack[:, :, z] .= circshift(@view(stack[:, :, z - 1]), (1, -1))
+    end
+    stack .+= noise_sigma .* randn(rng, Float32, nx, ny, nz)
+    stack
+end
+
+function _resolve_reg_fn()
+    if isdefined(FFTRegGPU, :reg_stack_translate_gpu!)
+        return getfield(FFTRegGPU, :reg_stack_translate_gpu!)
+    elseif isdefined(FFTRegGPU, :reg_stack_translate!)
+        return getfield(FFTRegGPU, :reg_stack_translate!)
+    else
+        error("Neither reg_stack_translate_gpu! nor reg_stack_translate! exists in this commit")
+    end
+end
+
+function call_reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf, reg_param)
+    f = _resolve_reg_fn()
+    try
+        return f(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param)
+    catch err
+        if err isa MethodError
+            return f(stack_work, img1_f, img2_f, cc2x, nbuf)
+        end
+        rethrow(err)
+    end
+end
+
+function run_case_rows(stack_sizes, samples, evals, seed, noise_sigma)
+    rows = NamedTuple[]
+    all_buffers_on_cuda = true
+    for (i, (nx, ny, nz)) in enumerate(stack_sizes)
+        rng = MersenneTwister(seed + 100 + i)
+        dims = "$(nx)x$(ny)x$(nz)"
+
+        stack_template = CUDA.CuArray(make_stack(nx, ny, nz, rng, noise_sigma))
+        stack_work = similar(stack_template)
+        img1_f = CUDA.zeros(ComplexF32, nx, ny)
+        img2_f = CUDA.zeros(ComplexF32, nx, ny)
+        cc2x = CUDA.zeros(ComplexF32, 2 * nx, 2 * ny)
+        nbuf = CUDA.zeros(Float32, nx, ny)
+        reg_param = Dict{Int,Any}()
+        all_buffers_on_cuda = all_buffers_on_cuda && (
+            stack_template isa CUDA.CuArray &&
+            stack_work isa CUDA.CuArray &&
+            img1_f isa CUDA.CuArray &&
+            img2_f isa CUDA.CuArray &&
+            cc2x isa CUDA.CuArray &&
+            nbuf isa CUDA.CuArray
+        )
+
+        copyto!(stack_work, stack_template)
+        call_reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf, reg_param)
+        CUDA.synchronize()
+        stack_out = Array(stack_work)
+
+        trial = run(@benchmarkable begin
+            copyto!($stack_work, $stack_template)
+            empty!($reg_param)
+            call_reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf, $reg_param)
+            CUDA.synchronize()
+        end samples=samples evals=evals)
+
+        med = BenchmarkTools.median(trial)
+        mn = BenchmarkTools.mean(trial)
+        mnm = BenchmarkTools.minimum(trial)
+        push!(rows, (
+            dims = dims,
+            median_ms = med.time / 1e6,
+            mean_ms = mn.time / 1e6,
+            min_ms = mnm.time / 1e6,
+            memory_bytes = med.memory,
+            allocs = med.allocs,
+            stack_out = stack_out,
+        ))
+    end
+
+    return rows, all_buffers_on_cuda
+end
+
+rows, all_buffers_on_cuda = run_case_rows(stack_sizes, samples, evals, seed, noise_sigma)
+
+serialize(
+    result_file,
+    (
+        rows = rows,
+        cuda_info = _cuda_info(),
+        all_buffers_on_cuda = all_buffers_on_cuda,
+    ),
+)
+"""
+
+function write_child_script()
+    dir = mktempdir()
+    path = joinpath(dir, "bench_compare_one_ref.jl")
+    write(path, CHILD_CODE)
+    path
+end
+
+function checkout_ref(repo_path::AbstractString, ref::AbstractString)
+    dir = mktempdir()
+    run(`git clone --quiet $(String(repo_path)) $dir`)
+    run(`git -C $dir checkout --quiet $(String(ref))`)
+    dir
+end
+
+function run_ref(
+    repo_path::AbstractString,
+    ref::AbstractString,
+    child_file::AbstractString,
+    stack_sizes::Vector{NTuple{3,Int}},
+    samples::Int,
+    evals::Int,
+    seed::Int,
+    noise_sigma::Float32,
+)
+    checkout = checkout_ref(repo_path, ref)
+    result_dir = mktempdir()
+    result_file = joinpath(result_dir, "result.bin")
+    stack_arg = join(["$(x)x$(y)x$(z)" for (x, y, z) in stack_sizes], ";")
+
+    cmd = `$(Base.julia_cmd()) $(String(child_file)) $checkout $result_file $stack_arg $samples $evals $seed $noise_sigma`
+
+    try
+        run(cmd)
+        commit = chomp(read(`git -C $checkout rev-parse --short HEAD`, String))
+        payload = deserialize(result_file)
+        if payload isa NamedTuple && hasproperty(payload, :rows)
+            return (
+                commit = commit,
+                rows = payload.rows,
+                cuda_info = hasproperty(payload, :cuda_info) ? payload.cuda_info : (functional = true,),
+                all_buffers_on_cuda = hasproperty(payload, :all_buffers_on_cuda) ? payload.all_buffers_on_cuda : false,
+            )
+        end
+
+        # Backward compatibility with older payload shape.
+        return (
+            commit = commit,
+            rows = payload,
+            cuda_info = (functional = true,),
+            all_buffers_on_cuda = false,
+        )
+    finally
+        rm(result_dir; force=true, recursive=true)
+        rm(checkout; force=true, recursive=true)
+    end
+end
+
+function print_cuda_report(label::String, run_result)
+    info = run_result.cuda_info
+    device_name = hasproperty(info, :device_name) ? info.device_name : "unknown"
+    device_index = hasproperty(info, :device_index) ? info.device_index : "unknown"
+    driver = hasproperty(info, :driver_version) ? info.driver_version : "unknown"
+    runtime = hasproperty(info, :runtime_version) ? info.runtime_version : "unknown"
+    functional = hasproperty(info, :functional) ? info.functional : "unknown"
+    arrays_on_cuda = run_result.all_buffers_on_cuda ? "yes" : "no"
+    pad = repeat(" ", length(label))
+    println("  ", label, ": functional=", functional, ", device=", device_name, " (id=", device_index, ")")
+    println("  ", pad, "  driver=", driver, ", runtime=", runtime, ", arrays_on_cuda=", arrays_on_cuda)
+end
+
+function print_table(curr, prev, current_ref::String, previous_ref::String)
+    curr_by_dims = Dict(r.dims => r for r in curr.rows)
+    prev_by_dims = Dict(r.dims => r for r in prev.rows)
+    dims_all = sort!(collect(intersect(keys(curr_by_dims), keys(prev_by_dims))))
+    isempty(dims_all) && error("No overlapping stack dimensions between compared refs.")
+
+    println("CUDA reg_stack_translate! comparison")
+    println("current:  $(curr.commit) ($current_ref)")
+    println("previous: $(prev.commit) ($previous_ref)")
+    println("CUDA verification:")
+    print_cuda_report("current", curr)
+    print_cuda_report("previous", prev)
+    println()
+
+    @printf("%-14s%14s%14s%12s%12s\n", "dims", "current_ms", "prev_ms", "curr/prev", "delta_%")
+    for d in dims_all
+        c = curr_by_dims[d]
+        p = prev_by_dims[d]
+        ratio = c.median_ms / p.median_ms
+        delta = (c.median_ms - p.median_ms) / p.median_ms * 100
+        @printf("%-14s%14.3f%14.3f%12.3f%12.1f\n", d, c.median_ms, p.median_ms, ratio, delta)
+    end
+
+    println()
+    println("Output equivalence (isapprox):")
+    @printf("%-14s%8s%12s\n", "dims", "same", "max_abs")
+    for d in dims_all
+        a = curr_by_dims[d].stack_out
+        b = prev_by_dims[d].stack_out
+        max_abs = maximum(abs.(a .- b))
+        same = isapprox(a, b; rtol=1f-4, atol=1f-4)
+        @printf("%-14s%8s%12.3e\n", d, same ? "yes" : "no", max_abs)
+    end
+end
+
+function main()
+    cfg = parse_args(ARGS)
+
+    println("Commit comparison configuration:")
+    println("  repo:         ", cfg.repo_path)
+    println("  current-ref:  ", cfg.current_ref)
+    println("  previous-ref: ", cfg.previous_ref)
+    println("  samples:      ", cfg.samples)
+    println("  evals:        ", cfg.evals)
+    println("  seed:         ", cfg.seed)
+    println("  stack:        ", join(["$(x)x$(y)x$(z)" for (x, y, z) in cfg.stack_sizes], ", "))
+    println("  noise:        ", cfg.noise_sigma)
+    println()
+
+    child_file = write_child_script()
+    try
+        curr = run_ref(
+            cfg.repo_path,
+            cfg.current_ref,
+            child_file,
+            cfg.stack_sizes,
+            cfg.samples,
+            cfg.evals,
+            cfg.seed,
+            cfg.noise_sigma,
+        )
+        prev = run_ref(
+            cfg.repo_path,
+            cfg.previous_ref,
+            child_file,
+            cfg.stack_sizes,
+            cfg.samples,
+            cfg.evals,
+            cfg.seed,
+            cfg.noise_sigma,
+        )
+        print_table(curr, prev, cfg.current_ref, cfg.previous_ref)
+    finally
+        rm(dirname(child_file); force=true, recursive=true)
+    end
+end
+
+main()
diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl
new file mode 100644
index 0000000..146a624
--- /dev/null
+++ b/benchmark/run_benchmarks.jl
@@ -0,0 +1,385 @@
+using BenchmarkTools
+using FFTRegGPU
+using FFTW
+using Random
+using Printf
+
+const DEFAULT_SEED = 20260313
+
+struct BenchConfig
+    backend::Symbol
+    samples::Int
+    evals::Int
+    seed::Int
+    sizes::Vector{Tuple{Int,Int}}
+    stack_sizes::Vector{NTuple{3,Int}}
+    noise_sigma::Float32
+    output::Union{Nothing,String}
+end
+
+function parse_dim2(token::AbstractString)
+    m = match(r"^(\d+)x(\d+)$", lowercase(strip(token)))
+    m === nothing && error("Invalid 2D size '$token'. Use NxM (e.g. 256x256).")
+    parse(Int, m.captures[1]), parse(Int, m.captures[2])
+end
+
+function parse_dim3(token::AbstractString)
+    m = match(r"^(\d+)x(\d+)x(\d+)$", lowercase(strip(token)))
+    m === nothing && error("Invalid 3D size '$token'. Use NxMxZ (e.g. 256x256x64).")
+    parse(Int, m.captures[1]), parse(Int, m.captures[2]), parse(Int, m.captures[3])
+end
+
+function parse_sizes(token::AbstractString)
+    [parse_dim2(s) for s in split(token, ",")]
+end
+
+function parse_stack_sizes(token::AbstractString)
+    [parse_dim3(s) for s in split(token, ",")]
+end
+
+function parse_backend(token::AbstractString)
+    b = Symbol(lowercase(strip(token)))
+    b in (:cpu, :cuda, :both) || error("backend must be cpu, cuda, or both")
+    b
+end
+
+function parse_args(args::Vector{String})
+    cfg = Dict(
+        :backend => :both,
+        :samples => 20,
+        :evals => 1,
+        :seed => DEFAULT_SEED,
+        :sizes => [(128, 128), (256, 256)],
+        :stack_sizes => [(256, 256, 64)],
+        :noise_sigma => 0.02f0,
+        :output => nothing,
+    )
+
+    for arg in args
+        if arg == "--help" || arg == "-h"
+            println("""
+Usage:
+  julia --project=benchmark benchmark/run_benchmarks.jl [options]
+
+Options:
+  --backend=cpu|cuda|both      Benchmark backend(s). Default: both
+  --samples=N                  BenchmarkTools samples per case. Default: 20
+  --evals=N                    BenchmarkTools evals per sample. Default: 1
+  --seed=N                     RNG seed for synthetic data. Default: $(DEFAULT_SEED)
+  --sizes=128x128,256x256      2D pair benchmark sizes. Default: 128x128,256x256
+  --stack=256x256x64,256x256x96
+                               3D stack benchmark sizes. Default: 256x256x64
+  --noise=0.02                 Added Gaussian noise sigma. Default: 0.02
+  --output=PATH                Optional CSV output path.
+""")
+            exit(0)
+        elseif startswith(arg, "--backend=")
+            cfg[:backend] = parse_backend(split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--samples=")
+            cfg[:samples] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--evals=")
+            cfg[:evals] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--seed=")
+            cfg[:seed] = parse(Int, split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--sizes=")
+            cfg[:sizes] = parse_sizes(split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--stack=")
+            cfg[:stack_sizes] = parse_stack_sizes(split(arg, "=", limit=2)[2])
+        elseif startswith(arg, "--noise=")
+            cfg[:noise_sigma] = Float32(parse(Float64, split(arg, "=", limit=2)[2]))
+        elseif startswith(arg, "--output=")
+            cfg[:output] = split(arg, "=", limit=2)[2]
+        else
+            error("Unknown option: $arg")
+        end
+    end
+
+    cfg[:samples] > 0 || error("samples must be positive")
+    cfg[:evals] > 0 || error("evals must be positive")
+    isempty(cfg[:stack_sizes]) && error("stack must include at least one size")
+
+    BenchConfig(
+        cfg[:backend],
+        cfg[:samples],
+        cfg[:evals],
+        cfg[:seed],
+        cfg[:sizes],
+        cfg[:stack_sizes],
+        cfg[:noise_sigma],
+        cfg[:output],
+    )
+end
+
+function make_base_image(nx::Int, ny::Int, rng::AbstractRNG)
+    x = reshape(collect(LinRange(0f0, 1f0, nx)), nx, 1)
+    y = reshape(collect(LinRange(0f0, 1f0, ny)), 1, ny)
+    img = @. 0.55f0 * sin(2f0 * π * (3f0 * x + 5f0 * y)) + 0.35f0 * cos(2f0 * π * (7f0 * x - 2f0 * y))
+    img .+= 0.10f0 .* randn(rng, Float32, nx, ny)
+    img .-= minimum(img)
+    img ./= maximum(img)
+    img
+end
+
+function make_integer_pair(nx::Int, ny::Int, rng::AbstractRNG, noise_sigma::Float32)
+    img1 = make_base_image(nx, ny, rng)
+    img2 = circshift(img1, (3, -2))
+    img2 .+= noise_sigma .* randn(rng, Float32, nx, ny)
+    fft(img1), fft(img2)
+end
+
+function make_subpixel_pair(nx::Int, ny::Int, rng::AbstractRNG, noise_sigma::Float32)
+    img1 = make_base_image(nx, ny, rng)
+    img1_f = fft(img1)
+
+    shift = Float32[2.25, -1.75]
+    nbuf = zeros(Float32, nx, ny)
+    shifted_f = similar(img1_f)
+    subpix_shift!(shifted_f, img1_f, nbuf, shift, 0f0)
+    img2 = real(ifft(shifted_f))
+    img2 .+= noise_sigma .* randn(rng, Float32, nx, ny)
+    img1_f, fft(img2)
+end
+
+function make_stack(nx::Int, ny::Int, nz::Int, rng::AbstractRNG, noise_sigma::Float32)
+    stack = zeros(Float32, nx, ny, nz)
+    stack[:, :, 1] .= make_base_image(nx, ny, rng)
+
+    for z in 2:nz
+        stack[:, :, z] .= circshift(@view(stack[:, :, z - 1]), (1, -1))
+    end
+    stack .+= noise_sigma .* randn(rng, Float32, nx, ny, nz)
+    stack
+end
+
+function push_summary!(rows, backend::String, case_name::String, dims::String, cfg::BenchConfig, trial::BenchmarkTools.Trial)
+    med = BenchmarkTools.median(trial)
+    mn = BenchmarkTools.mean(trial)
+    mnm = BenchmarkTools.minimum(trial)
+    push!(
+        rows,
+        (
+            backend = backend,
+            case = case_name,
+            dims = dims,
+            median_ms = med.time / 1e6,
+            mean_ms = mn.time / 1e6,
+            min_ms = mnm.time / 1e6,
+            memory_bytes = med.memory,
+            allocs = med.allocs,
+            samples = cfg.samples,
+            evals = cfg.evals,
+        ),
+    )
+end
+
+function run_cpu_benchmarks(cfg::BenchConfig, rows)
+    println("Running CPU benchmarks...")
+
+    for (i, (nx, ny)) in enumerate(cfg.sizes)
+        rng = MersenneTwister(cfg.seed + i)
+        dims = "$(nx)x$(ny)"
+
+        img1_f, img2_f = make_integer_pair(nx, ny, rng, cfg.noise_sigma)
+        cc = similar(img1_f)
+        cc_abs2_work = similar(cc, float(real(eltype(cc))))
+        dftreg!(img1_f, img2_f, cc; cc_abs2_work=cc_abs2_work) # warmup
+        trial = run(
+            @benchmarkable dftreg!($img1_f, $img2_f, $cc; cc_abs2_work=$cc_abs2_work) samples=cfg.samples evals=cfg.evals
+        )
+        push_summary!(rows, "cpu", "dftreg", dims, cfg, trial)
+
+        img1_f_sub, img2_f_sub = make_subpixel_pair(nx, ny, rng, cfg.noise_sigma)
+        cc2x = zeros(eltype(img1_f_sub), 2 * nx, 2 * ny)
+        cc2x_abs2_work = similar(cc2x, float(real(eltype(cc2x))))
+        dftreg_subpix!(img1_f_sub, img2_f_sub, cc2x; cc2x_abs2_work=cc2x_abs2_work) # warmup
+        trial = run(
+            @benchmarkable dftreg_subpix!($img1_f_sub, $img2_f_sub, $cc2x; cc2x_abs2_work=$cc2x_abs2_work) samples=cfg.samples evals=cfg.evals
+        )
+        push_summary!(rows, "cpu", "dftreg_subpix", dims, cfg, trial)
+    end
+
+    for (i, (nx, ny, nz)) in enumerate(cfg.stack_sizes)
+        rng = MersenneTwister(cfg.seed + 100 + i)
+        dims = "$(nx)x$(ny)x$(nz)"
+        stack_template = make_stack(nx, ny, nz, rng, cfg.noise_sigma)
+        stack_work = similar(stack_template)
+        img1_f = zeros(ComplexF32, nx, ny)
+        img2_f = zeros(ComplexF32, nx, ny)
+        cc2x = zeros(ComplexF32, 2 * nx, 2 * ny)
+        nbuf = zeros(Float32, nx, ny)
+        reg_param = Dict{Int,Any}()
+
+        copyto!(stack_work, stack_template)
+        reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) # warmup
+
+        trial = run(
+            @benchmarkable begin
+                copyto!($stack_work, $stack_template)
+                empty!($reg_param)
+                reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf; reg_param=$reg_param)
+            end samples=cfg.samples evals=1
+        )
+        push_summary!(rows, "cpu", "reg_stack_translate", dims, cfg, trial)
+    end
+end
+
+function maybe_load_cuda()
+    Base.find_package("CUDA") === nothing && return nothing
+    try
+        @eval using CUDA
+        # `@eval using CUDA` inside a function can create a newer-world binding.
+        cuda = Base.invokelatest(() -> getfield(@__MODULE__, :CUDA))
+        Base.invokelatest(cuda.functional) || return nothing
+        cuda
+    catch
+        nothing
+    end
+end
+
+function run_cuda_benchmarks(cfg::BenchConfig, rows)
+    CUDA = maybe_load_cuda()
+    if CUDA === nothing
+        println("Skipping CUDA benchmarks (CUDA not installed or no functional CUDA device).")
+        return
+    end
+    println("Running CUDA benchmarks...")
+    Base.invokelatest(_run_cuda_benchmarks_loaded, CUDA, cfg, rows)
+end
+
+function _run_cuda_benchmarks_loaded(CUDA, cfg::BenchConfig, rows)
+    for (i, (nx, ny)) in enumerate(cfg.sizes)
+        rng = MersenneTwister(cfg.seed + i)
+        dims = "$(nx)x$(ny)"
+
+        img1_f_cpu, img2_f_cpu = make_integer_pair(nx, ny, rng, cfg.noise_sigma)
+        img1_f = CUDA.CuArray(img1_f_cpu)
+        img2_f = CUDA.CuArray(img2_f_cpu)
+        cc = similar(img1_f)
+        cc_abs2_work = similar(cc, float(real(eltype(cc))))
+
+        dftreg!(img1_f, img2_f, cc; cc_abs2_work=cc_abs2_work) # warmup
+        CUDA.synchronize()
+        trial = run(
+            @benchmarkable begin
+                dftreg!($img1_f, $img2_f, $cc; cc_abs2_work=$cc_abs2_work)
+                CUDA.synchronize()
+            end samples=cfg.samples evals=cfg.evals
+        )
+        push_summary!(rows, "cuda", "dftreg", dims, cfg, trial)
+
+        img1_f_sub_cpu, img2_f_sub_cpu = make_subpixel_pair(nx, ny, rng, cfg.noise_sigma)
+        img1_f_sub = CUDA.CuArray(img1_f_sub_cpu)
+        img2_f_sub = CUDA.CuArray(img2_f_sub_cpu)
+        cc2x = CUDA.zeros(eltype(img1_f_sub), 2 * nx, 2 * ny)
+        cc2x_abs2_work = similar(cc2x, float(real(eltype(cc2x))))
+
+        dftreg_subpix!(img1_f_sub, img2_f_sub, cc2x; cc2x_abs2_work=cc2x_abs2_work) # warmup
+        CUDA.synchronize()
+        trial = run(
+            @benchmarkable begin
+                dftreg_subpix!($img1_f_sub, $img2_f_sub, $cc2x; cc2x_abs2_work=$cc2x_abs2_work)
+                CUDA.synchronize()
+            end samples=cfg.samples evals=cfg.evals
+        )
+        push_summary!(rows, "cuda", "dftreg_subpix", dims, cfg, trial)
+    end
+
+    for (i, (nx, ny, nz)) in enumerate(cfg.stack_sizes)
+        rng = MersenneTwister(cfg.seed + 100 + i)
+        dims = "$(nx)x$(ny)x$(nz)"
+        stack_template = CUDA.CuArray(make_stack(nx, ny, nz, rng, cfg.noise_sigma))
+        stack_work = similar(stack_template)
+        img1_f = CUDA.zeros(ComplexF32, nx, ny)
+        img2_f = CUDA.zeros(ComplexF32, nx, ny)
+        cc2x = CUDA.zeros(ComplexF32, 2 * nx, 2 * ny)
+        nbuf = CUDA.zeros(Float32, nx, ny)
+        reg_param = Dict{Int,Any}()
+
+        copyto!(stack_work, stack_template)
+        reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) # warmup
+        CUDA.synchronize()
+
+        trial = run(
+            @benchmarkable begin
+                copyto!($stack_work, $stack_template)
+                empty!($reg_param)
+                reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf; reg_param=$reg_param)
+                CUDA.synchronize()
+            end samples=cfg.samples evals=1
+        )
+        push_summary!(rows, "cuda", "reg_stack_translate", dims, cfg, trial)
+    end
+end
+
+function print_rows(rows)
+    println()
+    println("Benchmark results (times in ms):")
+    println(rpad("backend", 8), rpad("case", 20), rpad("dims", 14), lpad("median", 12), lpad("mean", 12), lpad("min", 12), lpad("memory", 12), lpad("allocs", 10))
+    for r in rows
+        @printf(
+            "%-8s%-20s%-14s%12.3f%12.3f%12.3f%12d%10d\n",
+            r.backend,
+            r.case,
+            r.dims,
+            r.median_ms,
+            r.mean_ms,
+            r.min_ms,
+            r.memory_bytes,
+            r.allocs,
+        )
+    end
+end
+
+function write_csv(path::String, rows, cfg::BenchConfig)
+    mkpath(dirname(path))
+    open(path, "w") do io
+        println(io, "backend,case,dims,median_ms,mean_ms,min_ms,memory_bytes,allocs,samples,evals,seed")
+        for r in rows
+            @printf(
+                io,
+                "%s,%s,%s,%.6f,%.6f,%.6f,%d,%d,%d,%d,%d\n",
+                r.backend,
+                r.case,
+                r.dims,
+                r.median_ms,
+                r.mean_ms,
+                r.min_ms,
+                r.memory_bytes,
+                r.allocs,
+                cfg.samples,
+                cfg.evals,
+                cfg.seed,
+            )
+        end
+    end
+end
+
+function main()
+    cfg = parse_args(ARGS)
+    rows = Vector{NamedTuple}()
+
+    println("FFTRegGPU benchmark configuration:")
+    println("  backend:   ", cfg.backend)
+    println("  samples:   ", cfg.samples)
+    println("  evals:     ", cfg.evals)
+    println("  seed:      ", cfg.seed)
+    println("  sizes:     ", join(["$(x)x$(y)" for (x, y) in cfg.sizes], ", "))
+    println("  stack:     ", join(["$(x)x$(y)x$(z)" for (x, y, z) in cfg.stack_sizes], ", "))
+    println("  noise:     ", cfg.noise_sigma)
+
+    if cfg.backend in (:cpu, :both)
+        run_cpu_benchmarks(cfg, rows)
+    end
+    if cfg.backend in (:cuda, :both)
+        run_cuda_benchmarks(cfg, rows)
+    end
+
+    print_rows(rows)
+    if cfg.output !== nothing
+        write_csv(cfg.output, rows, cfg)
+        println()
+        println("Wrote CSV: ", cfg.output)
+    end
+end
+
+main()
diff --git a/docs/img/before_after_xy_slice.png b/docs/img/before_after_xy_slice.png
new file mode 100644
index 0000000..cf705e0
Binary files /dev/null and b/docs/img/before_after_xy_slice.png differ
diff --git a/docs/img/before_after_xz_MIP.png b/docs/img/before_after_xz_MIP.png
new file mode 100644
index 0000000..f309152
Binary files /dev/null and b/docs/img/before_after_xz_MIP.png differ
diff --git a/ext/FFTRegGPUCPUExt.jl b/ext/FFTRegGPUCPUExt.jl
new file mode 100644
index 0000000..23c8a7d
--- /dev/null
+++ b/ext/FFTRegGPUCPUExt.jl
@@ -0,0 +1,27 @@
+"""
+    FFTRegGPUCPUExt
+
+CPU extension for FFTRegGPU backend hooks.
+
+When `FFTW` is loaded and inputs are `StridedArray`s, FFTRegGPU internal hooks
+dispatch to FFTW and AbstractFFTs shift operations.
+"""
+module FFTRegGPUCPUExt
+
+using FFTRegGPU
+using FFTW
+using AbstractFFTs
+
+import FFTRegGPU: _fft, _ifft, _fftshift, _ifftshift
+import FFTRegGPU: _ifft!, _plan_fft_inplace, _fft_inplace!
+
+_fft(inp::StridedArray) = FFTW.fft(inp)
+_ifft(inp::StridedArray) = FFTW.ifft(inp)
+_ifft!(inp::StridedArray{<:Complex}) = FFTW.ifft!(inp)
+_plan_fft_inplace(inp::StridedArray{<:Complex}) = FFTW.plan_fft!(inp)
+_fft_inplace!(inp::StridedArray{<:Complex}) = FFTW.fft!(inp)
+_fft_inplace!(inp::StridedArray{<:Complex}, plan::FFTW.cFFTWPlan) = (plan * inp)
+_fftshift(inp::StridedArray) = AbstractFFTs.fftshift(inp)
+_ifftshift(inp::StridedArray) = AbstractFFTs.ifftshift(inp)
+
+end
diff --git a/ext/FFTRegGPUCUDAExt.jl b/ext/FFTRegGPUCUDAExt.jl
new file mode 100644
index 0000000..6e3aef8
--- /dev/null
+++ b/ext/FFTRegGPUCUDAExt.jl
@@ -0,0 +1,34 @@
+"""
+    FFTRegGPUCUDAExt
+
+CUDA extension for FFTRegGPU backend hooks.
+
+When `CUDA` is loaded and inputs are `CuArray` (or views over `CuArray`), FFT
+and shift operations dispatch to CUFFT/CUDA implementations.
+"""
+module FFTRegGPUCUDAExt
+
+using FFTRegGPU
+using CUDA
+
+import FFTRegGPU: _fft, _ifft, _ifft!, _fftshift, _ifftshift, _scalar_at
+import FFTRegGPU: _plan_fft_inplace, _fft_inplace!
+
+_fft(inp::CUDA.CuArray) = CUDA.CUFFT.fft(inp)
+_ifft(inp::CUDA.CuArray) = CUDA.CUFFT.ifft(inp)
+_ifft!(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.ifft!(inp)
+_plan_fft_inplace(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.plan_fft!(inp)
+_fft_inplace!(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.fft!(inp)
+_fft_inplace!(inp::CUDA.CuArray{<:Complex}, ::Nothing) = CUDA.CUFFT.fft!(inp)
+_fft_inplace!(inp::CUDA.CuArray{<:Complex}, plan) = (plan * inp)
+_fftshift(inp::CUDA.CuArray) = CUDA.CUFFT.fftshift(inp)
+_ifftshift(inp::CUDA.CuArray) = CUDA.CUFFT.ifftshift(inp)
+_scalar_at(inp::CUDA.CuArray, idx) = CUDA.@allowscalar inp[idx]
+
+_fft(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.fft(inp)
+_ifft(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.ifft(inp)
+_fftshift(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.fftshift(inp)
+_ifftshift(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.ifftshift(inp)
+_scalar_at(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}, idx) = CUDA.@allowscalar inp[idx]
+
+end
diff --git a/src/FFTRegGPU.jl b/src/FFTRegGPU.jl
index 708b98a..1ce1a9a 100644
--- a/src/FFTRegGPU.jl
+++ b/src/FFTRegGPU.jl
@@ -1,11 +1,25 @@
+"""
+    FFTRegGPU
+
+Phase-correlation based translation registration for 2D images with array-type
+driven backend dispatch (CPU via FFTW, GPU via CUDA extensions).
+
+The core API expects Fourier-domain images (`fft(img)`) and exposes:
+- `dftreg!` for integer-pixel translation estimation,
+- `dftreg_subpix!` for subpixel translation estimation,
+- `subpix_shift!` and `dftreg_resample!` for Fourier-domain shifting/resampling,
+- `reg_stack_translate!` for sequential z-stack registration.
+"""
 module FFTRegGPU
-using CUDA, AbstractFFTs
+using AbstractFFTs
 
 include("dftreg_translate.jl")
 
-export dftreg_gpu!,
-	subpix_shift_gpu!,
-	dftreg_resample_gpu!,
-	reg_stack_translate!
+export dftreg!,
+    dftreg_subpix!,
+    subpix_shift!,
+    dftreg_resample,
+    dftreg_resample!,
+    reg_stack_translate!
 
 end # module
diff --git a/src/dftreg_translate.jl b/src/dftreg_translate.jl
index c37f285..65a18f6 100644
--- a/src/dftreg_translate.jl
+++ b/src/dftreg_translate.jl
@@ -1,64 +1,265 @@
-function dftups(inp::AbstractArray{T,N},no,usfac::Int=1,offset=zeros(N)) where {T,N}
-    sz = [size(inp)...]
+"""
+    _fft(inp)
+    _ifft(inp)
+    _ifft!(inp)
+    _plan_fft_inplace(inp)
+    _fft_inplace!(inp)
+    _fft_inplace!(inp, plan)
+    _fftshift(inp)
+    _ifftshift(inp)
+
+Internal backend hooks used by FFTRegGPU algorithms.
+
+These methods default to `AbstractFFTs` behavior and are extended in
+`ext/FFTRegGPUCPUExt.jl` and `ext/FFTRegGPUCUDAExt.jl` so that dispatch follows
+the input array type.
+"""
+_fft(inp::AbstractArray) = fft(inp)
+_ifft(inp::AbstractArray) = ifft(inp)
+_ifft!(inp::AbstractArray) = copyto!(inp, _ifft(inp))
+_plan_fft_inplace(inp::AbstractArray{<:Complex}) = nothing
+_fft_inplace!(inp::AbstractArray{<:Complex}) = copyto!(inp, _fft(inp))
+_fft_inplace!(inp::AbstractArray{<:Complex}, ::Nothing) = _fft_inplace!(inp)
+_fft_inplace!(inp::AbstractArray{<:Complex}, plan) = _fft_inplace!(inp)
+_fftshift(inp::AbstractArray) = fftshift(inp)
+_ifftshift(inp::AbstractArray) = ifftshift(inp)
+
+"""
+    _scalar_at(inp, idx)
+
+Read a scalar element from `inp[idx]`.
+
+CUDA extensions overload this helper with `@allowscalar` so peak extraction can
+remain explicit and localized.
+"""
+_scalar_at(inp::AbstractArray, idx) = inp[idx]
+
+"""
+    _backend_template(inp)
+
+Return a parent array/container that represents where allocations should happen
+for `inp` (for example the underlying `CuArray` for wrapped views).
+"""
+_backend_template(inp::AbstractArray) = inp
+_backend_template(inp::SubArray) = _backend_template(parent(inp))
+_backend_template(inp::Base.ReshapedArray) = _backend_template(parent(inp))
+_backend_template(inp::Base.PermutedDimsArray) = _backend_template(parent(inp))
+
+"""
+    _to_backend(ref, src)
+
+Allocate a new array with the same backend/container family as `ref`, then copy
+`src` into it.
+
+This is used to keep temporary arrays on the same device as the active
+computation (for example CPU `Array` vs GPU `CuArray`).
+"""
+function _to_backend(ref::AbstractArray, src::AbstractArray)
+    tmpl = _backend_template(ref)
+    out = similar(tmpl, eltype(src), size(src))
+    copyto!(out, src)
+    out
+end
+
+"""
+    _findmax_abs2_loc(inp, [work])
+
+Return the location of the maximum of `abs2.(inp)`.
+
+If `work` is provided, it must match `size(inp)` and is reused as a scratch
+buffer to avoid allocating `abs2.(inp)`.
+"""
+function _findmax_abs2_loc(inp::AbstractArray{<:Complex}, work::Union{Nothing,AbstractArray{<:Real}}=nothing)
+    if work === nothing
+        _, loc = findmax(abs2.(inp))
+        return loc
+    end
+
+    size(work) == size(inp) || throw(DimensionMismatch("work must have the same size as input"))
+    @. work = abs2(inp)
+    _, loc = findmax(work)
+    loc
+end
+
+"""
+    dftups(inp, no, usfac=1, offset=nothing)
+
+Compute an upsampled DFT region by matrix multiplication (Guizar-Sicairos style)
+without zero-padding the full spectrum.
+
+`inp` is typically the Fourier-domain cross-power spectrum. `no` is the output
+size per dimension, `usfac` is the upsampling factor, and `offset` optionally
+sets the center offset (per-dimension) of the sampled DFT window.
+"""
+function dftups(inp::AbstractArray{T,N}, no::Integer, usfac::Int=1, offset=nothing) where {T<:Number,N}
+    no > 0 || throw(ArgumentError("no must be positive"))
+    usfac > 0 || throw(ArgumentError("usfac must be positive"))
+
+    Treal = float(real(T))
+    offset_vals = offset === nothing ? ntuple(_ -> zero(Treal), N) : ntuple(i -> Treal(offset[i]), N)
+
+    sz = ntuple(i -> size(inp, i), N)
     permV = 1:N
+    out = inp
     for i in permV
-        inp = permutedims(inp,[i;deleteat!(collect(permV),i)])
-        kern = exp.((-1im*2*pi/(sz[i]*usfac))*((0:(no-1)).-offset[i])*transpose(ifftshift(0:(sz[i]-1)).-floor(sz[i]/2)))
-        d = size(inp)[2:N]
-        inp = kern * reshape(inp, Val(2))
-        inp = reshape(inp,(no,d...))
+        out = permutedims(out, [i; deleteat!(collect(permV), i)])
+
+        row = Treal.(0:(no - 1)) .- offset_vals[i]
+        col = Treal.(ifftshift(0:(sz[i] - 1)) .- div(sz[i], 2))
+        phase = -complex(zero(Treal), Treal(2 * pi / (sz[i] * usfac)))
+        kern_host = exp.(phase .* (row * transpose(col)))
+        kern = _to_backend(out, convert.(eltype(out), kern_host))
+
+        d = size(out)[2:N]
+        out = kern * reshape(out, Val(2))
+        out = reshape(out, (no, d...))
     end
-    permutedims(inp,collect(ndims(inp):-1:1))
+    permutedims(out, collect(ndims(out):-1:1))
 end
 
-function dftreg_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuArray{Complex{Float32},2},
-    CC_g::CuArray{Complex{Float32},2})
-    L = length(img1_f_g)
-    CC_g .= CUFFT.ifft(img1_f_g .* CUDA.conj(img2_f_g))
-    loc = argmax(abs.(CC_g))
-    CCmax = Array(CC_g)[loc]
-    rfzero = sum(abs2, img1_f_g) / L
-    rgzero = sum(abs2, img2_f_g) / L
-    error = abs(1 - CCmax * conj(CCmax) / (rgzero * rfzero))
-    diffphase = atan(imag(CCmax),real(CCmax))
-
-    indi = size(img1_f_g)
-    ind2 = tuple([div(x,2) for x in indi]...)
-
-    locI = [Tuple(loc)...]
-
-    shift = zeros(size(locI))
-    for i = eachindex(locI)
-        if locI[i]>ind2[i]
-            shift[i]=locI[i]-indi[i]-1
-        else shift[i]=locI[i]-1
+"""
+    dftreg!(img1_f, img2_f, CC; cc_abs2_work=nothing) -> (error, shift, diffphase)
+
+Estimate integer-pixel translation between two 2D images in the Fourier domain.
+
+`img1_f` and `img2_f` must be Fourier transforms of real images of equal size.
+`CC` is a complex work/output buffer with the same size and is overwritten with
+the inverse-FFT cross-correlation field.
+
+Returns:
+- `error`: translation-invariant normalized RMS-like mismatch metric,
+- `shift`: real-valued vector `(dx, dy)` to apply to `img2` to align to `img1`,
+- `diffphase`: global phase difference (radians).
+"""
+function dftreg!(
+    img1_f::AbstractMatrix{T1},
+    img2_f::AbstractMatrix{T2},
+    CC::AbstractMatrix{TC},
+    ;
+    cc_abs2_work::Union{Nothing,AbstractMatrix{<:Real}}=nothing,
+) where {T1<:Complex,T2<:Complex,TC<:Complex}
+    size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size"))
+    size(img1_f) == size(CC) || throw(DimensionMismatch("CC must have the same size as img1_f"))
+
+    Treal = float(real(promote_type(T1, T2, TC)))
+    L = Treal(length(img1_f))
+
+    @. CC = img1_f * conj(img2_f)
+    _ifft!(CC)
+
+    loc = _findmax_abs2_loc(CC, cc_abs2_work)
+    CCmax = _scalar_at(CC, loc)
+    rfzero = sum(abs2, img1_f) / L
+    rgzero = sum(abs2, img2_f) / L
+    error = abs(one(Treal) - CCmax * conj(CCmax) / (rgzero * rfzero))
+    diffphase = Treal(atan(imag(CCmax), real(CCmax)))
+
+    indi = size(img1_f)
+    ind2 = ntuple(i -> div(indi[i], 2), length(indi))
+    locI = Tuple(loc)
+    shift = zeros(Treal, length(locI))
+
+    for i in eachindex(locI)
+        if locI[i] > ind2[i]
+            shift[i] = locI[i] - indi[i] - 1
+        else
+            shift[i] = locI[i] - 1
         end
     end
-    
+
     error, shift, diffphase
 end
 
-function dftreg_subpix_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuArray{Complex{Float32},2},
-    CC2x_g::CuArray{Complex{Float32},2}, up_fac::Int=10)
-    ## initial estimate by 2x upsample
-    # embed 2x fft
-    dim_input = collect(size(img1_f_g))
-    ranges = [(x+1-div(x,2)):(x+1+div(x-1,2)) for x in dim_input]
-    CC2x_g .= 0
-    CC2x_g[ranges...] .= CUFFT.fftshift(img1_f_g) .* CUFFT.conj(CUFFT.fftshift(img2_f_g))
+"""
+    _embed_crosspower_2x!(CC2x, img1_f, img2_f) -> CC2x
+
+Write the 2x zero-padded cross-power spectrum directly in native FFT order.
+
+This is equivalent to constructing `fftshift(img1_f) .* conj.(fftshift(img2_f))`
+in the centered region of `CC2x` and then applying `ifftshift(CC2x)`, but avoids
+materializing the shifted arrays.
+"""
+function _embed_crosspower_2x!(
+    CC2x::AbstractMatrix{TC},
+    img1_f::AbstractMatrix{T1},
+    img2_f::AbstractMatrix{T2},
+) where {TC<:Complex,T1<:Complex,T2<:Complex}
+    size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size"))
+    m, n = size(img1_f)
+    expected_cc_size = (2m, 2n)
+    size(CC2x) == expected_cc_size ||
+        throw(DimensionMismatch("CC2x must have size $(expected_cc_size), got $(size(CC2x))"))
+
+    fill!(CC2x, zero(TC))
+
+    h1, t1 = cld(m, 2), fld(m, 2)
+    h2, t2 = cld(n, 2), fld(n, 2)
+
+    in1_h, in1_t = 1:h1, (h1 + 1):m
+    in2_h, in2_t = 1:h2, (h2 + 1):n
+    out1_h, out1_t = 1:h1, (2m - t1 + 1):(2m)
+    out2_h, out2_t = 1:h2, (2n - t2 + 1):(2n)
+
+    @views @. CC2x[out1_h, out2_h] = img1_f[in1_h, in2_h] * conj(img2_f[in1_h, in2_h])
+
+    if t2 > 0
+        @views @. CC2x[out1_h, out2_t] = img1_f[in1_h, in2_t] * conj(img2_f[in1_h, in2_t])
+    end
+    if t1 > 0
+        @views @. CC2x[out1_t, out2_h] = img1_f[in1_t, in2_h] * conj(img2_f[in1_t, in2_h])
+    end
+    if t1 > 0 && t2 > 0
+        @views @. CC2x[out1_t, out2_t] = img1_f[in1_t, in2_t] * conj(img2_f[in1_t, in2_t])
+    end
+
+    CC2x
+end
+
+"""
+    dftreg_subpix!(img1_f, img2_f, CC2x, up_fac=10; cc2x_abs2_work=nothing)
+        -> (error, shift, diffphase)
+
+Estimate subpixel translation between two 2D Fourier-domain images.
+
+Algorithm:
+1. Build a 2x upsampled cross-correlation grid in `CC2x` for a coarse estimate.
+2. Optionally refine the peak with matrix-multiply DFT (`dftups`) at `up_fac`.
+
+`CC2x` must be size `(2*size(img1_f,1), 2*size(img1_f,2))`. Returned `shift`
+is the translation to apply to `img2` so it aligns with `img1`.
+"""
+function dftreg_subpix!(
+    img1_f::AbstractMatrix{T1},
+    img2_f::AbstractMatrix{T2},
+    CC2x::AbstractMatrix{TC},
+    up_fac::Int=10,
+    ;
+    cc2x_abs2_work::Union{Nothing,AbstractMatrix{<:Real}}=nothing,
+) where {T1<:Complex,T2<:Complex,TC<:Complex}
+    size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size"))
+    up_fac > 0 || throw(ArgumentError("up_fac must be positive"))
+    Treal = float(real(promote_type(T1, T2, TC)))
+
+    # initial estimate by 2x upsample
+    dim_input = size(img1_f)
+    expected_cc_size = ntuple(i -> 2 * dim_input[i], length(dim_input))
+    size(CC2x) == expected_cc_size ||
+        throw(DimensionMismatch("CC2x must have size $(expected_cc_size), got $(size(CC2x))"))
+
+    _embed_crosspower_2x!(CC2x, img1_f, img2_f)
 
     # compute cross-correlation and locate the peak
-    CC2x_g = CUFFT.ifft(CUFFT.ifftshift(CC2x_g))
-    loc = argmax(abs.(CC2x_g))
+    _ifft!(CC2x)
+    loc = _findmax_abs2_loc(CC2x, cc2x_abs2_work)
 
-    indi = size(CC2x_g)
+    indi = size(CC2x)
     locI = collect(Tuple(loc))
-    CC2x_max = Array(CC2x_g)[loc]
+    CC2x_max = _scalar_at(CC2x, loc)
 
     # obtain shift in original pixel grid
-    ind2 = indi ./ 2
-    shift = zeros(size(locI))
-    for i = eachindex(locI)
+    ind2 = Treal.(indi) ./ Treal(2)
+    shift = zeros(Treal, length(locI))
+    for i in eachindex(locI)
         if locI[i] > ind2[i]
             shift[i] = locI[i] - indi[i] - 1
         else
@@ -67,88 +268,210 @@ function dftreg_subpix_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuA
     end
     shift = shift / 2
 
-    ## refine subpixel estimation
+    # refine subpixel estimation
     if up_fac > 2
-        # refine the estimate with matrix multiply DFT
-        shift = round.(Int, shift * up_fac) / up_fac # initial shift estimate
-        dft_shift = ceil(up_fac * 1.5) / 2 # center of output at dft_shift + 1
-
-        # mat multiplies dft around the current shift estimate
-        CC_refine = dftups(Array(img2_f_g .* CUFFT.conj(img1_f_g)), ceil(Int, up_fac * 1.5),
-            up_fac, dft_shift .- shift * up_fac) / (prod(ind2) * up_fac ^ 2)
-
-        # locate max and map back to the original grid
-        loc = argmax(abs.(CC_refine))
-        locI = Tuple(loc)
-        CC_refine_max = CC_refine[loc]
-        locI = locI .- dft_shift .- 1
-        shift = shift .+ locI ./ up_fac
-        
-        img1_00 = dftups(Array(img1_f_g .* CUFFT.conj(img1_f_g)), 1, up_fac)[1] / (prod(ind2) * up_fac ^ 2)
-        img2_00 = dftups(Array(img2_f_g .* CUFFT.conj(img2_f_g)), 1, up_fac)[1] / (prod(ind2) * up_fac ^ 2)
+        up = Treal(up_fac)
+        shift = round.(Treal, shift .* up) ./ up # initial shift estimate
+        dft_shift = Treal(ceil(up_fac * 1.5) / 2) # center of output at dft_shift + 1
+        denom = prod(ind2) * up ^ 2
+
+        CC_refine = dftups(
+            img2_f .* conj.(img1_f),
+            ceil(Int, up_fac * 1.5),
+            up_fac,
+            dft_shift .- shift .* up,
+        ) / denom
+
+        loc = _findmax_abs2_loc(CC_refine)
+        locI_ref = Treal.(Tuple(loc))
+        CC_refine_max = _scalar_at(CC_refine, loc)
+        shift = shift .+ (locI_ref .- dft_shift .- one(Treal)) ./ up
+
+        img1_00 = _scalar_at(dftups(img1_f .* conj.(img1_f), 1, up_fac), 1) / denom
+        img2_00 = _scalar_at(dftups(img2_f .* conj.(img2_f), 1, up_fac), 1) / denom
         CC_max = CC_refine_max
     else
-        img1_00 = sum(img1_f_g .* CUFFT.conj(img1_f_g)) / prod(indi)
-        img2_00 = sum(img2_f_g .* CUFFT.conj(img2_f_g)) / prod(indi)
+        img1_00 = sum(img1_f .* conj.(img1_f)) / prod(indi)
+        img2_00 = sum(img2_f .* conj.(img2_f)) / prod(indi)
         CC_max = CC2x_max
     end
-    
+
     error = 1 - CC_max * conj(CC_max) / (img1_00 * img2_00)
-    error = sqrt(abs.(error))
-    diffphase = atan(imag(CC_max), real(CC_max))
-    
+    error = sqrt(abs(error)) |> Treal
+    diffphase = Treal(atan(imag(CC_max), real(CC_max)))
+
     error, shift, diffphase
 end
 
-function subpix_shift_gpu!(img_f_g::CuArray{Complex{Float32},2}, N_g::CuArray{Float32,2}, shift, diffphase)
-    sz = [size(img_f_g)...]
-    N_ = Float32(0)
-    for i = eachindex(sz)
-        shifti = ifftshift((-div(sz[i],2)):(ceil(Integer,sz[i]/2)-1))*shift[i]/sz[i]
-        resh = (repeat([1],inner=[i-1])...,length(shifti))
-        N_ = N_ .- Float32.(reshape(shifti,resh))
+"""
+    subpix_shift!(out, img_f, N, shift, diffphase) -> out
+
+Apply a subpixel translation and global phase correction directly in Fourier
+space.
+
+`img_f` is the Fourier-domain moving image, `shift` is the translation vector,
+`diffphase` is the global phase correction, and `N` is a real scratch matrix
+matching `size(img_f)` used to accumulate the phase ramp. `out` is overwritten
+with the shifted Fourier-domain image.
+"""
+function subpix_shift!(
+    out::AbstractMatrix{<:Complex},
+    img_f::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real},
+    shift,
+    diffphase,
+)
+    size(out) == size(img_f) || throw(DimensionMismatch("out must have the same size as img_f"))
+    size(img_f) == size(N) || throw(DimensionMismatch("N must have the same size as img_f"))
+    length(shift) == ndims(img_f) || throw(DimensionMismatch("shift must match image dimensionality"))
+
+    sz = [size(img_f)...]
+    Tphase = float(real(eltype(out)))
+    TN = eltype(N)
+    fill!(N, zero(TN))
+
+    for i in eachindex(sz)
+        shifti = ifftshift((-div(sz[i], 2)):(ceil(Int, sz[i] / 2) - 1)) * shift[i] / sz[i]
+        resh = (ntuple(_ -> 1, i - 1)..., length(shifti))
+        shifti_backend = _to_backend(N, reshape(TN.(shifti), resh))
+        N .-= shifti_backend
     end
-    
-    copyto!(N_g, N_)
-    exp(1im * diffphase) .* (img_f_g .* exp.(Complex{Float32}(2im * pi) * N_g))
+
+    phase = cis(Tphase(diffphase))
+    twopi_im = complex(zero(Tphase), Tphase(2 * pi))
+    @. out = phase * (img_f * exp(twopi_im * N))
+    out
 end
 
-function dftreg_resample_gpu!(img_f_g::CuArray{Complex{Float32}, 2}, N_g::CuArray{Float32,2}, shift, diffphase)
-    real(CUFFT.ifft(subpix_shift_gpu!(img_f_g, N_g, shift, diffphase)))
+"""
+    subpix_shift!(img_f, N, shift, diffphase)
+
+Allocation convenience method for [`subpix_shift!`](@ref) that returns a newly
+allocated Fourier-domain output array.
+"""
+function subpix_shift!(
+    img_f::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real},
+    shift,
+    diffphase,
+)
+    out = similar(img_f)
+    subpix_shift!(out, img_f, N, shift, diffphase)
 end
 
-function reg_stack_translate!(img_stack_reg_g::CuArray{Float32,3}, img1_f_g::CuArray{Complex{Float32},2},
-        img2_f_g::CuArray{Complex{Float32},2}, CC2x_g::CuArray{Complex{Float32},2},
-        N_g::CuArray{Float32,2}; reg_param::Dict)
-    size_x, size_y, size_z = size(img_stack_reg_g)
-    # reset arrays
-    CC2x_g .= 0
-    N_g .= 0
-    
-    # register
+"""
+    dftreg_resample!(out, img_f, N, shift, diffphase; work_f=similar(img_f)) -> out
+
+Resample a Fourier-domain moving image into real-space after applying
+translation/phase correction.
+
+This combines [`subpix_shift!`](@ref) with inverse FFT and writes the real part
+to `out`. `work_f` is a complex scratch/output buffer for the shifted spectrum.
+"""
+function dftreg_resample!(
+    out::AbstractMatrix{<:Real},
+    img_f::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real},
+    shift,
+    diffphase,
+    ;
+    work_f::AbstractMatrix{<:Complex}=similar(img_f),
+)
+    size(out) == size(img_f) || throw(DimensionMismatch("out must have the same size as img_f"))
+    size(work_f) == size(img_f) || throw(DimensionMismatch("work_f must have the same size as img_f"))
+
+    subpix_shift!(work_f, img_f, N, shift, diffphase)
+    _ifft!(work_f)
+    out .= real.(work_f)
+    out
+end
+
+"""
+    dftreg_resample(img_f, N, shift, diffphase; work_f=similar(img_f))
+
+Allocation convenience method for [`dftreg_resample!`](@ref) that returns a new
+real-valued registered image.
+"""
+function dftreg_resample(
+    img_f::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real},
+    shift,
+    diffphase,
+    ;
+    work_f::AbstractMatrix{<:Complex}=similar(img_f),
+)
+    out = similar(N, float(real(eltype(img_f))), size(img_f))
+    dftreg_resample!(out, img_f, N, shift, diffphase; work_f=work_f)
+end
+
+"""
+    dftreg_resample!(img_f, N, shift, diffphase)
+
+Legacy convenience alias equivalent to [`dftreg_resample`](@ref).
+"""
+function dftreg_resample!(
+    img_f::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real},
+    shift,
+    diffphase,
+)
+    dftreg_resample(img_f, N, shift, diffphase)
+end
+
+"""
+    reg_stack_translate!(img_stack_reg, img1_f, img2_f, CC2x, N; reg_param=Dict())
+
+Register a 3D stack (`x, y, z`) by translating each plane `z` to align with the
+previous registered plane `z-1`.
+
+The operation is in-place on `img_stack_reg`. `img1_f`, `img2_f`, `CC2x`, and
+`N` are reusable work buffers:
+- `img1_f`, `img2_f`: Fourier-domain work images (`size == size(img_stack_reg[:,:,1])`)
+- `CC2x`: coarse cross-correlation buffer (2x in each planar dimension)
+- `N`: real phase-ramp scratch buffer
+
+Forward FFTs are executed in-place on `img1_f`/`img2_f`; when supported by the
+active backend (for example FFTW), plans are reused across all z-planes.
+
+If `reg_param[z]` exists, it must contain `(error, shift, diffphase)` and is
+reused instead of recomputing registration for plane `z`. Otherwise the tuple is
+computed and stored.
+"""
+function reg_stack_translate!(
+    img_stack_reg::AbstractArray{<:Real,3},
+    img1_f::AbstractMatrix{<:Complex},
+    img2_f::AbstractMatrix{<:Complex},
+    CC2x::AbstractMatrix{<:Complex},
+    N::AbstractMatrix{<:Real};
+    reg_param::AbstractDict{<:Integer}=Dict{Int,Any}(),
+)
+    _, _, size_z = size(img_stack_reg)
+    CC2x .= zero(eltype(CC2x))
+    N .= zero(eltype(N))
+    cc2x_abs2_work = similar(CC2x, float(real(eltype(CC2x))))
+    fft_plan1 = _plan_fft_inplace(img1_f)
+    fft_plan2 = _plan_fft_inplace(img2_f)
+
     for z = 2:size_z
         z1, z2 = z - 1, z
-        # copy data and fft
-        img1_g = view(img_stack_reg_g, :,:,z1)
-        img2_g = view(img_stack_reg_g, :,:,z2)
-        img1_f_g .= CUFFT.fft(img1_g)
-        img2_f_g .= CUFFT.fft(img2_g)
+        img1 = view(img_stack_reg, :, :, z1)
+        img2 = view(img_stack_reg, :, :, z2)
+        img1_f .= img1
+        _fft_inplace!(img1_f, fft_plan1)
+        img2_f .= img2
+        _fft_inplace!(img2_f, fft_plan2)
 
-        # register
         if !haskey(reg_param, z)
-            error, shift, diffphase = dftreg_subpix_gpu!(img1_f_g, img2_f_g, CC2x_g)
+            error, shift, diffphase = dftreg_subpix!(img1_f, img2_f, CC2x; cc2x_abs2_work=cc2x_abs2_work)
             reg_param[z] = (error, shift, diffphase)
         else
             error, shift, diffphase = reg_param[z]
         end
-        
-        # resample
-        img_stack_reg_g[:,:,z] .= dftreg_resample_gpu!(img2_f_g, N_g, shift, diffphase)
-        
-        # reset arrays
-        CC2x_g .= 0
-        N_g .= 0
+
+        dftreg_resample!(view(img_stack_reg, :, :, z), img2_f, N, shift, diffphase; work_f=img2_f)
+        CC2x .= zero(eltype(CC2x))
+        N .= zero(eltype(N))
     end
-    
+
     nothing
-end
\ No newline at end of file
+end
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..88a5505
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,6 @@
+using Test
+using FFTW
+using FFTRegGPU
+
+include("unit_tests.jl")
+include("synthetic_stack_tests.jl")
diff --git a/test/synthetic_stack_tests.jl b/test/synthetic_stack_tests.jl
new file mode 100644
index 0000000..cccdd33
--- /dev/null
+++ b/test/synthetic_stack_tests.jl
@@ -0,0 +1,71 @@
+using Random
+using Test
+
+function make_blob_image(nx::Int, ny::Int, rng::AbstractRNG; nspots::Int=10, sigma::Float32=2.8f0)
+    x = reshape(Float32.(0:(nx - 1)), nx, 1)
+    y = reshape(Float32.(0:(ny - 1)), 1, ny)
+    img = zeros(Float32, nx, ny)
+
+    two_sigma2 = 2f0 * sigma^2
+    for _ in 1:nspots
+        cx = rand(rng, Float32) * Float32(nx - 1)
+        cy = rand(rng, Float32) * Float32(ny - 1)
+        amp = 0.2f0 + 0.8f0 * rand(rng, Float32)
+        img .+= amp .* exp.(-((x .- cx).^2 .+ (y .- cy).^2) ./ two_sigma2)
+    end
+
+    img ./= maximum(img)
+    img
+end
+
+function stack_adjacent_mse(stack::AbstractArray{<:Real,3})
+    _, _, nz = size(stack)
+    nz <= 1 && return 0.0
+    acc = 0.0
+    for z in 2:nz
+        a = @view stack[:, :, z - 1]
+        b = @view stack[:, :, z]
+        acc += sum((a .- b) .^ 2) / length(a)
+    end
+    acc / (nz - 1)
+end
+
+@testset "synthetic noisy stack registration" begin
+    rng = MersenneTwister(20260313)
+    nx, ny = 64, 64
+    known_rel = [(1, -1), (2, 1), (-1, 2), (0, -2), (1, 1)]
+    nz = length(known_rel) + 1
+
+    base = make_blob_image(nx, ny, rng)
+    clean_stack = zeros(Float32, nx, ny, nz)
+    clean_stack[:, :, 1] .= base
+    for z in 2:nz
+        clean_stack[:, :, z] .= circshift(@view(clean_stack[:, :, z - 1]), known_rel[z - 1])
+    end
+
+    noise_sigma = 0.03f0
+    noisy_stack = clean_stack .+ noise_sigma .* randn(rng, Float32, nx, ny, nz)
+    reg_stack = copy(noisy_stack)
+
+    img1_f = zeros(ComplexF32, nx, ny)
+    img2_f = zeros(ComplexF32, nx, ny)
+    cc2x = zeros(ComplexF32, 2 * nx, 2 * ny)
+    nbuf = zeros(Float32, nx, ny)
+    reg_param = Dict{Int,Any}()
+
+    mse_before = stack_adjacent_mse(noisy_stack)
+    reg_stack_translate!(reg_stack, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param)
+    mse_after = stack_adjacent_mse(reg_stack)
+
+    @test length(reg_param) == nz - 1
+    cum_shift = zeros(Float32, 2)
+    for z in 2:nz
+        cum_shift .+= Float32.(collect(known_rel[z - 1]))
+        _, shift, _ = reg_param[z]
+        expected = -cum_shift
+        @test length(shift) == 2
+        @test all(isapprox.(shift, expected; atol=0.35f0))
+    end
+
+    @test mse_after < 0.75 * mse_before
+end
diff --git a/test/unit_tests.jl b/test/unit_tests.jl
new file mode 100644
index 0000000..76bc75a
--- /dev/null
+++ b/test/unit_tests.jl
@@ -0,0 +1,92 @@
+using Random
+using Test
+
+@testset "FFTRegGPU unit tests" begin
+    rng = MersenneTwister(42)
+    nx, ny = 32, 24
+
+    img = rand(rng, Float32, nx, ny)
+    img_f = fft(img)
+
+    @testset "argument validation" begin
+        bad_cc = zeros(ComplexF32, nx + 1, ny)
+        @test_throws DimensionMismatch dftreg!(img_f, img_f, bad_cc)
+
+        cc2x = zeros(ComplexF32, 2 * nx, 2 * ny)
+        @test_throws ArgumentError dftreg_subpix!(img_f, img_f, cc2x, 0)
+
+        bad_cc2x = zeros(ComplexF32, 2 * nx + 1, 2 * ny)
+        @test_throws DimensionMismatch dftreg_subpix!(img_f, img_f, bad_cc2x, 10)
+
+        @test_throws ArgumentError FFTRegGPU.dftups(img_f, 0, 1)
+        @test_throws ArgumentError FFTRegGPU.dftups(img_f, 4, 0)
+
+        out_bad = zeros(Float32, nx + 1, ny)
+        nbuf = zeros(Float32, nx, ny)
+        @test_throws DimensionMismatch dftreg_resample!(out_bad, img_f, nbuf, (0f0, 0f0), 0f0)
+    end
+
+    @testset "identity registration" begin
+        cc = similar(img_f)
+        err, shift, phase = dftreg!(img_f, img_f, cc)
+
+        @test err ≤ 1f-5
+        @test isapprox(phase, 0f0; atol=1f-5)
+        @test eltype(shift) == Float32
+        @test all(isapprox.(shift, zero(eltype(shift)); atol=1f-5))
+    end
+
+    @testset "integer shift recovery" begin
+        shifted = circshift(img, (3, -2))
+        shifted_f = fft(shifted)
+        cc = similar(img_f)
+        _, shift, _ = dftreg!(img_f, shifted_f, cc)
+        @test all(isapprox.(shift, Float32[-3, 2]; atol=1f-5))
+    end
+
+    @testset "resampling API" begin
+        shift = Float32[2.0, -1.0]
+        phase = 0f0
+        nbuf1 = zeros(Float32, nx, ny)
+        nbuf2 = zeros(Float32, nx, ny)
+
+        moved = circshift(img, (2, -1))
+        moved_f = fft(moved)
+
+        alloc_out = dftreg_resample(moved_f, nbuf1, shift, phase)
+        inpl_out = similar(img)
+        work_f = similar(moved_f)
+        ret = dftreg_resample!(inpl_out, moved_f, nbuf2, shift, phase; work_f=work_f)
+
+        @test ret === inpl_out
+        @test alloc_out ≈ inpl_out atol = 1f-5
+
+        phase_shifted = similar(moved_f)
+        ret2 = subpix_shift!(phase_shifted, moved_f, nbuf2, (0f0, 0f0), 0f0)
+        @test ret2 === phase_shifted
+        @test phase_shifted ≈ moved_f atol = 1f-6
+    end
+
+    @testset "2x cross-power embedding matches shift pipeline" begin
+        for (mx, my) in ((6, 4), (5, 7))
+            a = rand(rng, Float32, mx, my)
+            b = rand(rng, Float32, mx, my)
+            a_f = fft(a)
+            b_f = fft(b)
+
+            cc_new = zeros(ComplexF32, 2 * mx, 2 * my)
+            cc_ref = similar(cc_new)
+
+            FFTRegGPU._embed_crosspower_2x!(cc_new, a_f, b_f)
+
+            ranges = [(x + 1 - div(x, 2)):(x + 1 + div(x - 1, 2)) for x in size(a_f)]
+            cc_ref .= 0
+            a_shift = FFTRegGPU._fftshift(a_f)
+            b_shift = FFTRegGPU._fftshift(b_f)
+            @views @. cc_ref[ranges...] = a_shift * conj(b_shift)
+            copyto!(cc_ref, FFTRegGPU._ifftshift(cc_ref))
+
+            @test cc_new == cc_ref
+        end
+    end
+end