diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f2667b6 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,48 @@ +name: CI + +on: + push: + pull_request: + workflow_dispatch: + +permissions: + contents: read + +jobs: + test: + name: Julia ${{ matrix.julia-version }} on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + julia-version: + - '1.12' + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Julia + uses: julia-actions/setup-julia@v2 + with: + version: ${{ matrix.julia-version }} + + - name: Cache Julia artifacts + uses: julia-actions/cache@v2 + + - name: Instantiate project + shell: julia --color=yes --project=. {0} + run: | + using Pkg + Pkg.instantiate() + Pkg.precompile() + + - name: Run tests + shell: julia --color=yes --project=. {0} + run: | + using Pkg + Pkg.test() diff --git a/.gitignore b/.gitignore index 9dd64dd..e4c76d7 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,5 @@ docs/site/ # environment. Manifest.toml +# Benchmark outputs +benchmark/results/ diff --git a/Project.toml b/Project.toml index 5212bb6..c25e9fb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,8 +1,23 @@ name = "FFTRegGPU" uuid = "86d2b4ce-0b7d-4e3e-98ec-23077478530b" -authors = ["Jungsoo Kim "] -version = "0.1.0" +authors = ["urlicht "] +version = "1.0.0" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" + +[weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" + +[extensions] +FFTRegGPUCUDAExt = "CUDA" +FFTRegGPUCPUExt = "FFTW" + +[extras] +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test", "FFTW", "Random"] diff --git a/README.md b/README.md index ededdfd..4264fd4 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,28 @@ # FFTRegGPU.jl -Fast FFT GPU registrtion using [phase correlation](https://en.wikipedia.org/wiki/Phase_correlation). This GPU version is based on [SubpixelRegistration.jl](https://github.com/romainFr/SubpixelRegistration.jl) +[![CI](https://github.com/urlicht/FFTRegGPU.jl/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/urlicht/FFTRegGPU.jl/actions/workflows/ci.yml) + +Fast FFT GPU registration using [phase correlation](https://en.wikipedia.org/wiki/Phase_correlation). This GPU version is based on [SubpixelRegistration.jl](https://github.com/romainFr/SubpixelRegistration.jl) Currently it only supports translation. ## Usage and example +![before vs after registration showing a slice (xy)](docs/img/before_after_xy_slice.png) +![before vs after registration showing xz MIP for a volume](docs/img/before_after_xz_MIP.png) + +### Backend setup +Load FFTRegGPU with the backend package you want to use: + +```julia +using FFTRegGPU +using FFTW # CPU backend +# using CUDA # CUDA backend +``` + +Backend selection is determined by array type at call time: +- `CuArray` inputs use the CUDA extension (CUFFT/CUDA kernels). +- `Array`/`StridedArray` inputs use the CPU extension (FFTW). +- You can load both `CUDA` and `FFTW`; dispatch still follows the input array types. + ### Registering a set of 2D images ```julia # allocate GPU memory @@ -13,26 +32,27 @@ img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) CC_g = CuArray{Complex{Float32}}(undef, size_x, size_y) N_g = CuArray{Float32}(undef, size_x, size_y) +img2_reg_g = CuArray{Float32}(undef, size_x, size_y) # copy to GPU copyto!(img1_g, Float32.(img1)) copyto!(img2_g, Float32.(img2)) -# perform FFT using CUFFT -img1_f_g .= CUFFT.fft(img1_g) -img2_f_g .= CUFFT.fft(img2_g) +# perform FFT using the active backend +img1_f_g .= fft(img1_g) +img2_f_g .= fft(img2_g) # register (find the optimal translation) -error, shift, diffphase = dftreg_gpu!(img1_f_g, img2_f_g, CC_g) +error, shift, diffphase = dftreg!(img1_f_g, img2_f_g, CC_g) -# resample the moving image -img2_reg_g = dftreg_resample_gpu!(img1_f_g, N_g, shift, diffphase) +# resample the moving image (in-place) +dftreg_resample!(img2_reg_g, img2_f_g, N_g, shift, diffphase) # copy to CPU Array(img2_reg_g) ``` ### Registering a set of 2D images (subpixel registration) -Use the function `dftreg_subpix_gpu!`. For the argument `CC2x_g`, the array size should be 2x of the image size. +Use the function `dftreg_subpix!`. For the argument `CC2x_g`, the array size should be 2x of the image size. ```julia # allocate GPU memory img1_g = CuArray{Float32}(undef, size_x, size_y) @@ -41,20 +61,21 @@ img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) CC2x_g = CuArray{Complex{Float32}}(undef, 2 * size_x, 2 * size_y) N_g = CuArray{Float32}(undef, size_x, size_y) +img2_reg_g = CuArray{Float32}(undef, size_x, size_y) # copy to GPU copyto!(img1_g, Float32.(img1)) copyto!(img2_g, Float32.(img2)) -# perform FFT using CUFFT -img1_f_g .= CUFFT.fft(img1_g) -img2_f_g .= CUFFT.fft(img2_g) +# perform FFT using the active backend +img1_f_g .= fft(img1_g) +img2_f_g .= fft(img2_g) # register (find the optimal translation) -error, shift, diffphase = dftreg_subpix_gpu!(img1_f_g, img2_f_g, CC2x_g) +error, shift, diffphase = dftreg_subpix!(img1_f_g, img2_f_g, CC2x_g) -# resample the moving image -img2_reg_g = dftreg_resample_gpu!(img1_f_g, N_g, shift, diffphase) +# resample the moving image (in-place) +dftreg_resample!(img2_reg_g, img2_f_g, N_g, shift, diffphase) # copy to CPU Array(img2_reg_g) @@ -63,9 +84,11 @@ Array(img2_reg_g) ### Registering z-stack - Moving targets on the stage can cause shearing in z-stack. To correct this, the images within the stack are registered together. - `reg_stack_translate!` is a memory-efficient and convenient function to register the frames in each z-stack. Here in the example, the script loads the z-stack at each time point, registers it, and then saves the registered z-stack. +- To ensure this runs on CUDA: make all working arrays `CuArray`, check `CUDA.functional()`, and optionally set `CUDA.allowscalar(false)` to catch accidental scalar fallback. +- For repeated large host<->device transfers, use a long-lived pinned host buffer (`CUDA.pin`) for better copy throughput. ```julia size_x, size_y, size_z = 256, 256, 94 -img_stack_reg = zeros(Float32, size_x, size_y, size_z) +img_stack_h = CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z)) img_stack_reg_g = CuArray{Float32}(undef, size_x, size_y, size_z) img1_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) img2_f_g = CuArray{Complex{Float32}}(undef, size_x, size_y) @@ -73,22 +96,136 @@ CC2x_g = CuArray{Complex{Float32}}(undef, 2 * size_x, 2 * size_y) N_g = CuArray{Float32}(undef, size_x, size_y) @showprogress for t = 1:100 - copyto!(img_stack_reg_g, get_zstack(t)) # copy data to GPU + get_zstack!(img_stack_h, t) # load data to pinned host buffer + copyto!(img_stack_reg_g, img_stack_h) # copy data to GPU reg_stack_translate!(img_stack_reg_g, img1_f_g, img2_f_g, CC2x_g, N_g) # register - copyto!(img_stack_reg, img_stack_reg_g) # copy result to GPU - save_zstack(t, img_stack_reg) + copyto!(img_stack_h, img_stack_reg_g) # copy result to CPU + save_zstack(t, img_stack_h) +end +``` + +### Registering z-stack in batches (CUDA, higher throughput) +If you have many stacks (`t = 1:nt`), process them in micro-batches to improve GPU utilization and overlap CPU I/O with GPU work. + +```julia +using FFTRegGPU +using CUDA +using ProgressMeter +using Base.Threads + +CUDA.functional() || error("CUDA is not functional") +CUDA.allowscalar(false) + +size_x, size_y, size_z = 256, 256, 94 +nt = 100 +B = 4 # micro-batch size; tune 2, 4, 8... + +# One independent workspace per batch slot. +h_in = [CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z)) for _ in 1:B] +h_out = [CUDA.pin(Array{Float32}(undef, size_x, size_y, size_z)) for _ in 1:B] +d_stack = [CuArray{Float32}(undef, size_x, size_y, size_z) for _ in 1:B] +img1_f = [CuArray{ComplexF32}(undef, size_x, size_y) for _ in 1:B] +img2_f = [CuArray{ComplexF32}(undef, size_x, size_y) for _ in 1:B] +CC2x = [CuArray{ComplexF32}(undef, 2 * size_x, 2 * size_y) for _ in 1:B] +N = [CuArray{Float32}(undef, size_x, size_y) for _ in 1:B] +reg_param = [Dict{Int,Any}() for _ in 1:B] + +@showprogress for t0 in 1:B:nt + k = min(B, nt - t0 + 1) + + # Load stacks on CPU. + @threads for i in 1:k + get_zstack!(h_in[i], t0 + i - 1) + end + + # Process stacks concurrently on GPU. + @sync for i in 1:k + @spawn begin + empty!(reg_param[i]) + CUDA.@sync begin + copyto!(d_stack[i], h_in[i]) # H2D + reg_stack_translate!(d_stack[i], img1_f[i], img2_f[i], CC2x[i], N[i]; reg_param=reg_param[i]) + copyto!(h_out[i], d_stack[i]) # D2H + end + end + end + + # Save stacks on CPU. + @threads for i in 1:k + save_zstack(t0 + i - 1, h_out[i]) + end end ``` +Notes: +- Run Julia with multiple threads (e.g. `julia -t auto`) to benefit from `@threads`/`@spawn`. +- `B` is the main tuning parameter; increase it until throughput stops improving or GPU memory becomes limiting. + ## Performance -Measured time to call `reg_stack_translate!` -Hardware: GTX 1080 -Julia v1.5.3 and CUDA.jl v.2.4.1 - -| size (x,y,z) | mean time (ms) | -| - | - | -| 32,32,32 | 43.817 ms | -| 64,64,64 | 91.215 ms | -| 128,128,64 | 103.022 ms | -| 256,256,64 | 131.046 ms | -| 256,256,256 | 525.257 ms | +### Benchmark command +```bash +julia --project=benchmark benchmark/run_benchmarks.jl --backend=both --samples=20 --sizes=64x64,128x128,256x256,512x512,2048x2048 --stack=128x128x128,256x256x256 --output=benchmark/results/both.csv +``` + +### Benchmark results +Time in ms + +| backend | case | dims | median | mean | min | memory | allocs | +| :--- | :--- | :--- | ---: | ---: | ---: | ---: | ---: | +| cpu | dftreg | 64x64 | 0.060 | 0.066 | 0.059 | 400 | 7 | +| cpu | dftreg_subpix | 64x64 | 0.410 | 0.408 | 0.306 | 295408 | 247 | +| cpu | dftreg | 128x128 | 0.204 | 0.214 | 0.203 | 400 | 7 | +| cpu | dftreg_subpix | 128x128 | 3.687 | 3.661 | 3.331 | 968880 | 247 | +| cpu | dftreg | 256x256 | 2.970 | 2.978 | 2.960 | 400 | 7 | +| cpu | dftreg_subpix | 256x256 | 99.947 | 89.491 | 18.744 | 3493856 | 269 | +| cpu | dftreg | 512x512 | 17.416 | 17.440 | 17.333 | 400 | 7 | +| cpu | dftreg_subpix | 512x512 | 196.452 | 190.043 | 69.564 | 13262576 | 279 | +| cpu | dftreg | 2048x2048 | 258.009 | 258.358 | 256.690 | 400 | 7 | +| cpu | dftreg_subpix | 2048x2048 | 1790.361 | 1768.056 | 1710.282 | 204021056 | 279 | +| cpu | reg_stack_translate | 128x128x128 | 498.022 | 503.165 | 493.216 | 124292056 | 36460 | +| cpu | reg_stack_translate | 256x256x256 | 28044.507 | 28044.507 | 28044.507 | 895572952 | 79316 | +| cuda | dftreg | 64x64 | 0.683 | 0.760 | 0.586 | 15104 | 487 | +| cuda | dftreg_subpix | 64x64 | 1.412 | 1.409 | 0.986 | 118320 | 2051 | +| cuda | dftreg | 128x128 | 0.215 | 0.235 | 0.211 | 15008 | 474 | +| cuda | dftreg_subpix | 128x128 | 1.439 | 1.457 | 1.081 | 166896 | 2051 | +| cuda | dftreg | 256x256 | 0.220 | 0.241 | 0.215 | 15072 | 483 | +| cuda | dftreg_subpix | 256x256 | 1.495 | 1.510 | 1.065 | 263472 | 2076 | +| cuda | dftreg | 512x512 | 0.232 | 0.255 | 0.228 | 15152 | 492 | +| cuda | dftreg_subpix | 512x512 | 1.927 | 3.169 | 1.334 | 457184 | 2186 | +| cuda | dftreg | 2048x2048 | 1.845 | 1.849 | 1.589 | 24432 | 1028 | +| cuda | dftreg_subpix | 2048x2048 | 11.011 | 10.937 | 9.759 | 1622016 | 2752 | +| cuda | reg_stack_translate | 128x128x128 | 182.471 | 212.267 | 165.538 | 24102736 | 324324 | +| cuda | reg_stack_translate | 256x256x256 | 382.678 | 389.114 | 370.962 | 74343728 | 659790 | + +### Configuration +``` +CUDA toolchain: +- runtime 13.2, artifact installation +- driver 570.181.0 for 13.2 +- compiler 13.2 + +CUDA libraries: +- CUBLAS: 13.3.0 +- CURAND: 10.4.2 +- CUFFT: 12.2.0 +- CUSOLVER: 12.1.0 +- CUSPARSE: 12.7.9 +- CUPTI: 2026.1.0 (API 13.2.0) +- NVML: 12.0.0+570.181 + +Julia packages: +- CUDA: 5.11.0 +- GPUArrays: 11.4.1 +- GPUCompiler: 1.8.2 +- KernelAbstractions: 0.9.40 +- CUDA_Driver_jll: 13.2.0+0 +- CUDA_Compiler_jll: 0.4.2+0 +- CUDA_Runtime_jll: 0.21.0+0 + +Toolchain: +- Julia: 1.12.5 +- LLVM: 18.1.7 + +1 device: + 0: NVIDIA GeForce RTX 4060 Ti (sm_89, 15.581 GiB / 15.996 GiB available) +``` diff --git a/benchmark/Project.toml b/benchmark/Project.toml new file mode 100644 index 0000000..bbedb82 --- /dev/null +++ b/benchmark/Project.toml @@ -0,0 +1,7 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +FFTRegGPU = "86d2b4ce-0b7d-4e3e-98ec-23077478530b" + +[sources] +FFTRegGPU = {path = ".."} diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..e2f336c --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,69 @@ +# Benchmarking FFTRegGPU + +This benchmark suite is deterministic (fixed synthetic data seed) and reusable across commits. + +## Setup + +```bash +julia --project=benchmark -e 'using Pkg; Pkg.instantiate()' +``` + +## Run + +From the repository root: + +```bash +julia --project=benchmark benchmark/run_benchmarks.jl --backend=cpu --samples=30 --output=benchmark/results/cpu.csv +``` + +To run CUDA too: + +```bash +julia --project=benchmark -e 'using Pkg; Pkg.add("CUDA")' +julia --project=benchmark benchmark/run_benchmarks.jl --backend=both --samples=20 --output=benchmark/results/both.csv +``` + +If CUDA is not functional on the machine, CUDA cases are skipped automatically. + +## Compare `reg_stack_translate!` across two commits + +Run this standalone commit comparison benchmark (CUDA only): + +```bash +julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl \ + --current-ref=HEAD \ + --previous-ref=HEAD~1 \ + --stack=256x256x64,256x256x256 \ + --samples=20 +``` + +You can manually provide any two refs (branches, tags, or SHAs): + +```bash +julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl \ + --current-ref=3a1b2c4 \ + --previous-ref=f09d8e7 +``` + +The script prints: +- Median timing comparison (`current_ms`, `prev_ms`, ratio, percent delta) +- Output equivalence (`isapprox` + max absolute difference) per stack size + +## Useful options + +- `--backend=cpu|cuda|both` +- `--samples=N` +- `--evals=N` +- `--seed=N` +- `--sizes=128x128,256x256` +- `--stack=256x256x64` +- `--noise=0.02` +- `--output=PATH` +- `--current-ref=REF` +- `--previous-ref=REF` + +Use `--help` for all options: + +```bash +julia --project=benchmark benchmark/run_benchmarks.jl --help +``` diff --git a/benchmark/compare_reg_stack_translate_commits.jl b/benchmark/compare_reg_stack_translate_commits.jl new file mode 100644 index 0000000..02c1b4b --- /dev/null +++ b/benchmark/compare_reg_stack_translate_commits.jl @@ -0,0 +1,406 @@ +using Printf +using Serialization + +const DEFAULT_SEED = 20260313 + +struct CompareConfig + repo_path::String + current_ref::String + previous_ref::String + stack_sizes::Vector{NTuple{3,Int}} + samples::Int + evals::Int + seed::Int + noise_sigma::Float32 +end + +function parse_dim3(token::AbstractString) + m = match(r"^(\d+)x(\d+)x(\d+)$", lowercase(strip(token))) + m === nothing && error("Invalid 3D size '$token'. Use NxMxZ (e.g. 256x256x64).") + parse(Int, m.captures[1]), parse(Int, m.captures[2]), parse(Int, m.captures[3]) +end + +function parse_stack_sizes(token::AbstractString) + [parse_dim3(s) for s in split(token, ",")] +end + +function parse_args(args::Vector{String}) + cfg = Dict( + :repo_path => abspath(joinpath(@__DIR__, "..")), + :current_ref => "HEAD", + :previous_ref => "HEAD~1", + :stack_sizes => [(256, 256, 64), (256, 256, 256)], + :samples => 20, + :evals => 1, + :seed => DEFAULT_SEED, + :noise_sigma => 0.02f0, + ) + + for arg in args + if arg == "--help" || arg == "-h" + println(""" +Usage: + julia --project=benchmark benchmark/compare_reg_stack_translate_commits.jl [options] + +Options: + --repo=PATH Path to FFTRegGPU repository. Default: repo root + --current-ref=REF Current/reference A git ref. Default: HEAD + --previous-ref=REF Baseline/reference B git ref. Default: HEAD~1 + --stack=256x256x64,256x256x256 Stack sizes to benchmark. Default: 256x256x64,256x256x256 + --samples=N BenchmarkTools samples per case. Default: 20 + --evals=N BenchmarkTools evals per sample. Default: 1 + --seed=N RNG seed for synthetic data. Default: $(DEFAULT_SEED) + --noise=0.02 Added Gaussian noise sigma. Default: 0.02 +""") + exit(0) + elseif startswith(arg, "--repo=") + cfg[:repo_path] = abspath(split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--current-ref=") + cfg[:current_ref] = split(arg, "=", limit=2)[2] + elseif startswith(arg, "--previous-ref=") + cfg[:previous_ref] = split(arg, "=", limit=2)[2] + elseif startswith(arg, "--stack=") + cfg[:stack_sizes] = parse_stack_sizes(split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--samples=") + cfg[:samples] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--evals=") + cfg[:evals] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--seed=") + cfg[:seed] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--noise=") + cfg[:noise_sigma] = Float32(parse(Float64, split(arg, "=", limit=2)[2])) + else + error("Unknown option: $arg") + end + end + + isdir(cfg[:repo_path]) || error("Repo path does not exist: $(cfg[:repo_path])") + cfg[:samples] > 0 || error("samples must be positive") + cfg[:evals] > 0 || error("evals must be positive") + isempty(cfg[:current_ref]) && error("current-ref must not be empty") + isempty(cfg[:previous_ref]) && error("previous-ref must not be empty") + isempty(cfg[:stack_sizes]) && error("stack must include at least one size") + + CompareConfig( + cfg[:repo_path], + cfg[:current_ref], + cfg[:previous_ref], + cfg[:stack_sizes], + cfg[:samples], + cfg[:evals], + cfg[:seed], + cfg[:noise_sigma], + ) +end + +const CHILD_CODE = raw""" +using Pkg +using Random +using Serialization + +repo_checkout = ARGS[1] +result_file = ARGS[2] +stack_arg = ARGS[3] +samples = parse(Int, ARGS[4]) +evals = parse(Int, ARGS[5]) +seed = parse(Int, ARGS[6]) +noise_sigma = Float32(parse(Float64, ARGS[7])) + +function parse_stack_sizes(token::AbstractString) + out = NTuple{3,Int}[] + for s in split(token, ';') + isempty(s) && continue + p = split(s, 'x') + length(p) == 3 || error("Invalid stack size: $s (expected NxMxZ)") + push!(out, (parse(Int, p[1]), parse(Int, p[2]), parse(Int, p[3]))) + end + out +end + +stack_sizes = parse_stack_sizes(stack_arg) +isempty(stack_sizes) && error("No stack sizes provided") + +env_dir = mktempdir() +Pkg.activate(env_dir) +Pkg.develop(PackageSpec(path=repo_checkout)) +Pkg.instantiate() +Pkg.add(["BenchmarkTools", "FFTW", "CUDA"]) + +using BenchmarkTools +using FFTRegGPU +using FFTW +using CUDA + +CUDA.functional() || error("CUDA is not functional on this machine") + +_safe_str(f, default="unknown") = try + string(f()) +catch + default +end + +function _cuda_info() + dev = CUDA.device() + ( + functional = true, + device_name = _safe_str(() -> CUDA.name(dev), string(dev)), + device_index = _safe_str(() -> CUDA.deviceid(dev)), + driver_version = _safe_str(CUDA.driver_version), + runtime_version = _safe_str(CUDA.runtime_version), + ) +end + +function make_base_image(nx::Int, ny::Int, rng::AbstractRNG) + x = reshape(collect(LinRange(0f0, 1f0, nx)), nx, 1) + y = reshape(collect(LinRange(0f0, 1f0, ny)), 1, ny) + img = @. 0.55f0 * sin(2f0 * π * (3f0 * x + 5f0 * y)) + 0.35f0 * cos(2f0 * π * (7f0 * x - 2f0 * y)) + img .+= 0.10f0 .* randn(rng, Float32, nx, ny) + img .-= minimum(img) + img ./= maximum(img) + img +end + +function make_stack(nx::Int, ny::Int, nz::Int, rng::AbstractRNG, noise_sigma::Float32) + stack = zeros(Float32, nx, ny, nz) + stack[:, :, 1] .= make_base_image(nx, ny, rng) + for z in 2:nz + stack[:, :, z] .= circshift(@view(stack[:, :, z - 1]), (1, -1)) + end + stack .+= noise_sigma .* randn(rng, Float32, nx, ny, nz) + stack +end + +function _resolve_reg_fn() + if isdefined(FFTRegGPU, :reg_stack_translate_gpu!) + return getfield(FFTRegGPU, :reg_stack_translate_gpu!) + elseif isdefined(FFTRegGPU, :reg_stack_translate!) + return getfield(FFTRegGPU, :reg_stack_translate!) + else + error("Neither reg_stack_translate_gpu! nor reg_stack_translate! exists in this commit") + end +end + +function call_reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf, reg_param) + f = _resolve_reg_fn() + try + return f(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) + catch err + if err isa MethodError + return f(stack_work, img1_f, img2_f, cc2x, nbuf) + end + rethrow(err) + end +end + +function run_case_rows(stack_sizes, samples, evals, seed, noise_sigma) + rows = NamedTuple[] + all_buffers_on_cuda = true + for (i, (nx, ny, nz)) in enumerate(stack_sizes) + rng = MersenneTwister(seed + 100 + i) + dims = "$(nx)x$(ny)x$(nz)" + + stack_template = CUDA.CuArray(make_stack(nx, ny, nz, rng, noise_sigma)) + stack_work = similar(stack_template) + img1_f = CUDA.zeros(ComplexF32, nx, ny) + img2_f = CUDA.zeros(ComplexF32, nx, ny) + cc2x = CUDA.zeros(ComplexF32, 2 * nx, 2 * ny) + nbuf = CUDA.zeros(Float32, nx, ny) + reg_param = Dict{Int,Any}() + all_buffers_on_cuda = all_buffers_on_cuda && ( + stack_template isa CUDA.CuArray && + stack_work isa CUDA.CuArray && + img1_f isa CUDA.CuArray && + img2_f isa CUDA.CuArray && + cc2x isa CUDA.CuArray && + nbuf isa CUDA.CuArray + ) + + copyto!(stack_work, stack_template) + call_reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf, reg_param) + CUDA.synchronize() + stack_out = Array(stack_work) + + trial = run(@benchmarkable begin + copyto!($stack_work, $stack_template) + empty!($reg_param) + call_reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf, $reg_param) + CUDA.synchronize() + end samples=samples evals=evals) + + med = BenchmarkTools.median(trial) + mn = BenchmarkTools.mean(trial) + mnm = BenchmarkTools.minimum(trial) + push!(rows, ( + dims = dims, + median_ms = med.time / 1e6, + mean_ms = mn.time / 1e6, + min_ms = mnm.time / 1e6, + memory_bytes = med.memory, + allocs = med.allocs, + stack_out = stack_out, + )) + end + + return rows, all_buffers_on_cuda +end + +rows, all_buffers_on_cuda = run_case_rows(stack_sizes, samples, evals, seed, noise_sigma) + +serialize( + result_file, + ( + rows = rows, + cuda_info = _cuda_info(), + all_buffers_on_cuda = all_buffers_on_cuda, + ), +) +""" + +function write_child_script() + dir = mktempdir() + path = joinpath(dir, "bench_compare_one_ref.jl") + write(path, CHILD_CODE) + path +end + +function checkout_ref(repo_path::AbstractString, ref::AbstractString) + dir = mktempdir() + run(`git clone --quiet $(String(repo_path)) $dir`) + run(`git -C $dir checkout --quiet $(String(ref))`) + dir +end + +function run_ref( + repo_path::AbstractString, + ref::AbstractString, + child_file::AbstractString, + stack_sizes::Vector{NTuple{3,Int}}, + samples::Int, + evals::Int, + seed::Int, + noise_sigma::Float32, +) + checkout = checkout_ref(repo_path, ref) + result_dir = mktempdir() + result_file = joinpath(result_dir, "result.bin") + stack_arg = join(["$(x)x$(y)x$(z)" for (x, y, z) in stack_sizes], ";") + + cmd = `$(Base.julia_cmd()) $(String(child_file)) $checkout $result_file $stack_arg $samples $evals $seed $noise_sigma` + + try + run(cmd) + commit = chomp(read(`git -C $checkout rev-parse --short HEAD`, String)) + payload = deserialize(result_file) + if payload isa NamedTuple && hasproperty(payload, :rows) + return ( + commit = commit, + rows = payload.rows, + cuda_info = hasproperty(payload, :cuda_info) ? payload.cuda_info : (functional = true,), + all_buffers_on_cuda = hasproperty(payload, :all_buffers_on_cuda) ? payload.all_buffers_on_cuda : false, + ) + end + + # Backward compatibility with older payload shape. + return ( + commit = commit, + rows = payload, + cuda_info = (functional = true,), + all_buffers_on_cuda = false, + ) + finally + rm(result_dir; force=true, recursive=true) + rm(checkout; force=true, recursive=true) + end +end + +function print_cuda_report(label::String, run_result) + info = run_result.cuda_info + device_name = hasproperty(info, :device_name) ? info.device_name : "unknown" + device_index = hasproperty(info, :device_index) ? info.device_index : "unknown" + driver = hasproperty(info, :driver_version) ? info.driver_version : "unknown" + runtime = hasproperty(info, :runtime_version) ? info.runtime_version : "unknown" + functional = hasproperty(info, :functional) ? info.functional : "unknown" + arrays_on_cuda = run_result.all_buffers_on_cuda ? "yes" : "no" + pad = repeat(" ", length(label)) + println(" ", label, ": functional=", functional, ", device=", device_name, " (id=", device_index, ")") + println(" ", pad, " driver=", driver, ", runtime=", runtime, ", arrays_on_cuda=", arrays_on_cuda) +end + +function print_table(curr, prev, current_ref::String, previous_ref::String) + curr_by_dims = Dict(r.dims => r for r in curr.rows) + prev_by_dims = Dict(r.dims => r for r in prev.rows) + dims_all = sort!(collect(intersect(keys(curr_by_dims), keys(prev_by_dims)))) + isempty(dims_all) && error("No overlapping stack dimensions between compared refs.") + + println("CUDA reg_stack_translate! comparison") + println("current: $(curr.commit) ($current_ref)") + println("previous: $(prev.commit) ($previous_ref)") + println("CUDA verification:") + print_cuda_report("current", curr) + print_cuda_report("previous", prev) + println() + + @printf("%-14s%14s%14s%12s%12s\n", "dims", "current_ms", "prev_ms", "curr/prev", "delta_%") + for d in dims_all + c = curr_by_dims[d] + p = prev_by_dims[d] + ratio = c.median_ms / p.median_ms + delta = (c.median_ms - p.median_ms) / p.median_ms * 100 + @printf("%-14s%14.3f%14.3f%12.3f%12.1f\n", d, c.median_ms, p.median_ms, ratio, delta) + end + + println() + println("Output equivalence (isapprox):") + @printf("%-14s%8s%12s\n", "dims", "same", "max_abs") + for d in dims_all + a = curr_by_dims[d].stack_out + b = prev_by_dims[d].stack_out + max_abs = maximum(abs.(a .- b)) + same = isapprox(a, b; rtol=1f-4, atol=1f-4) + @printf("%-14s%8s%12.3e\n", d, same ? "yes" : "no", max_abs) + end +end + +function main() + cfg = parse_args(ARGS) + + println("Commit comparison configuration:") + println(" repo: ", cfg.repo_path) + println(" current-ref: ", cfg.current_ref) + println(" previous-ref: ", cfg.previous_ref) + println(" samples: ", cfg.samples) + println(" evals: ", cfg.evals) + println(" seed: ", cfg.seed) + println(" stack: ", join(["$(x)x$(y)x$(z)" for (x, y, z) in cfg.stack_sizes], ", ")) + println(" noise: ", cfg.noise_sigma) + println() + + child_file = write_child_script() + try + curr = run_ref( + cfg.repo_path, + cfg.current_ref, + child_file, + cfg.stack_sizes, + cfg.samples, + cfg.evals, + cfg.seed, + cfg.noise_sigma, + ) + prev = run_ref( + cfg.repo_path, + cfg.previous_ref, + child_file, + cfg.stack_sizes, + cfg.samples, + cfg.evals, + cfg.seed, + cfg.noise_sigma, + ) + print_table(curr, prev, cfg.current_ref, cfg.previous_ref) + finally + rm(dirname(child_file); force=true, recursive=true) + end +end + +main() diff --git a/benchmark/run_benchmarks.jl b/benchmark/run_benchmarks.jl new file mode 100644 index 0000000..146a624 --- /dev/null +++ b/benchmark/run_benchmarks.jl @@ -0,0 +1,385 @@ +using BenchmarkTools +using FFTRegGPU +using FFTW +using Random +using Printf + +const DEFAULT_SEED = 20260313 + +struct BenchConfig + backend::Symbol + samples::Int + evals::Int + seed::Int + sizes::Vector{Tuple{Int,Int}} + stack_sizes::Vector{NTuple{3,Int}} + noise_sigma::Float32 + output::Union{Nothing,String} +end + +function parse_dim2(token::AbstractString) + m = match(r"^(\d+)x(\d+)$", lowercase(strip(token))) + m === nothing && error("Invalid 2D size '$token'. Use NxM (e.g. 256x256).") + parse(Int, m.captures[1]), parse(Int, m.captures[2]) +end + +function parse_dim3(token::AbstractString) + m = match(r"^(\d+)x(\d+)x(\d+)$", lowercase(strip(token))) + m === nothing && error("Invalid 3D size '$token'. Use NxMxZ (e.g. 256x256x64).") + parse(Int, m.captures[1]), parse(Int, m.captures[2]), parse(Int, m.captures[3]) +end + +function parse_sizes(token::AbstractString) + [parse_dim2(s) for s in split(token, ",")] +end + +function parse_stack_sizes(token::AbstractString) + [parse_dim3(s) for s in split(token, ",")] +end + +function parse_backend(token::AbstractString) + b = Symbol(lowercase(strip(token))) + b in (:cpu, :cuda, :both) || error("backend must be cpu, cuda, or both") + b +end + +function parse_args(args::Vector{String}) + cfg = Dict( + :backend => :both, + :samples => 20, + :evals => 1, + :seed => DEFAULT_SEED, + :sizes => [(128, 128), (256, 256)], + :stack_sizes => [(256, 256, 64)], + :noise_sigma => 0.02f0, + :output => nothing, + ) + + for arg in args + if arg == "--help" || arg == "-h" + println(""" +Usage: + julia --project=benchmark benchmark/run_benchmarks.jl [options] + +Options: + --backend=cpu|cuda|both Benchmark backend(s). Default: both + --samples=N BenchmarkTools samples per case. Default: 20 + --evals=N BenchmarkTools evals per sample. Default: 1 + --seed=N RNG seed for synthetic data. Default: $(DEFAULT_SEED) + --sizes=128x128,256x256 2D pair benchmark sizes. Default: 128x128,256x256 + --stack=256x256x64,256x256x96 + 3D stack benchmark sizes. Default: 256x256x64 + --noise=0.02 Added Gaussian noise sigma. Default: 0.02 + --output=PATH Optional CSV output path. +""") + exit(0) + elseif startswith(arg, "--backend=") + cfg[:backend] = parse_backend(split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--samples=") + cfg[:samples] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--evals=") + cfg[:evals] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--seed=") + cfg[:seed] = parse(Int, split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--sizes=") + cfg[:sizes] = parse_sizes(split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--stack=") + cfg[:stack_sizes] = parse_stack_sizes(split(arg, "=", limit=2)[2]) + elseif startswith(arg, "--noise=") + cfg[:noise_sigma] = Float32(parse(Float64, split(arg, "=", limit=2)[2])) + elseif startswith(arg, "--output=") + cfg[:output] = split(arg, "=", limit=2)[2] + else + error("Unknown option: $arg") + end + end + + cfg[:samples] > 0 || error("samples must be positive") + cfg[:evals] > 0 || error("evals must be positive") + isempty(cfg[:stack_sizes]) && error("stack must include at least one size") + + BenchConfig( + cfg[:backend], + cfg[:samples], + cfg[:evals], + cfg[:seed], + cfg[:sizes], + cfg[:stack_sizes], + cfg[:noise_sigma], + cfg[:output], + ) +end + +function make_base_image(nx::Int, ny::Int, rng::AbstractRNG) + x = reshape(collect(LinRange(0f0, 1f0, nx)), nx, 1) + y = reshape(collect(LinRange(0f0, 1f0, ny)), 1, ny) + img = @. 0.55f0 * sin(2f0 * π * (3f0 * x + 5f0 * y)) + 0.35f0 * cos(2f0 * π * (7f0 * x - 2f0 * y)) + img .+= 0.10f0 .* randn(rng, Float32, nx, ny) + img .-= minimum(img) + img ./= maximum(img) + img +end + +function make_integer_pair(nx::Int, ny::Int, rng::AbstractRNG, noise_sigma::Float32) + img1 = make_base_image(nx, ny, rng) + img2 = circshift(img1, (3, -2)) + img2 .+= noise_sigma .* randn(rng, Float32, nx, ny) + fft(img1), fft(img2) +end + +function make_subpixel_pair(nx::Int, ny::Int, rng::AbstractRNG, noise_sigma::Float32) + img1 = make_base_image(nx, ny, rng) + img1_f = fft(img1) + + shift = Float32[2.25, -1.75] + nbuf = zeros(Float32, nx, ny) + shifted_f = similar(img1_f) + subpix_shift!(shifted_f, img1_f, nbuf, shift, 0f0) + img2 = real(ifft(shifted_f)) + img2 .+= noise_sigma .* randn(rng, Float32, nx, ny) + img1_f, fft(img2) +end + +function make_stack(nx::Int, ny::Int, nz::Int, rng::AbstractRNG, noise_sigma::Float32) + stack = zeros(Float32, nx, ny, nz) + stack[:, :, 1] .= make_base_image(nx, ny, rng) + + for z in 2:nz + stack[:, :, z] .= circshift(@view(stack[:, :, z - 1]), (1, -1)) + end + stack .+= noise_sigma .* randn(rng, Float32, nx, ny, nz) + stack +end + +function push_summary!(rows, backend::String, case_name::String, dims::String, cfg::BenchConfig, trial::BenchmarkTools.Trial) + med = BenchmarkTools.median(trial) + mn = BenchmarkTools.mean(trial) + mnm = BenchmarkTools.minimum(trial) + push!( + rows, + ( + backend = backend, + case = case_name, + dims = dims, + median_ms = med.time / 1e6, + mean_ms = mn.time / 1e6, + min_ms = mnm.time / 1e6, + memory_bytes = med.memory, + allocs = med.allocs, + samples = cfg.samples, + evals = cfg.evals, + ), + ) +end + +function run_cpu_benchmarks(cfg::BenchConfig, rows) + println("Running CPU benchmarks...") + + for (i, (nx, ny)) in enumerate(cfg.sizes) + rng = MersenneTwister(cfg.seed + i) + dims = "$(nx)x$(ny)" + + img1_f, img2_f = make_integer_pair(nx, ny, rng, cfg.noise_sigma) + cc = similar(img1_f) + cc_abs2_work = similar(cc, float(real(eltype(cc)))) + dftreg!(img1_f, img2_f, cc; cc_abs2_work=cc_abs2_work) # warmup + trial = run( + @benchmarkable dftreg!($img1_f, $img2_f, $cc; cc_abs2_work=$cc_abs2_work) samples=cfg.samples evals=cfg.evals + ) + push_summary!(rows, "cpu", "dftreg", dims, cfg, trial) + + img1_f_sub, img2_f_sub = make_subpixel_pair(nx, ny, rng, cfg.noise_sigma) + cc2x = zeros(eltype(img1_f_sub), 2 * nx, 2 * ny) + cc2x_abs2_work = similar(cc2x, float(real(eltype(cc2x)))) + dftreg_subpix!(img1_f_sub, img2_f_sub, cc2x; cc2x_abs2_work=cc2x_abs2_work) # warmup + trial = run( + @benchmarkable dftreg_subpix!($img1_f_sub, $img2_f_sub, $cc2x; cc2x_abs2_work=$cc2x_abs2_work) samples=cfg.samples evals=cfg.evals + ) + push_summary!(rows, "cpu", "dftreg_subpix", dims, cfg, trial) + end + + for (i, (nx, ny, nz)) in enumerate(cfg.stack_sizes) + rng = MersenneTwister(cfg.seed + 100 + i) + dims = "$(nx)x$(ny)x$(nz)" + stack_template = make_stack(nx, ny, nz, rng, cfg.noise_sigma) + stack_work = similar(stack_template) + img1_f = zeros(ComplexF32, nx, ny) + img2_f = zeros(ComplexF32, nx, ny) + cc2x = zeros(ComplexF32, 2 * nx, 2 * ny) + nbuf = zeros(Float32, nx, ny) + reg_param = Dict{Int,Any}() + + copyto!(stack_work, stack_template) + reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) # warmup + + trial = run( + @benchmarkable begin + copyto!($stack_work, $stack_template) + empty!($reg_param) + reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf; reg_param=$reg_param) + end samples=cfg.samples evals=1 + ) + push_summary!(rows, "cpu", "reg_stack_translate", dims, cfg, trial) + end +end + +function maybe_load_cuda() + Base.find_package("CUDA") === nothing && return nothing + try + @eval using CUDA + # `@eval using CUDA` inside a function can create a newer-world binding. + cuda = Base.invokelatest(() -> getfield(@__MODULE__, :CUDA)) + Base.invokelatest(cuda.functional) || return nothing + cuda + catch + nothing + end +end + +function run_cuda_benchmarks(cfg::BenchConfig, rows) + CUDA = maybe_load_cuda() + if CUDA === nothing + println("Skipping CUDA benchmarks (CUDA not installed or no functional CUDA device).") + return + end + println("Running CUDA benchmarks...") + Base.invokelatest(_run_cuda_benchmarks_loaded, CUDA, cfg, rows) +end + +function _run_cuda_benchmarks_loaded(CUDA, cfg::BenchConfig, rows) + for (i, (nx, ny)) in enumerate(cfg.sizes) + rng = MersenneTwister(cfg.seed + i) + dims = "$(nx)x$(ny)" + + img1_f_cpu, img2_f_cpu = make_integer_pair(nx, ny, rng, cfg.noise_sigma) + img1_f = CUDA.CuArray(img1_f_cpu) + img2_f = CUDA.CuArray(img2_f_cpu) + cc = similar(img1_f) + cc_abs2_work = similar(cc, float(real(eltype(cc)))) + + dftreg!(img1_f, img2_f, cc; cc_abs2_work=cc_abs2_work) # warmup + CUDA.synchronize() + trial = run( + @benchmarkable begin + dftreg!($img1_f, $img2_f, $cc; cc_abs2_work=$cc_abs2_work) + CUDA.synchronize() + end samples=cfg.samples evals=cfg.evals + ) + push_summary!(rows, "cuda", "dftreg", dims, cfg, trial) + + img1_f_sub_cpu, img2_f_sub_cpu = make_subpixel_pair(nx, ny, rng, cfg.noise_sigma) + img1_f_sub = CUDA.CuArray(img1_f_sub_cpu) + img2_f_sub = CUDA.CuArray(img2_f_sub_cpu) + cc2x = CUDA.zeros(eltype(img1_f_sub), 2 * nx, 2 * ny) + cc2x_abs2_work = similar(cc2x, float(real(eltype(cc2x)))) + + dftreg_subpix!(img1_f_sub, img2_f_sub, cc2x; cc2x_abs2_work=cc2x_abs2_work) # warmup + CUDA.synchronize() + trial = run( + @benchmarkable begin + dftreg_subpix!($img1_f_sub, $img2_f_sub, $cc2x; cc2x_abs2_work=$cc2x_abs2_work) + CUDA.synchronize() + end samples=cfg.samples evals=cfg.evals + ) + push_summary!(rows, "cuda", "dftreg_subpix", dims, cfg, trial) + end + + for (i, (nx, ny, nz)) in enumerate(cfg.stack_sizes) + rng = MersenneTwister(cfg.seed + 100 + i) + dims = "$(nx)x$(ny)x$(nz)" + stack_template = CUDA.CuArray(make_stack(nx, ny, nz, rng, cfg.noise_sigma)) + stack_work = similar(stack_template) + img1_f = CUDA.zeros(ComplexF32, nx, ny) + img2_f = CUDA.zeros(ComplexF32, nx, ny) + cc2x = CUDA.zeros(ComplexF32, 2 * nx, 2 * ny) + nbuf = CUDA.zeros(Float32, nx, ny) + reg_param = Dict{Int,Any}() + + copyto!(stack_work, stack_template) + reg_stack_translate!(stack_work, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) # warmup + CUDA.synchronize() + + trial = run( + @benchmarkable begin + copyto!($stack_work, $stack_template) + empty!($reg_param) + reg_stack_translate!($stack_work, $img1_f, $img2_f, $cc2x, $nbuf; reg_param=$reg_param) + CUDA.synchronize() + end samples=cfg.samples evals=1 + ) + push_summary!(rows, "cuda", "reg_stack_translate", dims, cfg, trial) + end +end + +function print_rows(rows) + println() + println("Benchmark results (times in ms):") + println(rpad("backend", 8), rpad("case", 20), rpad("dims", 14), lpad("median", 12), lpad("mean", 12), lpad("min", 12), lpad("memory", 12), lpad("allocs", 10)) + for r in rows + @printf( + "%-8s%-20s%-14s%12.3f%12.3f%12.3f%12d%10d\n", + r.backend, + r.case, + r.dims, + r.median_ms, + r.mean_ms, + r.min_ms, + r.memory_bytes, + r.allocs, + ) + end +end + +function write_csv(path::String, rows, cfg::BenchConfig) + mkpath(dirname(path)) + open(path, "w") do io + println(io, "backend,case,dims,median_ms,mean_ms,min_ms,memory_bytes,allocs,samples,evals,seed") + for r in rows + @printf( + io, + "%s,%s,%s,%.6f,%.6f,%.6f,%d,%d,%d,%d,%d\n", + r.backend, + r.case, + r.dims, + r.median_ms, + r.mean_ms, + r.min_ms, + r.memory_bytes, + r.allocs, + cfg.samples, + cfg.evals, + cfg.seed, + ) + end + end +end + +function main() + cfg = parse_args(ARGS) + rows = Vector{NamedTuple}() + + println("FFTRegGPU benchmark configuration:") + println(" backend: ", cfg.backend) + println(" samples: ", cfg.samples) + println(" evals: ", cfg.evals) + println(" seed: ", cfg.seed) + println(" sizes: ", join(["$(x)x$(y)" for (x, y) in cfg.sizes], ", ")) + println(" stack: ", join(["$(x)x$(y)x$(z)" for (x, y, z) in cfg.stack_sizes], ", ")) + println(" noise: ", cfg.noise_sigma) + + if cfg.backend in (:cpu, :both) + run_cpu_benchmarks(cfg, rows) + end + if cfg.backend in (:cuda, :both) + run_cuda_benchmarks(cfg, rows) + end + + print_rows(rows) + if cfg.output !== nothing + write_csv(cfg.output, rows, cfg) + println() + println("Wrote CSV: ", cfg.output) + end +end + +main() diff --git a/docs/img/before_after_xy_slice.png b/docs/img/before_after_xy_slice.png new file mode 100644 index 0000000..cf705e0 Binary files /dev/null and b/docs/img/before_after_xy_slice.png differ diff --git a/docs/img/before_after_xz_MIP.png b/docs/img/before_after_xz_MIP.png new file mode 100644 index 0000000..f309152 Binary files /dev/null and b/docs/img/before_after_xz_MIP.png differ diff --git a/ext/FFTRegGPUCPUExt.jl b/ext/FFTRegGPUCPUExt.jl new file mode 100644 index 0000000..23c8a7d --- /dev/null +++ b/ext/FFTRegGPUCPUExt.jl @@ -0,0 +1,27 @@ +""" + FFTRegGPUCPUExt + +CPU extension for FFTRegGPU backend hooks. + +When `FFTW` is loaded and inputs are `StridedArray`s, FFTRegGPU internal hooks +dispatch to FFTW and AbstractFFTs shift operations. +""" +module FFTRegGPUCPUExt + +using FFTRegGPU +using FFTW +using AbstractFFTs + +import FFTRegGPU: _fft, _ifft, _fftshift, _ifftshift +import FFTRegGPU: _ifft!, _plan_fft_inplace, _fft_inplace! + +_fft(inp::StridedArray) = FFTW.fft(inp) +_ifft(inp::StridedArray) = FFTW.ifft(inp) +_ifft!(inp::StridedArray{<:Complex}) = FFTW.ifft!(inp) +_plan_fft_inplace(inp::StridedArray{<:Complex}) = FFTW.plan_fft!(inp) +_fft_inplace!(inp::StridedArray{<:Complex}) = FFTW.fft!(inp) +_fft_inplace!(inp::StridedArray{<:Complex}, plan::FFTW.cFFTWPlan) = (plan * inp) +_fftshift(inp::StridedArray) = AbstractFFTs.fftshift(inp) +_ifftshift(inp::StridedArray) = AbstractFFTs.ifftshift(inp) + +end diff --git a/ext/FFTRegGPUCUDAExt.jl b/ext/FFTRegGPUCUDAExt.jl new file mode 100644 index 0000000..6e3aef8 --- /dev/null +++ b/ext/FFTRegGPUCUDAExt.jl @@ -0,0 +1,34 @@ +""" + FFTRegGPUCUDAExt + +CUDA extension for FFTRegGPU backend hooks. + +When `CUDA` is loaded and inputs are `CuArray` (or views over `CuArray`), FFT +and shift operations dispatch to CUFFT/CUDA implementations. +""" +module FFTRegGPUCUDAExt + +using FFTRegGPU +using CUDA + +import FFTRegGPU: _fft, _ifft, _ifft!, _fftshift, _ifftshift, _scalar_at +import FFTRegGPU: _plan_fft_inplace, _fft_inplace! + +_fft(inp::CUDA.CuArray) = CUDA.CUFFT.fft(inp) +_ifft(inp::CUDA.CuArray) = CUDA.CUFFT.ifft(inp) +_ifft!(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.ifft!(inp) +_plan_fft_inplace(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.plan_fft!(inp) +_fft_inplace!(inp::CUDA.CuArray{<:Complex}) = CUDA.CUFFT.fft!(inp) +_fft_inplace!(inp::CUDA.CuArray{<:Complex}, ::Nothing) = CUDA.CUFFT.fft!(inp) +_fft_inplace!(inp::CUDA.CuArray{<:Complex}, plan) = (plan * inp) +_fftshift(inp::CUDA.CuArray) = CUDA.CUFFT.fftshift(inp) +_ifftshift(inp::CUDA.CuArray) = CUDA.CUFFT.ifftshift(inp) +_scalar_at(inp::CUDA.CuArray, idx) = CUDA.@allowscalar inp[idx] + +_fft(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.fft(inp) +_ifft(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.ifft(inp) +_fftshift(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.fftshift(inp) +_ifftshift(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}) = CUDA.CUFFT.ifftshift(inp) +_scalar_at(inp::SubArray{<:Any,<:Any,<:CUDA.CuArray}, idx) = CUDA.@allowscalar inp[idx] + +end diff --git a/src/FFTRegGPU.jl b/src/FFTRegGPU.jl index 708b98a..1ce1a9a 100644 --- a/src/FFTRegGPU.jl +++ b/src/FFTRegGPU.jl @@ -1,11 +1,25 @@ +""" + FFTRegGPU + +Phase-correlation based translation registration for 2D images with array-type +driven backend dispatch (CPU via FFTW, GPU via CUDA extensions). + +The core API expects Fourier-domain images (`fft(img)`) and exposes: +- `dftreg!` for integer-pixel translation estimation, +- `dftreg_subpix!` for subpixel translation estimation, +- `subpix_shift!` and `dftreg_resample!` for Fourier-domain shifting/resampling, +- `reg_stack_translate!` for sequential z-stack registration. +""" module FFTRegGPU -using CUDA, AbstractFFTs +using AbstractFFTs include("dftreg_translate.jl") -export dftreg_gpu!, - subpix_shift_gpu!, - dftreg_resample_gpu!, - reg_stack_translate! +export dftreg!, + dftreg_subpix!, + subpix_shift!, + dftreg_resample, + dftreg_resample!, + reg_stack_translate! end # module diff --git a/src/dftreg_translate.jl b/src/dftreg_translate.jl index c37f285..65a18f6 100644 --- a/src/dftreg_translate.jl +++ b/src/dftreg_translate.jl @@ -1,64 +1,265 @@ -function dftups(inp::AbstractArray{T,N},no,usfac::Int=1,offset=zeros(N)) where {T,N} - sz = [size(inp)...] +""" + _fft(inp) + _ifft(inp) + _ifft!(inp) + _plan_fft_inplace(inp) + _fft_inplace!(inp) + _fft_inplace!(inp, plan) + _fftshift(inp) + _ifftshift(inp) + +Internal backend hooks used by FFTRegGPU algorithms. + +These methods default to `AbstractFFTs` behavior and are extended in +`ext/FFTRegGPUCPUExt.jl` and `ext/FFTRegGPUCUDAExt.jl` so that dispatch follows +the input array type. +""" +_fft(inp::AbstractArray) = fft(inp) +_ifft(inp::AbstractArray) = ifft(inp) +_ifft!(inp::AbstractArray) = copyto!(inp, _ifft(inp)) +_plan_fft_inplace(inp::AbstractArray{<:Complex}) = nothing +_fft_inplace!(inp::AbstractArray{<:Complex}) = copyto!(inp, _fft(inp)) +_fft_inplace!(inp::AbstractArray{<:Complex}, ::Nothing) = _fft_inplace!(inp) +_fft_inplace!(inp::AbstractArray{<:Complex}, plan) = _fft_inplace!(inp) +_fftshift(inp::AbstractArray) = fftshift(inp) +_ifftshift(inp::AbstractArray) = ifftshift(inp) + +""" + _scalar_at(inp, idx) + +Read a scalar element from `inp[idx]`. + +CUDA extensions overload this helper with `@allowscalar` so peak extraction can +remain explicit and localized. +""" +_scalar_at(inp::AbstractArray, idx) = inp[idx] + +""" + _backend_template(inp) + +Return a parent array/container that represents where allocations should happen +for `inp` (for example the underlying `CuArray` for wrapped views). +""" +_backend_template(inp::AbstractArray) = inp +_backend_template(inp::SubArray) = _backend_template(parent(inp)) +_backend_template(inp::Base.ReshapedArray) = _backend_template(parent(inp)) +_backend_template(inp::Base.PermutedDimsArray) = _backend_template(parent(inp)) + +""" + _to_backend(ref, src) + +Allocate a new array with the same backend/container family as `ref`, then copy +`src` into it. + +This is used to keep temporary arrays on the same device as the active +computation (for example CPU `Array` vs GPU `CuArray`). +""" +function _to_backend(ref::AbstractArray, src::AbstractArray) + tmpl = _backend_template(ref) + out = similar(tmpl, eltype(src), size(src)) + copyto!(out, src) + out +end + +""" + _findmax_abs2_loc(inp, [work]) + +Return the location of the maximum of `abs2.(inp)`. + +If `work` is provided, it must match `size(inp)` and is reused as a scratch +buffer to avoid allocating `abs2.(inp)`. +""" +function _findmax_abs2_loc(inp::AbstractArray{<:Complex}, work::Union{Nothing,AbstractArray{<:Real}}=nothing) + if work === nothing + _, loc = findmax(abs2.(inp)) + return loc + end + + size(work) == size(inp) || throw(DimensionMismatch("work must have the same size as input")) + @. work = abs2(inp) + _, loc = findmax(work) + loc +end + +""" + dftups(inp, no, usfac=1, offset=nothing) + +Compute an upsampled DFT region by matrix multiplication (Guizar-Sicairos style) +without zero-padding the full spectrum. + +`inp` is typically the Fourier-domain cross-power spectrum. `no` is the output +size per dimension, `usfac` is the upsampling factor, and `offset` optionally +sets the center offset (per-dimension) of the sampled DFT window. +""" +function dftups(inp::AbstractArray{T,N}, no::Integer, usfac::Int=1, offset=nothing) where {T<:Number,N} + no > 0 || throw(ArgumentError("no must be positive")) + usfac > 0 || throw(ArgumentError("usfac must be positive")) + + Treal = float(real(T)) + offset_vals = offset === nothing ? ntuple(_ -> zero(Treal), N) : ntuple(i -> Treal(offset[i]), N) + + sz = ntuple(i -> size(inp, i), N) permV = 1:N + out = inp for i in permV - inp = permutedims(inp,[i;deleteat!(collect(permV),i)]) - kern = exp.((-1im*2*pi/(sz[i]*usfac))*((0:(no-1)).-offset[i])*transpose(ifftshift(0:(sz[i]-1)).-floor(sz[i]/2))) - d = size(inp)[2:N] - inp = kern * reshape(inp, Val(2)) - inp = reshape(inp,(no,d...)) + out = permutedims(out, [i; deleteat!(collect(permV), i)]) + + row = Treal.(0:(no - 1)) .- offset_vals[i] + col = Treal.(ifftshift(0:(sz[i] - 1)) .- div(sz[i], 2)) + phase = -complex(zero(Treal), Treal(2 * pi / (sz[i] * usfac))) + kern_host = exp.(phase .* (row * transpose(col))) + kern = _to_backend(out, convert.(eltype(out), kern_host)) + + d = size(out)[2:N] + out = kern * reshape(out, Val(2)) + out = reshape(out, (no, d...)) end - permutedims(inp,collect(ndims(inp):-1:1)) + permutedims(out, collect(ndims(out):-1:1)) end -function dftreg_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuArray{Complex{Float32},2}, - CC_g::CuArray{Complex{Float32},2}) - L = length(img1_f_g) - CC_g .= CUFFT.ifft(img1_f_g .* CUDA.conj(img2_f_g)) - loc = argmax(abs.(CC_g)) - CCmax = Array(CC_g)[loc] - rfzero = sum(abs2, img1_f_g) / L - rgzero = sum(abs2, img2_f_g) / L - error = abs(1 - CCmax * conj(CCmax) / (rgzero * rfzero)) - diffphase = atan(imag(CCmax),real(CCmax)) - - indi = size(img1_f_g) - ind2 = tuple([div(x,2) for x in indi]...) - - locI = [Tuple(loc)...] - - shift = zeros(size(locI)) - for i = eachindex(locI) - if locI[i]>ind2[i] - shift[i]=locI[i]-indi[i]-1 - else shift[i]=locI[i]-1 +""" + dftreg!(img1_f, img2_f, CC; cc_abs2_work=nothing) -> (error, shift, diffphase) + +Estimate integer-pixel translation between two 2D images in the Fourier domain. + +`img1_f` and `img2_f` must be Fourier transforms of real images of equal size. +`CC` is a complex work/output buffer with the same size and is overwritten with +the inverse-FFT cross-correlation field. + +Returns: +- `error`: translation-invariant normalized RMS-like mismatch metric, +- `shift`: real-valued vector `(dx, dy)` to apply to `img2` to align to `img1`, +- `diffphase`: global phase difference (radians). +""" +function dftreg!( + img1_f::AbstractMatrix{T1}, + img2_f::AbstractMatrix{T2}, + CC::AbstractMatrix{TC}, + ; + cc_abs2_work::Union{Nothing,AbstractMatrix{<:Real}}=nothing, +) where {T1<:Complex,T2<:Complex,TC<:Complex} + size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size")) + size(img1_f) == size(CC) || throw(DimensionMismatch("CC must have the same size as img1_f")) + + Treal = float(real(promote_type(T1, T2, TC))) + L = Treal(length(img1_f)) + + @. CC = img1_f * conj(img2_f) + _ifft!(CC) + + loc = _findmax_abs2_loc(CC, cc_abs2_work) + CCmax = _scalar_at(CC, loc) + rfzero = sum(abs2, img1_f) / L + rgzero = sum(abs2, img2_f) / L + error = abs(one(Treal) - CCmax * conj(CCmax) / (rgzero * rfzero)) + diffphase = Treal(atan(imag(CCmax), real(CCmax))) + + indi = size(img1_f) + ind2 = ntuple(i -> div(indi[i], 2), length(indi)) + locI = Tuple(loc) + shift = zeros(Treal, length(locI)) + + for i in eachindex(locI) + if locI[i] > ind2[i] + shift[i] = locI[i] - indi[i] - 1 + else + shift[i] = locI[i] - 1 end end - + error, shift, diffphase end -function dftreg_subpix_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuArray{Complex{Float32},2}, - CC2x_g::CuArray{Complex{Float32},2}, up_fac::Int=10) - ## initial estimate by 2x upsample - # embed 2x fft - dim_input = collect(size(img1_f_g)) - ranges = [(x+1-div(x,2)):(x+1+div(x-1,2)) for x in dim_input] - CC2x_g .= 0 - CC2x_g[ranges...] .= CUFFT.fftshift(img1_f_g) .* CUFFT.conj(CUFFT.fftshift(img2_f_g)) +""" + _embed_crosspower_2x!(CC2x, img1_f, img2_f) -> CC2x + +Write the 2x zero-padded cross-power spectrum directly in native FFT order. + +This is equivalent to constructing `fftshift(img1_f) .* conj.(fftshift(img2_f))` +in the centered region of `CC2x` and then applying `ifftshift(CC2x)`, but avoids +materializing the shifted arrays. +""" +function _embed_crosspower_2x!( + CC2x::AbstractMatrix{TC}, + img1_f::AbstractMatrix{T1}, + img2_f::AbstractMatrix{T2}, +) where {TC<:Complex,T1<:Complex,T2<:Complex} + size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size")) + m, n = size(img1_f) + expected_cc_size = (2m, 2n) + size(CC2x) == expected_cc_size || + throw(DimensionMismatch("CC2x must have size $(expected_cc_size), got $(size(CC2x))")) + + fill!(CC2x, zero(TC)) + + h1, t1 = cld(m, 2), fld(m, 2) + h2, t2 = cld(n, 2), fld(n, 2) + + in1_h, in1_t = 1:h1, (h1 + 1):m + in2_h, in2_t = 1:h2, (h2 + 1):n + out1_h, out1_t = 1:h1, (2m - t1 + 1):(2m) + out2_h, out2_t = 1:h2, (2n - t2 + 1):(2n) + + @views @. CC2x[out1_h, out2_h] = img1_f[in1_h, in2_h] * conj(img2_f[in1_h, in2_h]) + + if t2 > 0 + @views @. CC2x[out1_h, out2_t] = img1_f[in1_h, in2_t] * conj(img2_f[in1_h, in2_t]) + end + if t1 > 0 + @views @. CC2x[out1_t, out2_h] = img1_f[in1_t, in2_h] * conj(img2_f[in1_t, in2_h]) + end + if t1 > 0 && t2 > 0 + @views @. CC2x[out1_t, out2_t] = img1_f[in1_t, in2_t] * conj(img2_f[in1_t, in2_t]) + end + + CC2x +end + +""" + dftreg_subpix!(img1_f, img2_f, CC2x, up_fac=10; cc2x_abs2_work=nothing) + -> (error, shift, diffphase) + +Estimate subpixel translation between two 2D Fourier-domain images. + +Algorithm: +1. Build a 2x upsampled cross-correlation grid in `CC2x` for a coarse estimate. +2. Optionally refine the peak with matrix-multiply DFT (`dftups`) at `up_fac`. + +`CC2x` must be size `(2*size(img1_f,1), 2*size(img1_f,2))`. Returned `shift` +is the translation to apply to `img2` so it aligns with `img1`. +""" +function dftreg_subpix!( + img1_f::AbstractMatrix{T1}, + img2_f::AbstractMatrix{T2}, + CC2x::AbstractMatrix{TC}, + up_fac::Int=10, + ; + cc2x_abs2_work::Union{Nothing,AbstractMatrix{<:Real}}=nothing, +) where {T1<:Complex,T2<:Complex,TC<:Complex} + size(img1_f) == size(img2_f) || throw(DimensionMismatch("img1_f and img2_f must have the same size")) + up_fac > 0 || throw(ArgumentError("up_fac must be positive")) + Treal = float(real(promote_type(T1, T2, TC))) + + # initial estimate by 2x upsample + dim_input = size(img1_f) + expected_cc_size = ntuple(i -> 2 * dim_input[i], length(dim_input)) + size(CC2x) == expected_cc_size || + throw(DimensionMismatch("CC2x must have size $(expected_cc_size), got $(size(CC2x))")) + + _embed_crosspower_2x!(CC2x, img1_f, img2_f) # compute cross-correlation and locate the peak - CC2x_g = CUFFT.ifft(CUFFT.ifftshift(CC2x_g)) - loc = argmax(abs.(CC2x_g)) + _ifft!(CC2x) + loc = _findmax_abs2_loc(CC2x, cc2x_abs2_work) - indi = size(CC2x_g) + indi = size(CC2x) locI = collect(Tuple(loc)) - CC2x_max = Array(CC2x_g)[loc] + CC2x_max = _scalar_at(CC2x, loc) # obtain shift in original pixel grid - ind2 = indi ./ 2 - shift = zeros(size(locI)) - for i = eachindex(locI) + ind2 = Treal.(indi) ./ Treal(2) + shift = zeros(Treal, length(locI)) + for i in eachindex(locI) if locI[i] > ind2[i] shift[i] = locI[i] - indi[i] - 1 else @@ -67,88 +268,210 @@ function dftreg_subpix_gpu!(img1_f_g::CuArray{Complex{Float32},2}, img2_f_g::CuA end shift = shift / 2 - ## refine subpixel estimation + # refine subpixel estimation if up_fac > 2 - # refine the estimate with matrix multiply DFT - shift = round.(Int, shift * up_fac) / up_fac # initial shift estimate - dft_shift = ceil(up_fac * 1.5) / 2 # center of output at dft_shift + 1 - - # mat multiplies dft around the current shift estimate - CC_refine = dftups(Array(img2_f_g .* CUFFT.conj(img1_f_g)), ceil(Int, up_fac * 1.5), - up_fac, dft_shift .- shift * up_fac) / (prod(ind2) * up_fac ^ 2) - - # locate max and map back to the original grid - loc = argmax(abs.(CC_refine)) - locI = Tuple(loc) - CC_refine_max = CC_refine[loc] - locI = locI .- dft_shift .- 1 - shift = shift .+ locI ./ up_fac - - img1_00 = dftups(Array(img1_f_g .* CUFFT.conj(img1_f_g)), 1, up_fac)[1] / (prod(ind2) * up_fac ^ 2) - img2_00 = dftups(Array(img2_f_g .* CUFFT.conj(img2_f_g)), 1, up_fac)[1] / (prod(ind2) * up_fac ^ 2) + up = Treal(up_fac) + shift = round.(Treal, shift .* up) ./ up # initial shift estimate + dft_shift = Treal(ceil(up_fac * 1.5) / 2) # center of output at dft_shift + 1 + denom = prod(ind2) * up ^ 2 + + CC_refine = dftups( + img2_f .* conj.(img1_f), + ceil(Int, up_fac * 1.5), + up_fac, + dft_shift .- shift .* up, + ) / denom + + loc = _findmax_abs2_loc(CC_refine) + locI_ref = Treal.(Tuple(loc)) + CC_refine_max = _scalar_at(CC_refine, loc) + shift = shift .+ (locI_ref .- dft_shift .- one(Treal)) ./ up + + img1_00 = _scalar_at(dftups(img1_f .* conj.(img1_f), 1, up_fac), 1) / denom + img2_00 = _scalar_at(dftups(img2_f .* conj.(img2_f), 1, up_fac), 1) / denom CC_max = CC_refine_max else - img1_00 = sum(img1_f_g .* CUFFT.conj(img1_f_g)) / prod(indi) - img2_00 = sum(img2_f_g .* CUFFT.conj(img2_f_g)) / prod(indi) + img1_00 = sum(img1_f .* conj.(img1_f)) / prod(indi) + img2_00 = sum(img2_f .* conj.(img2_f)) / prod(indi) CC_max = CC2x_max end - + error = 1 - CC_max * conj(CC_max) / (img1_00 * img2_00) - error = sqrt(abs.(error)) - diffphase = atan(imag(CC_max), real(CC_max)) - + error = sqrt(abs(error)) |> Treal + diffphase = Treal(atan(imag(CC_max), real(CC_max))) + error, shift, diffphase end -function subpix_shift_gpu!(img_f_g::CuArray{Complex{Float32},2}, N_g::CuArray{Float32,2}, shift, diffphase) - sz = [size(img_f_g)...] - N_ = Float32(0) - for i = eachindex(sz) - shifti = ifftshift((-div(sz[i],2)):(ceil(Integer,sz[i]/2)-1))*shift[i]/sz[i] - resh = (repeat([1],inner=[i-1])...,length(shifti)) - N_ = N_ .- Float32.(reshape(shifti,resh)) +""" + subpix_shift!(out, img_f, N, shift, diffphase) -> out + +Apply a subpixel translation and global phase correction directly in Fourier +space. + +`img_f` is the Fourier-domain moving image, `shift` is the translation vector, +`diffphase` is the global phase correction, and `N` is a real scratch matrix +matching `size(img_f)` used to accumulate the phase ramp. `out` is overwritten +with the shifted Fourier-domain image. +""" +function subpix_shift!( + out::AbstractMatrix{<:Complex}, + img_f::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}, + shift, + diffphase, +) + size(out) == size(img_f) || throw(DimensionMismatch("out must have the same size as img_f")) + size(img_f) == size(N) || throw(DimensionMismatch("N must have the same size as img_f")) + length(shift) == ndims(img_f) || throw(DimensionMismatch("shift must match image dimensionality")) + + sz = [size(img_f)...] + Tphase = float(real(eltype(out))) + TN = eltype(N) + fill!(N, zero(TN)) + + for i in eachindex(sz) + shifti = ifftshift((-div(sz[i], 2)):(ceil(Int, sz[i] / 2) - 1)) * shift[i] / sz[i] + resh = (ntuple(_ -> 1, i - 1)..., length(shifti)) + shifti_backend = _to_backend(N, reshape(TN.(shifti), resh)) + N .-= shifti_backend end - - copyto!(N_g, N_) - exp(1im * diffphase) .* (img_f_g .* exp.(Complex{Float32}(2im * pi) * N_g)) + + phase = cis(Tphase(diffphase)) + twopi_im = complex(zero(Tphase), Tphase(2 * pi)) + @. out = phase * (img_f * exp(twopi_im * N)) + out end -function dftreg_resample_gpu!(img_f_g::CuArray{Complex{Float32}, 2}, N_g::CuArray{Float32,2}, shift, diffphase) - real(CUFFT.ifft(subpix_shift_gpu!(img_f_g, N_g, shift, diffphase))) +""" + subpix_shift!(img_f, N, shift, diffphase) + +Allocation convenience method for [`subpix_shift!`](@ref) that returns a newly +allocated Fourier-domain output array. +""" +function subpix_shift!( + img_f::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}, + shift, + diffphase, +) + out = similar(img_f) + subpix_shift!(out, img_f, N, shift, diffphase) end -function reg_stack_translate!(img_stack_reg_g::CuArray{Float32,3}, img1_f_g::CuArray{Complex{Float32},2}, - img2_f_g::CuArray{Complex{Float32},2}, CC2x_g::CuArray{Complex{Float32},2}, - N_g::CuArray{Float32,2}; reg_param::Dict) - size_x, size_y, size_z = size(img_stack_reg_g) - # reset arrays - CC2x_g .= 0 - N_g .= 0 - - # register +""" + dftreg_resample!(out, img_f, N, shift, diffphase; work_f=similar(img_f)) -> out + +Resample a Fourier-domain moving image into real-space after applying +translation/phase correction. + +This combines [`subpix_shift!`](@ref) with inverse FFT and writes the real part +to `out`. `work_f` is a complex scratch/output buffer for the shifted spectrum. +""" +function dftreg_resample!( + out::AbstractMatrix{<:Real}, + img_f::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}, + shift, + diffphase, + ; + work_f::AbstractMatrix{<:Complex}=similar(img_f), +) + size(out) == size(img_f) || throw(DimensionMismatch("out must have the same size as img_f")) + size(work_f) == size(img_f) || throw(DimensionMismatch("work_f must have the same size as img_f")) + + subpix_shift!(work_f, img_f, N, shift, diffphase) + _ifft!(work_f) + out .= real.(work_f) + out +end + +""" + dftreg_resample(img_f, N, shift, diffphase; work_f=similar(img_f)) + +Allocation convenience method for [`dftreg_resample!`](@ref) that returns a new +real-valued registered image. +""" +function dftreg_resample( + img_f::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}, + shift, + diffphase, + ; + work_f::AbstractMatrix{<:Complex}=similar(img_f), +) + out = similar(N, float(real(eltype(img_f))), size(img_f)) + dftreg_resample!(out, img_f, N, shift, diffphase; work_f=work_f) +end + +""" + dftreg_resample!(img_f, N, shift, diffphase) + +Legacy convenience alias equivalent to [`dftreg_resample`](@ref). +""" +function dftreg_resample!( + img_f::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}, + shift, + diffphase, +) + dftreg_resample(img_f, N, shift, diffphase) +end + +""" + reg_stack_translate!(img_stack_reg, img1_f, img2_f, CC2x, N; reg_param=Dict()) + +Register a 3D stack (`x, y, z`) by translating each plane `z` to align with the +previous registered plane `z-1`. + +The operation is in-place on `img_stack_reg`. `img1_f`, `img2_f`, `CC2x`, and +`N` are reusable work buffers: +- `img1_f`, `img2_f`: Fourier-domain work images (`size == size(img_stack_reg[:,:,1])`) +- `CC2x`: coarse cross-correlation buffer (2x in each planar dimension) +- `N`: real phase-ramp scratch buffer + +Forward FFTs are executed in-place on `img1_f`/`img2_f`; when supported by the +active backend (for example FFTW), plans are reused across all z-planes. + +If `reg_param[z]` exists, it must contain `(error, shift, diffphase)` and is +reused instead of recomputing registration for plane `z`. Otherwise the tuple is +computed and stored. +""" +function reg_stack_translate!( + img_stack_reg::AbstractArray{<:Real,3}, + img1_f::AbstractMatrix{<:Complex}, + img2_f::AbstractMatrix{<:Complex}, + CC2x::AbstractMatrix{<:Complex}, + N::AbstractMatrix{<:Real}; + reg_param::AbstractDict{<:Integer}=Dict{Int,Any}(), +) + _, _, size_z = size(img_stack_reg) + CC2x .= zero(eltype(CC2x)) + N .= zero(eltype(N)) + cc2x_abs2_work = similar(CC2x, float(real(eltype(CC2x)))) + fft_plan1 = _plan_fft_inplace(img1_f) + fft_plan2 = _plan_fft_inplace(img2_f) + for z = 2:size_z z1, z2 = z - 1, z - # copy data and fft - img1_g = view(img_stack_reg_g, :,:,z1) - img2_g = view(img_stack_reg_g, :,:,z2) - img1_f_g .= CUFFT.fft(img1_g) - img2_f_g .= CUFFT.fft(img2_g) + img1 = view(img_stack_reg, :, :, z1) + img2 = view(img_stack_reg, :, :, z2) + img1_f .= img1 + _fft_inplace!(img1_f, fft_plan1) + img2_f .= img2 + _fft_inplace!(img2_f, fft_plan2) - # register if !haskey(reg_param, z) - error, shift, diffphase = dftreg_subpix_gpu!(img1_f_g, img2_f_g, CC2x_g) + error, shift, diffphase = dftreg_subpix!(img1_f, img2_f, CC2x; cc2x_abs2_work=cc2x_abs2_work) reg_param[z] = (error, shift, diffphase) else error, shift, diffphase = reg_param[z] end - - # resample - img_stack_reg_g[:,:,z] .= dftreg_resample_gpu!(img2_f_g, N_g, shift, diffphase) - - # reset arrays - CC2x_g .= 0 - N_g .= 0 + + dftreg_resample!(view(img_stack_reg, :, :, z), img2_f, N, shift, diffphase; work_f=img2_f) + CC2x .= zero(eltype(CC2x)) + N .= zero(eltype(N)) end - + nothing -end \ No newline at end of file +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..88a5505 --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,6 @@ +using Test +using FFTW +using FFTRegGPU + +include("unit_tests.jl") +include("synthetic_stack_tests.jl") diff --git a/test/synthetic_stack_tests.jl b/test/synthetic_stack_tests.jl new file mode 100644 index 0000000..cccdd33 --- /dev/null +++ b/test/synthetic_stack_tests.jl @@ -0,0 +1,71 @@ +using Random +using Test + +function make_blob_image(nx::Int, ny::Int, rng::AbstractRNG; nspots::Int=10, sigma::Float32=2.8f0) + x = reshape(Float32.(0:(nx - 1)), nx, 1) + y = reshape(Float32.(0:(ny - 1)), 1, ny) + img = zeros(Float32, nx, ny) + + two_sigma2 = 2f0 * sigma^2 + for _ in 1:nspots + cx = rand(rng, Float32) * Float32(nx - 1) + cy = rand(rng, Float32) * Float32(ny - 1) + amp = 0.2f0 + 0.8f0 * rand(rng, Float32) + img .+= amp .* exp.(-((x .- cx).^2 .+ (y .- cy).^2) ./ two_sigma2) + end + + img ./= maximum(img) + img +end + +function stack_adjacent_mse(stack::AbstractArray{<:Real,3}) + _, _, nz = size(stack) + nz <= 1 && return 0.0 + acc = 0.0 + for z in 2:nz + a = @view stack[:, :, z - 1] + b = @view stack[:, :, z] + acc += sum((a .- b) .^ 2) / length(a) + end + acc / (nz - 1) +end + +@testset "synthetic noisy stack registration" begin + rng = MersenneTwister(20260313) + nx, ny = 64, 64 + known_rel = [(1, -1), (2, 1), (-1, 2), (0, -2), (1, 1)] + nz = length(known_rel) + 1 + + base = make_blob_image(nx, ny, rng) + clean_stack = zeros(Float32, nx, ny, nz) + clean_stack[:, :, 1] .= base + for z in 2:nz + clean_stack[:, :, z] .= circshift(@view(clean_stack[:, :, z - 1]), known_rel[z - 1]) + end + + noise_sigma = 0.03f0 + noisy_stack = clean_stack .+ noise_sigma .* randn(rng, Float32, nx, ny, nz) + reg_stack = copy(noisy_stack) + + img1_f = zeros(ComplexF32, nx, ny) + img2_f = zeros(ComplexF32, nx, ny) + cc2x = zeros(ComplexF32, 2 * nx, 2 * ny) + nbuf = zeros(Float32, nx, ny) + reg_param = Dict{Int,Any}() + + mse_before = stack_adjacent_mse(noisy_stack) + reg_stack_translate!(reg_stack, img1_f, img2_f, cc2x, nbuf; reg_param=reg_param) + mse_after = stack_adjacent_mse(reg_stack) + + @test length(reg_param) == nz - 1 + cum_shift = zeros(Float32, 2) + for z in 2:nz + cum_shift .+= Float32.(collect(known_rel[z - 1])) + _, shift, _ = reg_param[z] + expected = -cum_shift + @test length(shift) == 2 + @test all(isapprox.(shift, expected; atol=0.35f0)) + end + + @test mse_after < 0.75 * mse_before +end diff --git a/test/unit_tests.jl b/test/unit_tests.jl new file mode 100644 index 0000000..76bc75a --- /dev/null +++ b/test/unit_tests.jl @@ -0,0 +1,92 @@ +using Random +using Test + +@testset "FFTRegGPU unit tests" begin + rng = MersenneTwister(42) + nx, ny = 32, 24 + + img = rand(rng, Float32, nx, ny) + img_f = fft(img) + + @testset "argument validation" begin + bad_cc = zeros(ComplexF32, nx + 1, ny) + @test_throws DimensionMismatch dftreg!(img_f, img_f, bad_cc) + + cc2x = zeros(ComplexF32, 2 * nx, 2 * ny) + @test_throws ArgumentError dftreg_subpix!(img_f, img_f, cc2x, 0) + + bad_cc2x = zeros(ComplexF32, 2 * nx + 1, 2 * ny) + @test_throws DimensionMismatch dftreg_subpix!(img_f, img_f, bad_cc2x, 10) + + @test_throws ArgumentError FFTRegGPU.dftups(img_f, 0, 1) + @test_throws ArgumentError FFTRegGPU.dftups(img_f, 4, 0) + + out_bad = zeros(Float32, nx + 1, ny) + nbuf = zeros(Float32, nx, ny) + @test_throws DimensionMismatch dftreg_resample!(out_bad, img_f, nbuf, (0f0, 0f0), 0f0) + end + + @testset "identity registration" begin + cc = similar(img_f) + err, shift, phase = dftreg!(img_f, img_f, cc) + + @test err ≤ 1f-5 + @test isapprox(phase, 0f0; atol=1f-5) + @test eltype(shift) == Float32 + @test all(isapprox.(shift, zero(eltype(shift)); atol=1f-5)) + end + + @testset "integer shift recovery" begin + shifted = circshift(img, (3, -2)) + shifted_f = fft(shifted) + cc = similar(img_f) + _, shift, _ = dftreg!(img_f, shifted_f, cc) + @test all(isapprox.(shift, Float32[-3, 2]; atol=1f-5)) + end + + @testset "resampling API" begin + shift = Float32[2.0, -1.0] + phase = 0f0 + nbuf1 = zeros(Float32, nx, ny) + nbuf2 = zeros(Float32, nx, ny) + + moved = circshift(img, (2, -1)) + moved_f = fft(moved) + + alloc_out = dftreg_resample(moved_f, nbuf1, shift, phase) + inpl_out = similar(img) + work_f = similar(moved_f) + ret = dftreg_resample!(inpl_out, moved_f, nbuf2, shift, phase; work_f=work_f) + + @test ret === inpl_out + @test alloc_out ≈ inpl_out atol = 1f-5 + + phase_shifted = similar(moved_f) + ret2 = subpix_shift!(phase_shifted, moved_f, nbuf2, (0f0, 0f0), 0f0) + @test ret2 === phase_shifted + @test phase_shifted ≈ moved_f atol = 1f-6 + end + + @testset "2x cross-power embedding matches shift pipeline" begin + for (mx, my) in ((6, 4), (5, 7)) + a = rand(rng, Float32, mx, my) + b = rand(rng, Float32, mx, my) + a_f = fft(a) + b_f = fft(b) + + cc_new = zeros(ComplexF32, 2 * mx, 2 * my) + cc_ref = similar(cc_new) + + FFTRegGPU._embed_crosspower_2x!(cc_new, a_f, b_f) + + ranges = [(x + 1 - div(x, 2)):(x + 1 + div(x - 1, 2)) for x in size(a_f)] + cc_ref .= 0 + a_shift = FFTRegGPU._fftshift(a_f) + b_shift = FFTRegGPU._fftshift(b_f) + @views @. cc_ref[ranges...] = a_shift * conj(b_shift) + copyto!(cc_ref, FFTRegGPU._ifftshift(cc_ref)) + + @test cc_new == cc_ref + end + end +end