diff --git a/Project.toml b/Project.toml index 9f77054..e1e4c02 100644 --- a/Project.toml +++ b/Project.toml @@ -9,9 +9,11 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" MatrixDepot = "b51810bb-c9f3-55da-ae3c-350fc1fbce05" OpenAI = "e9f21f70-7185-4079-aca2-91159181367c" @@ -21,15 +23,20 @@ ScikitLearn = "3646fa90-6ef7-5e7e-9f22-8aca16db6324" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[sources] +Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"} + [compat] +BSON = "0.3" BenchmarkTools = "1" CSV = "0.10" -BSON = "0.3" CUDA = "5.9.2" CairoMakie = "0.15" -DecisionTree = "0.12" +Dagger = "0.19.2" DataFrames = "1" +DecisionTree = "0.12" LinearAlgebra = "1.12.0" +Logging = "1.11.0" MKL = "0.9" MatrixDepot = "1.0.13" OpenAI = "0.12.0" diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark1.jl b/examples/agentic/generate-cpu-linear-solver/benchmark1.jl new file mode 100644 index 0000000..ad81cf1 --- /dev/null +++ b/examples/agentic/generate-cpu-linear-solver/benchmark1.jl @@ -0,0 +1,134 @@ +using LinearAlgebra +using SparseArrays +using CUDA +using BenchmarkTools +using OrderedCollections +using Plots + +println("CPU benchmark with error-vs-time plot:\n") + +include("solver.jl") + +# Configuration +N = 15_000 +sparsity_levels = [0.1, 0.5, 0.9] + +umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix +#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values +umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps) + +solvers = OrderedDict( + "UMFPACK (Baseline Direct Solve)" => (A, b) -> (A \ b), + "UMFPACK + Iterative Refinement" => (A, b) -> lu(A; control = umfpack_control) \ b, + "SmartSolve-Generated Solver" => (A, b) -> proposed_fn(A, b) +) + +# Store results for plotting +results = Dict() + +for sparsity in sparsity_levels + println("\n=== Sparsity: $sparsity ===") + + # Generate problem + A = sprand(N, N, sparsity) + b = rand(N) + + results[sparsity] = Dict() + + for (solver_name, solver_fn) in solvers + println(" $solver_name...") + + # Warm-up + b_warm = copy(b) + try + x_warm = solver_fn(A, b_warm) + catch e + println(" Warning: solver failed during warm-up: $e") + continue + end + + # Benchmark + b_bench = copy(b) + try + bench = @benchmark begin + x = $(solver_fn)($A, $b_bench) + end + + time_ms = median(bench.times) / 1e9 # Convert to s + + # Compute error + b_err = copy(b) + x_sol = solver_fn(A, b_err) + error = norm(A*x_sol - b_err) / norm(b_err) + + results[sparsity][solver_name] = (time=time_ms, error=error) + println(" Time: $(round(time_ms, digits=3)) s, Error: $(round(error, sigdigits=3))") + catch e + println(" Error during benchmark: $e") + end + end + + GC.gc() +end + +# Create error-vs-time plot +p = plot( + size=(800, 800), + #legend=:topright, + legend=:topleft, + xlabel="Time (s)", + ylabel="Relative residual: ||Ax - b||₂ / ||b||₂", + # xscale=:log10, + yscale=:log10, + guidefontsize=22,#18, + tickfontsize=20, #16, + legendfontsize=14, #14, + margin=5*Plots.mm, + framestyle=:box, + title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n CPU Solvers", + titlefontsize=22, +) + +# Symbols encode sparsity levels; colors encode solvers. +# Define marker for each sparsity and a color for each solver. +## Sparsity shapes +marker_map_sparsity = OrderedDict(0.1=>:circle, 0.5=>:square, 0.9=>:utriangle) +## Solver color shades +colors = [:red, :blue, :green] + +# Plot each point individually so marker shape shows sparsity and color shows solver. +for (i, solver_name) in enumerate(keys(solvers)) + for sparsity in sparsity_levels + if sparsity in keys(results) && solver_name in keys(results[sparsity]) + t = results[sparsity][solver_name].time + e = results[sparsity][solver_name].error + scatter!(p, [t], [e]; + label="", + marker=marker_map_sparsity[sparsity], + markersize=15, + color=colors[i], + markerstrokecolor=:black, + markerstrokewidth=0.0,#0.8, + alpha=0.45) + end + end +end + +# Create a combined legend +for (i, solver_name) in enumerate(keys(solvers)) + for s in sparsity_levels + lbl = "$(solver_name), ρ:$(s)" + scatter!(p, [NaN], [NaN]; label=lbl, + marker=marker_map_sparsity[s], + markersize=15, + color=colors[i], + markerstrokecolor=:black, + markerstrokewidth=0.0, + alpha=0.45) + end +end + +savefig(p, "error_vs_time.pdf") +println("\n✓ Plot saved as error_vs_time.pdf") + +display(p) \ No newline at end of file diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark2.jl b/examples/agentic/generate-cpu-linear-solver/benchmark2.jl new file mode 100644 index 0000000..88c2aa2 --- /dev/null +++ b/examples/agentic/generate-cpu-linear-solver/benchmark2.jl @@ -0,0 +1,37 @@ +using LinearAlgebra +using SparseArrays +using BenchmarkTools + +include("solver.jl") + +N = 15_000 +A = sprand(N, N, 0.5) +b = rand(N) + +# See https://docs.julialang.org/en/v1/stdlib/LinearAlgebra/#LinearAlgebra.lu +umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix +#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values +umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps) + +# warm-up +x1 = A \ b # solve Ax = b +x2 = lu(A; control = umfpack_control) \ b # solve Ax = b, including UMFPACK iterative refinement +x3 = proposed_fn(A, b) + +@benchmark begin + x = $A \ $b # solve Ax = b +end + +@benchmark begin + x = lu($A; control = $umfpack_control) \ $b # solve Ax = b, including UMFPACK iterative refinement +end + +@benchmark begin + x = proposed_fn($A, $b) +end + +error1 = norm(A*x1 - b) / norm(b) + +error2 = norm(A*x2 - b) / norm(b) + +error3 = norm(A*x3 - b) / norm(b) \ No newline at end of file diff --git a/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf b/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf new file mode 100644 index 0000000..0d33a22 Binary files /dev/null and b/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf differ diff --git a/examples/agentic/generate-cpu-linear-solver/solver.jl b/examples/agentic/generate-cpu-linear-solver/solver.jl index f9b8a54..05fc6ca 100644 --- a/examples/agentic/generate-cpu-linear-solver/solver.jl +++ b/examples/agentic/generate-cpu-linear-solver/solver.jl @@ -1,65 +1,44 @@ -function proposed_fn(A::SparseMatrixCSC, b::AbstractVector) - @assert size(A,1) == size(A,2) "A must be square" - n = length(b) - @assert size(A,2) == n "Dimensions of A and b must agree" - - niters = 4 - - # Convert sparse matrix to dense double for accurate residual computation - # and to dense single for fast factorization/solves with multithreaded BLAS. - Ad64 = Array(A) # dense Float64 - Ad32 = Array{Float32}(undef, n, n) - @inbounds for j in 1:n - for i in 1:n - Ad32[i,j] = Float32(Ad64[i,j]) - end +function proposed_fn(A, b) + # Cache Float32 factorizations and work buffers per matrix identity + if !isdefined(@__MODULE__, :LU32_CACHE) + global LU32_CACHE = IdDict{UInt64, Tuple{Any, Vector{Float64}, Vector{Float32}, Vector{Float32}}}() end - # Convert rhs to Float32 once - b32 = Vector{Float32}(undef, n) - @inbounds @simd for i in 1:n - b32[i] = Float32(b[i]) - end - - # Factorize dense single-precision matrix (uses LAPACK/BLAS and is multithreaded) - F32 = lu(Ad32) - - # Initial solve in single precision, in-place if possible - x32 = copy(b32) - try - LinearAlgebra.ldiv!(F32, x32) # in-place: x32 <- Ad32 \ b32 - catch - x32 = F32 \ b32 # fallback + # Ensure b as Float64 vector (avoid copy if already Float64) + b64 = eltype(b) === Float64 ? b : Vector{Float64}(b) + n = length(b64) + + key = objectid(A) + F32, r64, work32, bf32 = get!(LU32_CACHE, key) do + # Build a single-precision copy of the numeric values (structure reuse) + nz32 = Float32.(A.nzval) + Af = SparseMatrixCSC{Float32, Int}(size(A,1), size(A,2), + copy(A.colptr), copy(A.rowval), + nz32) + F32_local = lu(Af) # single-precision sparse LU + r64_local = Vector{Float64}(undef, n) # residual buffer (double) + work32_local = Vector{Float32}(undef, n) # temp residual in single + bf32_local = Vector{Float32}(undef, n) # temp right-hand side in single + return (F32_local, r64_local, work32_local, bf32_local) end - # Promote to double precision for accumulation and residual computation - x = Vector{Float64}(undef, n) - @inbounds @simd for i in 1:n - x[i] = Float64(x32[i]) + # Initial solve in single precision, accumulate in double + @inbounds for i = 1:n + bf32[i] = Float32(b64[i]) end - - # Preallocate working vectors - r = similar(b) # Float64 residual - r32 = Vector{Float32}(undef, n) # single-precision correction (in-place) - - for iter in 1:niters - # r = b - Ad64 * x (use BLAS for dense matvec) - mul!(r, Ad64, x) # r = Ad64 * x - @inbounds @simd for i in 1:n - r[i] = b[i] - r[i] - r32[i] = Float32(r[i]) + xf32 = F32 \ bf32 + x = Float64.(xf32) + + # Iterative refinement: compute residual in double, solve correction in single, update double solution + for _ = 1:5 + mul!(r64, A, x) # r64 = A * x (double) + @inbounds for i = 1:n + r64[i] = b64[i] - r64[i] # r64 = b - A*x + work32[i] = Float32(r64[i]) # convert residual to single end - - # Solve correction in single precision using the LU factorization - try - LinearAlgebra.ldiv!(F32, r32) # r32 <- Ad32 \ r32 (in-place) - catch - r32 = F32 \ r32 # fallback - end - - # Update double-precision solution - @inbounds @simd for i in 1:n - x[i] += Float64(r32[i]) + d32 = F32 \ work32 + @inbounds for i = 1:n + x[i] += Float64(d32[i]) # update solution in double end end diff --git a/examples/agentic/generate-dagger-linear-solver/Project.toml b/examples/agentic/generate-dagger-linear-solver/Project.toml new file mode 100644 index 0000000..8362c6c --- /dev/null +++ b/examples/agentic/generate-dagger-linear-solver/Project.toml @@ -0,0 +1,12 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +SmartSolve = "4fbb3a3c-2fa1-4c19-8d57-bae8bc1e16ac" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[sources] +Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"} +SmartSolve = {path = "../../.."} diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark.jl b/examples/agentic/generate-dagger-linear-solver/benchmark.jl similarity index 76% rename from examples/agentic/generate-cpu-linear-solver/benchmark.jl rename to examples/agentic/generate-dagger-linear-solver/benchmark.jl index db7d835..9ed3a2d 100644 --- a/examples/agentic/generate-cpu-linear-solver/benchmark.jl +++ b/examples/agentic/generate-dagger-linear-solver/benchmark.jl @@ -5,22 +5,21 @@ using BenchmarkTools using OrderedCollections using Plots -println("CPU benchmark with error-vs-time plot:\n") +println("GPU benchmark with error-vs-time plot:\n") include("solver.jl") # Configuration N = 15_000 sparsity_levels = [0.1, 0.5, 0.9] - -umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix -#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values -umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps) - solvers = OrderedDict( - "Default" => (A, b) -> (A \ b), - "UMFPACK" => (A, b) -> lu(A; control = umfpack_control) \ b, - "Generated" => (A, b) -> proposed_fn(A, b) + "Default" => (Ad, bd) -> (Ad \ bd), + "gesv!" => (Ad, bd) -> begin + x = CuArray(zeros(size(Ad, 1))) + CUDA.CUSOLVER.gesv!(x, Ad, bd) + x + end, + "Generated" => (Ad, bd) -> proposed_fn(Ad, bd) ) # Store results for plotting @@ -32,6 +31,8 @@ for sparsity in sparsity_levels # Generate problem A = sprand(N, N, sparsity) b = rand(N) + Ad = CuArray(Matrix(A)) + bd = CuArray(b) results[sparsity] = Dict() @@ -39,27 +40,30 @@ for sparsity in sparsity_levels println(" $solver_name...") # Warm-up - b_warm = copy(b) + bd_warm = CuArray(copy(b)) try - x_warm = solver_fn(A, b_warm) + x_warm = solver_fn(Ad, bd_warm) + CUDA.synchronize() catch e println(" Warning: solver failed during warm-up: $e") continue end # Benchmark - b_bench = copy(b) + bd_bench = CuArray(copy(b)) try bench = @benchmark begin - x = $(solver_fn)($A, $b_bench) + x = $(solver_fn)($Ad, $bd_bench) + CUDA.synchronize() end seconds = 5 samples = 10 time_ms = median(bench.times) / 1e9 # Convert to s # Compute error - b_err = copy(b) - x_sol = solver_fn(A, b_err) - error = norm(A*x_sol - b_err) / norm(b_err) + bd_err = CuArray(copy(b)) + x_sol = solver_fn(Ad, bd_err) + CUDA.synchronize() + error = norm(Ad*x_sol - bd_err) / norm(bd_err) results[sparsity][solver_name] = (time=time_ms, error=error) println(" Time: $(round(time_ms, digits=3)) s, Error: $(round(error, sigdigits=3))") @@ -73,7 +77,7 @@ end p = plot( size=(800, 800), #legend=:topright, - legend=:bottomleft, + legend=:bottomright, xlabel="Time (s)", ylabel="Relative residual: ||Ax - b||₂ / ||b||₂", # xscale=:log10, @@ -83,7 +87,7 @@ p = plot( legendfontsize=18, #14, margin=5*Plots.mm, framestyle=:box, - title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n CPU Solvers", + title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n GPU Solvers", titlefontsize=22, ) @@ -92,7 +96,7 @@ p = plot( ## Sparsity shapes marker_map_sparsity = OrderedDict(0.1=>:circle, 0.5=>:square, 0.9=>:utriangle) ## Solver color shades -color_map_solver = OrderedDict("Default"=>:red, "UMFPACK"=>:blue, "Generated"=>:green) +color_map_solver = OrderedDict("Default"=>:red, "gesv!"=>:blue, "Generated"=>:green) # Plot each point individually so marker shape shows sparsity and color shows solver. for solver_name in keys(solvers) diff --git a/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl b/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl new file mode 100644 index 0000000..0e1bc2d --- /dev/null +++ b/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl @@ -0,0 +1,75 @@ +using LinearAlgebra, CUDA, Dagger, BenchmarkTools + +# SPD Matrix +N = 10_000 +A = randn(N, N) +A = A*A' + N*I + +# Right-hand side +b = randn(N) + + +# Cholesky: CUDA ############################################################# + +# SPD Matrix and right-hand side on GPU (CUDA) +A_cuda = CUDA.CuArray(A) +b_cuda = CUDA.CuArray(b) + +# Warm-up +x_cuda = cholesky(A_cuda) \ b_cuda + +# Benchmark time and memory +@time cholesky(A_cuda) # 0.895192 seconds (660 allocations: 12.094 KiB) +#@benchmark cholesky($A_cuda) +@time cholesky(A_cuda) \ b_cuda # 0.882263 seconds (953 allocations: 16.844 KiB) +#@benchmark cholesky($A_cuda) \ $b_cuda + +# Relative error +e_cuda = norm(A_cuda*x_cuda - b_cuda)/norm(b_cuda) + +# Free memory +A_cuda = nothing +b_cuda = nothing +GC.gc() +CUDA.reclaim() + + +# Cholesky: Dagger ########################################################### + +# SPD Matrix and right-hand side on GPU (Dagger Distributed) +A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + distribute(A, Blocks(N÷4, N÷4)) +end +b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + distribute(b, Blocks(N÷4)) +end + +# Warm-up +x_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + cholesky(A_d) \ b_d +end + +CUDA.reclaim() + +# Benchmark time and memory +@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + cholesky(A_d) +end # 1.039517 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation) +@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + cholesky(A_d) \ b_d +end # 1.046506 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation) +# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do +# cholesky($A_d) +# end samples = 1 +# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do +# cholesky($A_d) \ $b_d +# end samples = 1 + +# Relative error +e_d = norm(A_d*x_d - b_d)/norm(b_d) + +# Free memory +A_d = nothing +b_d = nothing +GC.gc() + diff --git a/examples/agentic/generate-dagger-linear-solver/generate.jl b/examples/agentic/generate-dagger-linear-solver/generate.jl new file mode 100644 index 0000000..e3e3709 --- /dev/null +++ b/examples/agentic/generate-dagger-linear-solver/generate.jl @@ -0,0 +1,102 @@ +using SmartSolve +using LinearAlgebra +using SparseArrays +using CUDA +using BenchmarkTools +using Dagger + +prompt = """ +- Task: Write a high-performance Julia implementation of a linear solver for sparse matrices, using Dagger.jl on the GPU. The solver must be based on Cholesky factorization with iterative refinement. + +- Requirements + +1) Libraries and references + +1.1) Use Dagger.jl as documented here: +https://juliaparallel.org/Dagger.jl/dev/ + +1.2) Follow the iterative refinement algorithm described here: +https://nhigham.com/2023/03/13/what-is-iterative-refinement/ + +2) Dagger.jl + Cholesky + +2.1) Dagger.jl already has an Cholesky routine that can be used for distributed linear solves: cholesky(A_d) where A_d is a distributed matrix (DMatrix). This implementation extends cholesky from LinearAlgebra.jl. See https://github.com/JuliaParallel/Dagger.jl/blob/master/src/array/cholesky.jl. + +2.2) Use that Cholesky implementation within Dagger.jl (do not re-implement Cholesky from scratch). + +2.3) The computation must be fully on GPU, using Dagger's GPU support. + +2.4) Do not move data back to the CPU for intermediate computations. + +2.5) All linear algebra operations (factorization, forward/back substitution, residual computation, refinement updates) must be performed on GPU-resident Dagger arrays. + +3) Function API + +3.1) Implement exactly one Julia function with the following signature: +function proposed_fn(A_d, b_d) + # your code here +end +A_d: distributed sparse matrix (Dagger-distributed, GPU-resident). +b_d: distributed vector (Dagger-distributed, GPU-resident). +x: solution vector (Dagger-distributed, GPU-resident). You may treat x as an initial guess and overwrite it with the final refined solution. + +3.2) The function must return the final solution x (and anything else you consider useful, e.g., a residual norm, but the first return value must be the solution). + +4)Iterative refinement details + +4.1) Use Cholesky factorization of A_d to compute an initial solution x₀. + +4.2) Then apply iterative refinement: + + 4.2.1) At each iteration k, compute residual r_k = b_d - A_d * x_k on the GPU. + 4.2.2) Solve Ad d_k = r_k using the Cholesky factors (on GPU). + 4.2.3) Update x_k+1 = x_k + d_k on GPU. + +4.3) Perform at least 5 refinement iterations (you can use a loop with a fixed number of iterations ≥ 5; optional extra stopping criteria are allowed but not required). + +5) Performance and style constraints + +5.1) Use Dagger tasks / computation graphs appropriately so that the Cholesky factorization and solves are executed in parallel where possible. + +5.2) Avoid unnecessary data movement or conversions. + +5.3) Do not use CPU-only arrays or operations (no Array, no collect to CPU, etc.). + +5.4) Assume that using LinearAlgebra, using SparseArrays, and using Dagger have already been executed. + +5.5) Focus on clarity and correctness first, but structure the code with performance in mind (e.g., reuse LU factors, avoid recomputing them each iteration). + +6) Output format + +6.1) Output only the Julia code for the function: + +function proposed_fn(A_d, b_d) + ... +end + +6.2) Do not include any explanation, comments, or text outside the function definition. + +""" + +secret_key = ENV["OPENAI_API_KEY"] +solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 50) + +println("Generated Code:\n") +println(solver) +write("solver.jl", solver) + + + + +# using Dagger, CUDA, LinearAlgebra +# N = 2000 +# A = rand(N, N) +# A = A * A' +# A[diagind(A)] .+= size(A, 1) +# A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do +# view(A, Blocks(500, 500)) +# end +# b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do +# randn(Blocks(500), N) +# end +# cholesky(A_d) \ b_d \ No newline at end of file diff --git a/examples/agentic/generate-dagger-linear-solver/readme b/examples/agentic/generate-dagger-linear-solver/readme new file mode 100644 index 0000000..e566ed3 --- /dev/null +++ b/examples/agentic/generate-dagger-linear-solver/readme @@ -0,0 +1 @@ +This example generates a high-performance Dagger.jl implementation in Julia for solving sparse linear systems using an LU-based method with iterative refinement. diff --git a/src/Agentic.jl b/src/Agentic.jl index a3a9842..bf3c9e3 100644 --- a/src/Agentic.jl +++ b/src/Agentic.jl @@ -20,7 +20,11 @@ function error_prompt_maker(err_message) end proposed_fn(x) = x -function generate_default_code(prompt, secret_key, checker_filename, model = "gpt-5-mini", dev_prompt_fn = dev_prompt_maker; max_iters = 3) +evaluator(x) = (true, "") +function generate_default_code(prompt, secret_key, checker_filename; + model = "gpt-5-mini", + dev_prompt_fn = dev_prompt_maker, + max_iters = 3) """ - checker_fn: proposed_fn -> check : Bool, performance_description : String """ @@ -40,6 +44,8 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp println("Iteration $iters") + println("Code:\n $gen_code") + # println(gen_code) push!(chat_history, Dict("role" => "assistant", "content" => gen_code)) try @@ -51,7 +57,7 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp next_prompt = description_prompt_maker(check, performance_description) push!(chat_history, Dict("role" => "user", "content" => next_prompt)) - converged = ~(iters == max_iters) + converevlged = ~(iters == max_iters) check && break catch e error_msg = sprint(showerror, e) @@ -59,6 +65,7 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp next_prompt = error_prompt_maker(error_msg * "\n" * st) push!(chat_history, Dict("role" => "user", "content" => next_prompt)) + println("error: $error_msg\n$st") end converged = ~(iters == max_iters) end @@ -81,18 +88,63 @@ function ls_cuda_dev_prompt_maker(fn_str) " Assume that LinearAlgebra and SparseArrays is already imported." end +function ls_dagger_dev_prompt_maker(fn_str) + return "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using Dagger.jl" * + " The user will ask you to generate a function and use the following code the check if your solution is accurate and fast." * + " Make sure the code you produce uses Dagger." * + " Here is the code: \n" * fn_str * "\nOnly return the function. Make sure the function name is proposed_fn. Do not return extra text." * + " Assume that LinearAlgebra and SparseArrays is already imported." * + " Assume that Dagger is already imported." * + " Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/" +end + src_dir = @__DIR__ -function gen_linear_solver(prompt, secret_key, checker_filename = src_dir * "/test_performance.jl", model = "gpt-5-mini"; max_iters = 10) - return generate_default_code(prompt, secret_key, checker_filename, model, ls_dev_prompt_maker; max_iters = max_iters) +function gen_linear_solver(prompt, secret_key; + checker_filename = src_dir * "/test_performance.jl", + model = "gpt-5-mini", + max_iters = 10) + return generate_default_code(prompt, secret_key, checker_filename; + model = model, + dev_prompt_fn=ls_dev_prompt_maker, + max_iters = max_iters) end -function gen_linear_solver_cuda(prompt, secret_key,checker_filename = src_dir *"/test_performance_cuda.jl", model = "gpt-5-mini"; max_iters = 10) - return generate_default_code(prompt, secret_key, checker_filename, model, ls_cuda_dev_prompt_maker; max_iters = max_iters) +function gen_linear_solver_cuda(prompt, secret_key; + checker_filename = src_dir *"/test_performance_cuda.jl", + model = "gpt-5-mini", + max_iters = 10) + return generate_default_code(prompt, secret_key, checker_filename; + model=model, + dev_prompt_fn=ls_cuda_dev_prompt_maker, + max_iters = max_iters) +end + +function gen_linear_solver_dagger(prompt, secret_key; + checker_filename = src_dir *"/test_performance_dagger.jl", + model = "gpt-5-mini", + max_iters = 10) + return generate_default_code(prompt, secret_key, checker_filename; + model=model, + dev_prompt_fn=ls_dagger_dev_prompt_maker, + max_iters = max_iters) end function printhist(hist) for (i, (role, message)) in enumerate(hist) println("Message $i $(role[2]):\n$(message[2])\n") end +end + +function get_report(m_err, m_runtime, m_alloc, + err_threshold, runtime_threshold, alloc_threshold) + report = """ + Median error ratio (error_default / error_gen): $(m_err) + Desired median error ratio: >= $err_threshold + Median runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime) + Desired median runtime ratio: >= $runtime_threshold + Allocation median ratio (alloc_default / alloc_gen): $(m_alloc) + Desired median allocation ratio: >= $alloc_threshold + """ + return report end \ No newline at end of file diff --git a/src/SmartSolve.jl b/src/SmartSolve.jl index 9d3023e..5fc64d5 100644 --- a/src/SmartSolve.jl +++ b/src/SmartSolve.jl @@ -1,5 +1,6 @@ module SmartSolve + using MatrixDepot using LinearAlgebra using DataFrames @@ -13,15 +14,14 @@ using BSON using SparseArrays using OpenAI using CUDA +using Dagger include("SmartDiscovery.jl") include("SmartDB.jl") include("SmartModel.jl") include("Utils.jl") include("Agentic.jl") -include("test_performance.jl") -# include("test_performance_cuda.jl") -export generate_default_code, gen_linear_solver, gen_linear_solver_cuda, printhist +export generate_default_code, gen_linear_solver, gen_linear_solver_cuda, gen_linear_solver_dagger, printhist end # module SmartSolve diff --git a/src/test_performance.jl b/src/test_performance.jl index 85cb8b9..fefdeaf 100644 --- a/src/test_performance.jl +++ b/src/test_performance.jl @@ -4,19 +4,6 @@ push!(test_matrices, sprand(N, N, 0.1)) push!(test_matrices, sprand(N, N, 0.2)) push!(test_matrices, sprand(N, N, 0.3)) -function get_report(m_err, m_runtime, m_alloc, - err_threshold, runtime_threshold, alloc_threshold) - report = """ - Median error ratio (error_default / error_gen): $(m_err) - Desired median error ratio: >= $err_threshold - Median Runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime) - Desired median runtime ratio: >= $runtime_threshold - Allocation median ratio (alloc_default / alloc_gen): $(m_alloc) - Desired median allocation ratio: >= $alloc_threshold - """ - return report -end - function evaluator(proposed_fn, err_threshold=1.0, runtime_threshold=1.1, alloc_threshold=0.0) diff --git a/src/test_performance_cuda.jl b/src/test_performance_cuda.jl index 44921f3..269ba51 100644 --- a/src/test_performance_cuda.jl +++ b/src/test_performance_cuda.jl @@ -4,23 +4,10 @@ push!(test_matrices, sprand(N, N, 0.1)) push!(test_matrices, sprand(N, N, 0.2)) push!(test_matrices, sprand(N, N, 0.3)) -function get_report(m_err, m_runtime, m_alloc, - err_threshold, runtime_threshold, alloc_threshold) - report = """ - Median error ratio (error_default / error_gen): $(m_err) - Desired median error ratio: >= $err_threshold - Median runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime) - Desired median runtime ratio: >= $runtime_threshold - Allocation median ratio (alloc_default / alloc_gen): $(m_alloc) - Desired median allocation ratio: >= $alloc_threshold - """ - return report -end - -function evaluator_cuda(proposed_fn; - err_threshold::Float64 = 1.0, - runtime_threshold::Float64 = 1.1, - alloc_threshold::Float64 = 0.0) +function evaluator( proposed_fn; + err_threshold::Float64 = 1.0, + runtime_threshold::Float64 = 1.1, + alloc_threshold::Float64 = 0.0) error_ratios = Float64[] runtime_ratios = Float64[] diff --git a/src/test_performance_dagger.jl b/src/test_performance_dagger.jl new file mode 100644 index 0000000..36dd211 --- /dev/null +++ b/src/test_performance_dagger.jl @@ -0,0 +1,60 @@ +function evaluator( proposed_fn; + err_threshold::Float64 = 1.0, + runtime_threshold::Float64 = 1.1, + alloc_threshold::Float64 = 0.0) + N = 2000 #10_000 + error_ratios = Float64[] + runtime_ratios = Float64[] + alloc_ratios = Float64[] + for _ in 1:3 + + # SPD Matrix + A = randn(N, N) + A = A*A' + N*I + + # Right-hand side + b = randn(N) + + # SPD Matrix and right hand side on GPU (CUDA) + A_cuda = CuArray(A) + b_cuda = CuArray(b) + + # SPD Matrix and right-hand side on GPU (Dagger Distributed) + A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + distribute(A, Blocks(N÷4, N÷4)) + end + b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do + distribute(b, Blocks(N÷4)) + end + + # --- Solve once to ensure kernels are compiled (warm-up) --- + x_default = cholesky(A_cuda) \ b_cuda + x_gen = Base.invokelatest(proposed_fn, A_d, b_d) + CUDA.synchronize() + + # --- Error ratios (all on GPU, scalars on CPU) --- + err_default = norm(A_cuda * x_default - b_cuda) + err_gen = norm(A_d * x_gen - b_d) + push!(error_ratios, err_default / err_gen) + # --- Runtime ratios (GPU) --- + b_default = @benchmark begin + cholesky($A_cuda) \ $b_cuda + CUDA.synchronize() + end + b_gen = @benchmark begin + Base.invokelatest($proposed_fn, $A_d, $b_d) + end + push!(runtime_ratios, median(b_default.times) / median(b_gen.times)) + push!(alloc_ratios, median(b_default.allocs) / median(b_gen.allocs)) + end + m_err = median(error_ratios) + m_runtime = median(runtime_ratios) + m_alloc = median(alloc_ratios) + report = get_report(m_err, m_runtime, m_alloc, + err_threshold, runtime_threshold, alloc_threshold) + println(report) + ok = (m_err >= err_threshold) && # 1.0 => no worse error + (m_runtime >= runtime_threshold) && # 1.1 => at least 10% faster + (m_alloc >= alloc_threshold) # 0.0 => no alloc requirement + return ok, report +end \ No newline at end of file