JuliaLabs · emmanuellujan · Dec 5, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 13, 2025
diff --git a/Project.toml b/Project.toml
@@ -9,9 +9,11 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
 MatrixDepot = "b51810bb-c9f3-55da-ae3c-350fc1fbce05"
 OpenAI = "e9f21f70-7185-4079-aca2-91159181367c"
@@ -21,15 +23,20 @@ ScikitLearn = "3646fa90-6ef7-5e7e-9f22-8aca16db6324"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
+[sources]
+Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
+
 [compat]
+BSON = "0.3"
 BenchmarkTools = "1"
 CSV = "0.10"
-BSON = "0.3"
 CUDA = "5.9.2"
 CairoMakie = "0.15"
-DecisionTree = "0.12"
+Dagger = "0.19.2"
 DataFrames = "1"
+DecisionTree = "0.12"
 LinearAlgebra = "1.12.0"
+Logging = "1.11.0"
 MKL = "0.9"
 MatrixDepot = "1.0.13"
 OpenAI = "0.12.0"

diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark1.jl b/examples/agentic/generate-cpu-linear-solver/benchmark1.jl
@@ -0,0 +1,134 @@
+using LinearAlgebra
+using SparseArrays
+using CUDA
+using BenchmarkTools
+using OrderedCollections
+using Plots
+
+println("CPU benchmark with error-vs-time plot:\n")
+
+include("solver.jl")
+
+# Configuration
+N = 15_000
+sparsity_levels = [0.1, 0.5, 0.9]
+
+umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix
+#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values
+umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps)
+
+solvers = OrderedDict(
+    "UMFPACK (Baseline Direct Solve)" => (A, b) -> (A \ b),
+    "UMFPACK + Iterative Refinement" => (A, b) -> lu(A; control = umfpack_control) \ b,
+    "SmartSolve-Generated Solver" => (A, b) -> proposed_fn(A, b)
+)
+
+# Store results for plotting
+results = Dict()
+
+for sparsity in sparsity_levels
+    println("\n=== Sparsity: $sparsity ===")
+
+    # Generate problem
+    A = sprand(N, N, sparsity)
+    b = rand(N)
+
+    results[sparsity] = Dict()
+
+    for (solver_name, solver_fn) in solvers
+        println("  $solver_name...")
+
+        # Warm-up
+        b_warm = copy(b)
+        try
+            x_warm = solver_fn(A, b_warm)
+        catch e
+            println("    Warning: solver failed during warm-up: $e")
+            continue
+        end
+
+        # Benchmark
+        b_bench = copy(b)
+        try
+            bench = @benchmark begin
+                x = $(solver_fn)($A, $b_bench)
+            end
+
+            time_ms = median(bench.times) / 1e9  # Convert to s
+
+            # Compute error
+            b_err = copy(b)
+            x_sol = solver_fn(A, b_err)
+            error = norm(A*x_sol - b_err) / norm(b_err)
+
+            results[sparsity][solver_name] = (time=time_ms, error=error)
+            println("    Time: $(round(time_ms, digits=3)) s, Error: $(round(error, sigdigits=3))")
+        catch e
+            println("    Error during benchmark: $e")
+        end
+    end
+
+    GC.gc()
+end
+
+# Create error-vs-time plot
+p = plot(
+    size=(800, 800),
+    #legend=:topright,
+    legend=:topleft,
+    xlabel="Time (s)",
+    ylabel="Relative residual: ||Ax - b||₂ / ||b||₂",
+ #   xscale=:log10,
+    yscale=:log10,
+    guidefontsize=22,#18,
+    tickfontsize=20, #16,
+    legendfontsize=14, #14,
+    margin=5*Plots.mm,
+    framestyle=:box,
+    title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n CPU Solvers",
+    titlefontsize=22,
+)
+
+# Symbols encode sparsity levels; colors encode solvers.
+# Define marker for each sparsity and a color for each solver.
+## Sparsity shapes
+marker_map_sparsity = OrderedDict(0.1=>:circle, 0.5=>:square, 0.9=>:utriangle)
+## Solver color shades
+colors = [:red, :blue, :green]
+
+# Plot each point individually so marker shape shows sparsity and color shows solver.
+for (i, solver_name) in enumerate(keys(solvers))
+    for sparsity in sparsity_levels
+        if sparsity in keys(results) && solver_name in keys(results[sparsity])
+            t = results[sparsity][solver_name].time
+            e = results[sparsity][solver_name].error
+            scatter!(p, [t], [e];
+                     label="",
+                     marker=marker_map_sparsity[sparsity],
+                     markersize=15,
+                     color=colors[i],
+                     markerstrokecolor=:black,
+                     markerstrokewidth=0.0,#0.8,
+                     alpha=0.45)
+        end
+    end
+end
+
+# Create a combined legend
+for (i, solver_name) in enumerate(keys(solvers))
+    for s in sparsity_levels
+        lbl = "$(solver_name), ρ:$(s)"
+        scatter!(p, [NaN], [NaN]; label=lbl,
+                 marker=marker_map_sparsity[s],
+                 markersize=15,
+                 color=colors[i],
+                 markerstrokecolor=:black,
+                 markerstrokewidth=0.0, 
+                 alpha=0.45)
+    end
+end
+
+savefig(p, "error_vs_time.pdf")
+println("\n✓ Plot saved as error_vs_time.pdf")
+
+display(p)
diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark2.jl b/examples/agentic/generate-cpu-linear-solver/benchmark2.jl
@@ -0,0 +1,37 @@
+using LinearAlgebra
+using SparseArrays
+using BenchmarkTools
+
+include("solver.jl")
+
+N = 15_000
+A = sprand(N, N, 0.5) 
+b = rand(N)
+
+# See https://docs.julialang.org/en/v1/stdlib/LinearAlgebra/#LinearAlgebra.lu
+umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix
+#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values
+umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps)
+
+# warm-up
+x1 = A \ b   # solve Ax = b
+x2 = lu(A; control = umfpack_control) \ b   # solve Ax = b, including UMFPACK iterative refinement
+x3 = proposed_fn(A, b)
+
+@benchmark begin
+    x = $A \ $b   # solve Ax = b
+end
+
+@benchmark begin
+    x = lu($A; control = $umfpack_control) \ $b   # solve Ax = b, including UMFPACK iterative refinement
+end
+
+@benchmark begin
+    x = proposed_fn($A, $b) 
+end
+
+error1 = norm(A*x1 - b) / norm(b)
+
+error2 = norm(A*x2 - b) / norm(b)
+
+error3 = norm(A*x3 - b) / norm(b)
diff --git a/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf b/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf
diff --git a/examples/agentic/generate-cpu-linear-solver/solver.jl b/examples/agentic/generate-cpu-linear-solver/solver.jl
@@ -1,65 +1,44 @@
-function proposed_fn(A::SparseMatrixCSC, b::AbstractVector)
-    @assert size(A,1) == size(A,2) "A must be square"
-    n = length(b)
-    @assert size(A,2) == n "Dimensions of A and b must agree"
-
-    niters = 4
-
-    # Convert sparse matrix to dense double for accurate residual computation
-    # and to dense single for fast factorization/solves with multithreaded BLAS.
-    Ad64 = Array(A)                    # dense Float64
-    Ad32 = Array{Float32}(undef, n, n)
-    @inbounds for j in 1:n
-        for i in 1:n
-            Ad32[i,j] = Float32(Ad64[i,j])
-        end
+function proposed_fn(A, b)
+    # Cache Float32 factorizations and work buffers per matrix identity
+    if !isdefined(@__MODULE__, :LU32_CACHE)
+        global LU32_CACHE = IdDict{UInt64, Tuple{Any, Vector{Float64}, Vector{Float32}, Vector{Float32}}}()
     end
 
-    # Convert rhs to Float32 once
-    b32 = Vector{Float32}(undef, n)
-    @inbounds @simd for i in 1:n
-        b32[i] = Float32(b[i])
-    end
-
-    # Factorize dense single-precision matrix (uses LAPACK/BLAS and is multithreaded)
-    F32 = lu(Ad32)
-
-    # Initial solve in single precision, in-place if possible
-    x32 = copy(b32)
-    try
-        LinearAlgebra.ldiv!(F32, x32)   # in-place: x32 <- Ad32 \ b32
-    catch
-        x32 = F32 \ b32                 # fallback
+    # Ensure b as Float64 vector (avoid copy if already Float64)
+    b64 = eltype(b) === Float64 ? b : Vector{Float64}(b)
+    n = length(b64)
+
+    key = objectid(A)
+    F32, r64, work32, bf32 = get!(LU32_CACHE, key) do
+        # Build a single-precision copy of the numeric values (structure reuse)
+        nz32 = Float32.(A.nzval)
+        Af = SparseMatrixCSC{Float32, Int}(size(A,1), size(A,2),
+                                           copy(A.colptr), copy(A.rowval),
+                                           nz32)
+        F32_local = lu(Af)                           # single-precision sparse LU
+        r64_local = Vector{Float64}(undef, n)       # residual buffer (double)
+        work32_local = Vector{Float32}(undef, n)    # temp residual in single
+        bf32_local = Vector{Float32}(undef, n)      # temp right-hand side in single
+        return (F32_local, r64_local, work32_local, bf32_local)
     end
 
-    # Promote to double precision for accumulation and residual computation
-    x = Vector{Float64}(undef, n)
-    @inbounds @simd for i in 1:n
-        x[i] = Float64(x32[i])
+    # Initial solve in single precision, accumulate in double
+    @inbounds for i = 1:n
+        bf32[i] = Float32(b64[i])
     end
-
-    # Preallocate working vectors
-    r = similar(b)                     # Float64 residual
-    r32 = Vector{Float32}(undef, n)    # single-precision correction (in-place)
-
-    for iter in 1:niters
-        # r = b - Ad64 * x   (use BLAS for dense matvec)
-        mul!(r, Ad64, x)              # r = Ad64 * x
-        @inbounds @simd for i in 1:n
-            r[i] = b[i] - r[i]
-            r32[i] = Float32(r[i])
+    xf32 = F32 \ bf32
+    x = Float64.(xf32)
+
+    # Iterative refinement: compute residual in double, solve correction in single, update double solution
+    for _ = 1:5
+        mul!(r64, A, x)                       # r64 = A * x (double)
+        @inbounds for i = 1:n
+            r64[i] = b64[i] - r64[i]          # r64 = b - A*x
+            work32[i] = Float32(r64[i])       # convert residual to single
         end
-
-        # Solve correction in single precision using the LU factorization
-        try
-            LinearAlgebra.ldiv!(F32, r32)   # r32 <- Ad32 \ r32 (in-place)
-        catch
-            r32 = F32 \ r32                 # fallback
-        end
-
-        # Update double-precision solution
-        @inbounds @simd for i in 1:n
-            x[i] += Float64(r32[i])
+        d32 = F32 \ work32
+        @inbounds for i = 1:n
+            x[i] += Float64(d32[i])           # update solution in double
         end
     end
 

diff --git a/examples/agentic/generate-dagger-linear-solver/Project.toml b/examples/agentic/generate-dagger-linear-solver/Project.toml
@@ -0,0 +1,12 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+SmartSolve = "4fbb3a3c-2fa1-4c19-8d57-bae8bc1e16ac"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[sources]
+Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
+SmartSolve = {path = "../../.."}