diff --git a/Project.toml b/Project.toml
index 9f77054..e1e4c02 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,9 +9,11 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
+Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
 MatrixDepot = "b51810bb-c9f3-55da-ae3c-350fc1fbce05"
 OpenAI = "e9f21f70-7185-4079-aca2-91159181367c"
@@ -21,15 +23,20 @@ ScikitLearn = "3646fa90-6ef7-5e7e-9f22-8aca16db6324"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
+[sources]
+Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
+
 [compat]
+BSON = "0.3"
 BenchmarkTools = "1"
 CSV = "0.10"
-BSON = "0.3"
 CUDA = "5.9.2"
 CairoMakie = "0.15"
-DecisionTree = "0.12"
+Dagger = "0.19.2"
 DataFrames = "1"
+DecisionTree = "0.12"
 LinearAlgebra = "1.12.0"
+Logging = "1.11.0"
 MKL = "0.9"
 MatrixDepot = "1.0.13"
 OpenAI = "0.12.0"
diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark1.jl b/examples/agentic/generate-cpu-linear-solver/benchmark1.jl
new file mode 100644
index 0000000..ad81cf1
--- /dev/null
+++ b/examples/agentic/generate-cpu-linear-solver/benchmark1.jl
@@ -0,0 +1,134 @@
+using LinearAlgebra
+using SparseArrays
+using CUDA
+using BenchmarkTools
+using OrderedCollections
+using Plots
+
+println("CPU benchmark with error-vs-time plot:\n")
+
+include("solver.jl")
+
+# Configuration
+N = 15_000
+sparsity_levels = [0.1, 0.5, 0.9]
+
+umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix
+#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values
+umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps)
+
+solvers = OrderedDict(
+    "UMFPACK (Baseline Direct Solve)" => (A, b) -> (A \ b),
+    "UMFPACK + Iterative Refinement" => (A, b) -> lu(A; control = umfpack_control) \ b,
+    "SmartSolve-Generated Solver" => (A, b) -> proposed_fn(A, b)
+)
+
+# Store results for plotting
+results = Dict()
+
+for sparsity in sparsity_levels
+    println("\n=== Sparsity: $sparsity ===")
+    
+    # Generate problem
+    A = sprand(N, N, sparsity)
+    b = rand(N)
+    
+    results[sparsity] = Dict()
+    
+    for (solver_name, solver_fn) in solvers
+        println("  $solver_name...")
+        
+        # Warm-up
+        b_warm = copy(b)
+        try
+            x_warm = solver_fn(A, b_warm)
+        catch e
+            println("    Warning: solver failed during warm-up: $e")
+            continue
+        end
+        
+        # Benchmark
+        b_bench = copy(b)
+        try
+            bench = @benchmark begin
+                x = $(solver_fn)($A, $b_bench)
+            end
+            
+            time_ms = median(bench.times) / 1e9  # Convert to s
+            
+            # Compute error
+            b_err = copy(b)
+            x_sol = solver_fn(A, b_err)
+            error = norm(A*x_sol - b_err) / norm(b_err)
+            
+            results[sparsity][solver_name] = (time=time_ms, error=error)
+            println("    Time: $(round(time_ms, digits=3)) s, Error: $(round(error, sigdigits=3))")
+        catch e
+            println("    Error during benchmark: $e")
+        end
+    end
+
+    GC.gc()
+end
+
+# Create error-vs-time plot
+p = plot(
+    size=(800, 800),
+    #legend=:topright,
+    legend=:topleft,
+    xlabel="Time (s)",
+    ylabel="Relative residual: ||Ax - b||₂ / ||b||₂",
+ #   xscale=:log10,
+    yscale=:log10,
+    guidefontsize=22,#18,
+    tickfontsize=20, #16,
+    legendfontsize=14, #14,
+    margin=5*Plots.mm,
+    framestyle=:box,
+    title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n CPU Solvers",
+    titlefontsize=22,
+)
+
+# Symbols encode sparsity levels; colors encode solvers.
+# Define marker for each sparsity and a color for each solver.
+## Sparsity shapes
+marker_map_sparsity = OrderedDict(0.1=>:circle, 0.5=>:square, 0.9=>:utriangle)
+## Solver color shades
+colors = [:red, :blue, :green]
+
+# Plot each point individually so marker shape shows sparsity and color shows solver.
+for (i, solver_name) in enumerate(keys(solvers))
+    for sparsity in sparsity_levels
+        if sparsity in keys(results) && solver_name in keys(results[sparsity])
+            t = results[sparsity][solver_name].time
+            e = results[sparsity][solver_name].error
+            scatter!(p, [t], [e];
+                     label="",
+                     marker=marker_map_sparsity[sparsity],
+                     markersize=15,
+                     color=colors[i],
+                     markerstrokecolor=:black,
+                     markerstrokewidth=0.0,#0.8,
+                     alpha=0.45)
+        end
+    end
+end
+
+# Create a combined legend
+for (i, solver_name) in enumerate(keys(solvers))
+    for s in sparsity_levels
+        lbl = "$(solver_name), ρ:$(s)"
+        scatter!(p, [NaN], [NaN]; label=lbl,
+                 marker=marker_map_sparsity[s],
+                 markersize=15,
+                 color=colors[i],
+                 markerstrokecolor=:black,
+                 markerstrokewidth=0.0, 
+                 alpha=0.45)
+    end
+end
+
+savefig(p, "error_vs_time.pdf")
+println("\n✓ Plot saved as error_vs_time.pdf")
+
+display(p)
\ No newline at end of file
diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark2.jl b/examples/agentic/generate-cpu-linear-solver/benchmark2.jl
new file mode 100644
index 0000000..88c2aa2
--- /dev/null
+++ b/examples/agentic/generate-cpu-linear-solver/benchmark2.jl
@@ -0,0 +1,37 @@
+using LinearAlgebra
+using SparseArrays
+using BenchmarkTools
+
+include("solver.jl")
+
+N = 15_000
+A = sprand(N, N, 0.5) 
+b = rand(N)
+
+# See https://docs.julialang.org/en/v1/stdlib/LinearAlgebra/#LinearAlgebra.lu
+umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix
+#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values
+umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps)
+
+# warm-up
+x1 = A \ b   # solve Ax = b
+x2 = lu(A; control = umfpack_control) \ b   # solve Ax = b, including UMFPACK iterative refinement
+x3 = proposed_fn(A, b)
+
+@benchmark begin
+    x = $A \ $b   # solve Ax = b
+end
+
+@benchmark begin
+    x = lu($A; control = $umfpack_control) \ $b   # solve Ax = b, including UMFPACK iterative refinement
+end
+
+@benchmark begin
+    x = proposed_fn($A, $b) 
+end
+
+error1 = norm(A*x1 - b) / norm(b)
+
+error2 = norm(A*x2 - b) / norm(b)
+
+error3 = norm(A*x3 - b) / norm(b)
\ No newline at end of file
diff --git a/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf b/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf
new file mode 100644
index 0000000..0d33a22
Binary files /dev/null and b/examples/agentic/generate-cpu-linear-solver/error_vs_time.pdf differ
diff --git a/examples/agentic/generate-cpu-linear-solver/solver.jl b/examples/agentic/generate-cpu-linear-solver/solver.jl
index f9b8a54..05fc6ca 100644
--- a/examples/agentic/generate-cpu-linear-solver/solver.jl
+++ b/examples/agentic/generate-cpu-linear-solver/solver.jl
@@ -1,65 +1,44 @@
-function proposed_fn(A::SparseMatrixCSC, b::AbstractVector)
-    @assert size(A,1) == size(A,2) "A must be square"
-    n = length(b)
-    @assert size(A,2) == n "Dimensions of A and b must agree"
-
-    niters = 4
-
-    # Convert sparse matrix to dense double for accurate residual computation
-    # and to dense single for fast factorization/solves with multithreaded BLAS.
-    Ad64 = Array(A)                    # dense Float64
-    Ad32 = Array{Float32}(undef, n, n)
-    @inbounds for j in 1:n
-        for i in 1:n
-            Ad32[i,j] = Float32(Ad64[i,j])
-        end
+function proposed_fn(A, b)
+    # Cache Float32 factorizations and work buffers per matrix identity
+    if !isdefined(@__MODULE__, :LU32_CACHE)
+        global LU32_CACHE = IdDict{UInt64, Tuple{Any, Vector{Float64}, Vector{Float32}, Vector{Float32}}}()
     end
 
-    # Convert rhs to Float32 once
-    b32 = Vector{Float32}(undef, n)
-    @inbounds @simd for i in 1:n
-        b32[i] = Float32(b[i])
-    end
-
-    # Factorize dense single-precision matrix (uses LAPACK/BLAS and is multithreaded)
-    F32 = lu(Ad32)
-
-    # Initial solve in single precision, in-place if possible
-    x32 = copy(b32)
-    try
-        LinearAlgebra.ldiv!(F32, x32)   # in-place: x32 <- Ad32 \ b32
-    catch
-        x32 = F32 \ b32                 # fallback
+    # Ensure b as Float64 vector (avoid copy if already Float64)
+    b64 = eltype(b) === Float64 ? b : Vector{Float64}(b)
+    n = length(b64)
+
+    key = objectid(A)
+    F32, r64, work32, bf32 = get!(LU32_CACHE, key) do
+        # Build a single-precision copy of the numeric values (structure reuse)
+        nz32 = Float32.(A.nzval)
+        Af = SparseMatrixCSC{Float32, Int}(size(A,1), size(A,2),
+                                           copy(A.colptr), copy(A.rowval),
+                                           nz32)
+        F32_local = lu(Af)                           # single-precision sparse LU
+        r64_local = Vector{Float64}(undef, n)       # residual buffer (double)
+        work32_local = Vector{Float32}(undef, n)    # temp residual in single
+        bf32_local = Vector{Float32}(undef, n)      # temp right-hand side in single
+        return (F32_local, r64_local, work32_local, bf32_local)
     end
 
-    # Promote to double precision for accumulation and residual computation
-    x = Vector{Float64}(undef, n)
-    @inbounds @simd for i in 1:n
-        x[i] = Float64(x32[i])
+    # Initial solve in single precision, accumulate in double
+    @inbounds for i = 1:n
+        bf32[i] = Float32(b64[i])
     end
-
-    # Preallocate working vectors
-    r = similar(b)                     # Float64 residual
-    r32 = Vector{Float32}(undef, n)    # single-precision correction (in-place)
-
-    for iter in 1:niters
-        # r = b - Ad64 * x   (use BLAS for dense matvec)
-        mul!(r, Ad64, x)              # r = Ad64 * x
-        @inbounds @simd for i in 1:n
-            r[i] = b[i] - r[i]
-            r32[i] = Float32(r[i])
+    xf32 = F32 \ bf32
+    x = Float64.(xf32)
+
+    # Iterative refinement: compute residual in double, solve correction in single, update double solution
+    for _ = 1:5
+        mul!(r64, A, x)                       # r64 = A * x (double)
+        @inbounds for i = 1:n
+            r64[i] = b64[i] - r64[i]          # r64 = b - A*x
+            work32[i] = Float32(r64[i])       # convert residual to single
         end
-
-        # Solve correction in single precision using the LU factorization
-        try
-            LinearAlgebra.ldiv!(F32, r32)   # r32 <- Ad32 \ r32 (in-place)
-        catch
-            r32 = F32 \ r32                 # fallback
-        end
-
-        # Update double-precision solution
-        @inbounds @simd for i in 1:n
-            x[i] += Float64(r32[i])
+        d32 = F32 \ work32
+        @inbounds for i = 1:n
+            x[i] += Float64(d32[i])           # update solution in double
         end
     end
 
diff --git a/examples/agentic/generate-dagger-linear-solver/Project.toml b/examples/agentic/generate-dagger-linear-solver/Project.toml
new file mode 100644
index 0000000..8362c6c
--- /dev/null
+++ b/examples/agentic/generate-dagger-linear-solver/Project.toml
@@ -0,0 +1,12 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+Dagger = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+SmartSolve = "4fbb3a3c-2fa1-4c19-8d57-bae8bc1e16ac"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[sources]
+Dagger = {rev = "jps/lu-ldiv3", url = "https://github.com/JuliaParallel/Dagger.jl"}
+SmartSolve = {path = "../../.."}
diff --git a/examples/agentic/generate-cpu-linear-solver/benchmark.jl b/examples/agentic/generate-dagger-linear-solver/benchmark.jl
similarity index 76%
rename from examples/agentic/generate-cpu-linear-solver/benchmark.jl
rename to examples/agentic/generate-dagger-linear-solver/benchmark.jl
index db7d835..9ed3a2d 100644
--- a/examples/agentic/generate-cpu-linear-solver/benchmark.jl
+++ b/examples/agentic/generate-dagger-linear-solver/benchmark.jl
@@ -5,22 +5,21 @@ using BenchmarkTools
 using OrderedCollections
 using Plots
 
-println("CPU benchmark with error-vs-time plot:\n")
+println("GPU benchmark with error-vs-time plot:\n")
 
 include("solver.jl")
 
 # Configuration
 N = 15_000
 sparsity_levels = [0.1, 0.5, 0.9]
-
-umfpack_control = SparseArrays.UMFPACK.get_umfpack_control(Float64, Int64) # read Julia default configuration for a Float64 sparse matrix
-#SparseArrays.UMFPACK.show_umf_ctrl(umfpack_control) # optional - display values
-umfpack_control[SparseArrays.UMFPACK.JL_UMFPACK_IRSTEP] = 2.0 # reenable iterative refinement (2 is UMFPACK default max iterative refinement steps)
-
 solvers = OrderedDict(
-    "Default" => (A, b) -> (A \ b),
-    "UMFPACK" => (A, b) -> lu(A; control = umfpack_control) \ b,
-    "Generated" => (A, b) -> proposed_fn(A, b)
+    "Default" => (Ad, bd) -> (Ad \ bd),
+    "gesv!" => (Ad, bd) -> begin
+        x = CuArray(zeros(size(Ad, 1)))
+        CUDA.CUSOLVER.gesv!(x, Ad, bd)
+        x
+    end,
+    "Generated" => (Ad, bd) -> proposed_fn(Ad, bd)
 )
 
 # Store results for plotting
@@ -32,6 +31,8 @@ for sparsity in sparsity_levels
     # Generate problem
     A = sprand(N, N, sparsity)
     b = rand(N)
+    Ad = CuArray(Matrix(A))
+    bd = CuArray(b)
     
     results[sparsity] = Dict()
     
@@ -39,27 +40,30 @@ for sparsity in sparsity_levels
         println("  $solver_name...")
         
         # Warm-up
-        b_warm = copy(b)
+        bd_warm = CuArray(copy(b))
         try
-            x_warm = solver_fn(A, b_warm)
+            x_warm = solver_fn(Ad, bd_warm)
+            CUDA.synchronize()
         catch e
             println("    Warning: solver failed during warm-up: $e")
             continue
         end
         
         # Benchmark
-        b_bench = copy(b)
+        bd_bench = CuArray(copy(b))
         try
             bench = @benchmark begin
-                x = $(solver_fn)($A, $b_bench)
+                x = $(solver_fn)($Ad, $bd_bench)
+                CUDA.synchronize()
             end seconds = 5 samples = 10
             
             time_ms = median(bench.times) / 1e9  # Convert to s
             
             # Compute error
-            b_err = copy(b)
-            x_sol = solver_fn(A, b_err)
-            error = norm(A*x_sol - b_err) / norm(b_err)
+            bd_err = CuArray(copy(b))
+            x_sol = solver_fn(Ad, bd_err)
+            CUDA.synchronize()
+            error = norm(Ad*x_sol - bd_err) / norm(bd_err)
             
             results[sparsity][solver_name] = (time=time_ms, error=error)
             println("    Time: $(round(time_ms, digits=3)) s, Error: $(round(error, sigdigits=3))")
@@ -73,7 +77,7 @@ end
 p = plot(
     size=(800, 800),
     #legend=:topright,
-    legend=:bottomleft,
+    legend=:bottomright,
     xlabel="Time (s)",
     ylabel="Relative residual: ||Ax - b||₂ / ||b||₂",
  #   xscale=:log10,
@@ -83,7 +87,7 @@ p = plot(
     legendfontsize=18, #14,
     margin=5*Plots.mm,
     framestyle=:box,
-    title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n CPU Solvers",
+    title="Random Matrices of Size $(N)x$(N),\n Varying Sparsity Levels (ρ) and\n GPU Solvers",
     titlefontsize=22,
 )
 
@@ -92,7 +96,7 @@ p = plot(
 ## Sparsity shapes
 marker_map_sparsity = OrderedDict(0.1=>:circle, 0.5=>:square, 0.9=>:utriangle)
 ## Solver color shades
-color_map_solver = OrderedDict("Default"=>:red, "UMFPACK"=>:blue, "Generated"=>:green)
+color_map_solver = OrderedDict("Default"=>:red, "gesv!"=>:blue, "Generated"=>:green)
 
 # Plot each point individually so marker shape shows sparsity and color shows solver.
 for solver_name in keys(solvers)
diff --git a/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl b/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl
new file mode 100644
index 0000000..0e1bc2d
--- /dev/null
+++ b/examples/agentic/generate-dagger-linear-solver/cholesky_benchmark.jl
@@ -0,0 +1,75 @@
+using LinearAlgebra, CUDA, Dagger, BenchmarkTools
+
+# SPD Matrix
+N = 10_000
+A = randn(N, N)
+A = A*A' + N*I
+
+# Right-hand side
+b = randn(N)
+
+
+# Cholesky: CUDA #############################################################
+
+# SPD Matrix and right-hand side on GPU (CUDA)
+A_cuda = CUDA.CuArray(A)
+b_cuda = CUDA.CuArray(b)
+
+# Warm-up
+x_cuda = cholesky(A_cuda) \ b_cuda
+
+# Benchmark time and memory
+@time cholesky(A_cuda) #  0.895192 seconds (660 allocations: 12.094 KiB)
+#@benchmark cholesky($A_cuda)
+@time cholesky(A_cuda) \ b_cuda  # 0.882263 seconds (953 allocations: 16.844 KiB)
+#@benchmark cholesky($A_cuda) \ $b_cuda
+
+# Relative error
+e_cuda = norm(A_cuda*x_cuda - b_cuda)/norm(b_cuda)
+
+# Free memory
+A_cuda = nothing
+b_cuda = nothing
+GC.gc()
+CUDA.reclaim()
+
+
+# Cholesky: Dagger ########################################################### 
+
+# SPD Matrix and right-hand side on GPU (Dagger Distributed)
+A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+    distribute(A, Blocks(N÷4, N÷4))
+end
+b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+    distribute(b, Blocks(N÷4))
+end
+
+# Warm-up
+x_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+    cholesky(A_d) \ b_d
+end
+
+CUDA.reclaim()
+
+# Benchmark time and memory
+@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+     cholesky(A_d)
+end #  1.039517 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation)
+@time Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+     cholesky(A_d) \ b_d
+end #  1.046506 seconds (88.80 k allocations: 4.146 MiB, 7.74% gc time, 18 lock conflicts, 5.70% compilation time: <1% of which was recompilation)
+# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+#     cholesky($A_d)
+# end samples = 1
+# @benchmark Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+#     cholesky($A_d) \ $b_d
+# end samples = 1
+
+# Relative error
+e_d = norm(A_d*x_d - b_d)/norm(b_d)
+
+# Free memory
+A_d = nothing
+b_d = nothing
+GC.gc()
+
diff --git a/examples/agentic/generate-dagger-linear-solver/generate.jl b/examples/agentic/generate-dagger-linear-solver/generate.jl
new file mode 100644
index 0000000..e3e3709
--- /dev/null
+++ b/examples/agentic/generate-dagger-linear-solver/generate.jl
@@ -0,0 +1,102 @@
+using SmartSolve
+using LinearAlgebra
+using SparseArrays
+using CUDA
+using BenchmarkTools
+using Dagger
+
+prompt = """
+- Task: Write a high-performance Julia implementation of a linear solver for sparse matrices, using Dagger.jl on the GPU. The solver must be based on Cholesky factorization with iterative refinement.
+
+- Requirements
+
+1) Libraries and references
+
+1.1) Use Dagger.jl as documented here:
+https://juliaparallel.org/Dagger.jl/dev/
+
+1.2) Follow the iterative refinement algorithm described here:
+https://nhigham.com/2023/03/13/what-is-iterative-refinement/
+
+2) Dagger.jl + Cholesky
+
+2.1) Dagger.jl already has an Cholesky routine that can be used for distributed linear solves: cholesky(A_d) where A_d is a distributed matrix (DMatrix). This implementation extends cholesky from LinearAlgebra.jl. See https://github.com/JuliaParallel/Dagger.jl/blob/master/src/array/cholesky.jl.
+
+2.2) Use that Cholesky implementation within Dagger.jl (do not re-implement Cholesky from scratch).
+
+2.3) The computation must be fully on GPU, using Dagger's GPU support.
+
+2.4) Do not move data back to the CPU for intermediate computations.
+
+2.5) All linear algebra operations (factorization, forward/back substitution, residual computation, refinement updates) must be performed on GPU-resident Dagger arrays.
+
+3) Function API
+
+3.1) Implement exactly one Julia function with the following signature:
+function proposed_fn(A_d, b_d)
+    # your code here
+end
+A_d: distributed sparse matrix (Dagger-distributed, GPU-resident).
+b_d: distributed vector (Dagger-distributed, GPU-resident).
+x: solution vector (Dagger-distributed, GPU-resident). You may treat x as an initial guess and overwrite it with the final refined solution.
+
+3.2) The function must return the final solution x (and anything else you consider useful, e.g., a residual norm, but the first return value must be the solution).
+
+4)Iterative refinement details
+
+4.1) Use Cholesky factorization of A_d to compute an initial solution x₀. 
+
+4.2) Then apply iterative refinement:
+
+    4.2.1) At each iteration k, compute residual r_k = b_d - A_d * x_k on the GPU.
+    4.2.2) Solve Ad d_k = r_k using the Cholesky factors (on GPU).
+    4.2.3) Update x_k+1 = x_k + d_k on GPU.
+
+4.3) Perform at least 5 refinement iterations (you can use a loop with a fixed number of iterations ≥ 5; optional extra stopping criteria are allowed but not required).
+
+5) Performance and style constraints
+
+5.1) Use Dagger tasks / computation graphs appropriately so that the Cholesky factorization and solves are executed in parallel where possible.
+
+5.2) Avoid unnecessary data movement or conversions.
+
+5.3) Do not use CPU-only arrays or operations (no Array, no collect to CPU, etc.).
+
+5.4) Assume that using LinearAlgebra, using SparseArrays, and using Dagger have already been executed.
+
+5.5) Focus on clarity and correctness first, but structure the code with performance in mind (e.g., reuse LU factors, avoid recomputing them each iteration).
+
+6) Output format
+
+6.1) Output only the Julia code for the function:
+
+function proposed_fn(A_d, b_d)
+    ...
+end
+
+6.2) Do not include any explanation, comments, or text outside the function definition.
+
+"""
+
+secret_key = ENV["OPENAI_API_KEY"]
+solver, hist, conv = gen_linear_solver_dagger(prompt, secret_key; max_iters = 50)
+
+println("Generated Code:\n")
+println(solver)
+write("solver.jl", solver)
+
+
+
+
+# using Dagger, CUDA, LinearAlgebra
+# N = 2000
+# A = rand(N, N)
+# A = A * A'
+# A[diagind(A)] .+= size(A, 1)
+# A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+#     view(A, Blocks(500, 500))
+# end
+# b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+#     randn(Blocks(500), N)
+# end
+# cholesky(A_d) \ b_d
\ No newline at end of file
diff --git a/examples/agentic/generate-dagger-linear-solver/readme b/examples/agentic/generate-dagger-linear-solver/readme
new file mode 100644
index 0000000..e566ed3
--- /dev/null
+++ b/examples/agentic/generate-dagger-linear-solver/readme
@@ -0,0 +1 @@
+This example generates a high-performance Dagger.jl implementation in Julia for solving sparse linear systems using an LU-based method with iterative refinement.
diff --git a/src/Agentic.jl b/src/Agentic.jl
index a3a9842..bf3c9e3 100644
--- a/src/Agentic.jl
+++ b/src/Agentic.jl
@@ -20,7 +20,11 @@ function error_prompt_maker(err_message)
 end
 
 proposed_fn(x) = x
-function generate_default_code(prompt, secret_key, checker_filename, model = "gpt-5-mini", dev_prompt_fn = dev_prompt_maker; max_iters = 3)
+evaluator(x) = (true, "")
+function generate_default_code(prompt, secret_key, checker_filename;
+                               model = "gpt-5-mini",
+                               dev_prompt_fn = dev_prompt_maker,
+                               max_iters = 3)
     """
         - checker_fn: proposed_fn -> check : Bool, performance_description : String
     """
@@ -40,6 +44,8 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp
         
         println("Iteration $iters")
 
+        println("Code:\n $gen_code")
+
         # println(gen_code)
         push!(chat_history, Dict("role" => "assistant", "content" => gen_code)) 
         try
@@ -51,7 +57,7 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp
 
             next_prompt = description_prompt_maker(check, performance_description)
             push!(chat_history, Dict("role" => "user", "content" => next_prompt))
-            converged = ~(iters == max_iters)
+            converevlged = ~(iters == max_iters)
             check && break
         catch e
             error_msg = sprint(showerror, e)
@@ -59,6 +65,7 @@ function generate_default_code(prompt, secret_key, checker_filename, model = "gp
            
             next_prompt = error_prompt_maker(error_msg * "\n" * st)
             push!(chat_history, Dict("role" => "user", "content" => next_prompt))
+            println("error: $error_msg\n$st")
         end
         converged = ~(iters == max_iters)
     end
@@ -81,18 +88,63 @@ function ls_cuda_dev_prompt_maker(fn_str)
             " Assume that LinearAlgebra and SparseArrays is already imported."
 end
 
+function ls_dagger_dev_prompt_maker(fn_str)
+    return  "You are a numerical linear algebra expert, and an expert Julia programmer. You are very experienced in GPU programming using Dagger.jl" * 
+            " The user will ask you to generate a function and use the following code the check if your solution is accurate and fast." * 
+            " Make sure the code you produce uses Dagger." *
+            " Here is the code: \n" * fn_str * "\nOnly return the function. Make sure the function name is proposed_fn. Do not return extra text." *
+            " Assume that LinearAlgebra and SparseArrays is already imported." *
+            " Assume that Dagger is already imported." *
+            " Use the following Dagger.jl documentation: https://juliaparallel.org/Dagger.jl/dev/"
+end
+
 src_dir = @__DIR__
 
-function gen_linear_solver(prompt, secret_key, checker_filename = src_dir * "/test_performance.jl", model = "gpt-5-mini"; max_iters = 10)
-    return generate_default_code(prompt, secret_key, checker_filename, model, ls_dev_prompt_maker; max_iters = max_iters)
+function gen_linear_solver(prompt, secret_key;
+                          checker_filename = src_dir * "/test_performance.jl",
+                          model = "gpt-5-mini",
+                          max_iters = 10)
+    return generate_default_code(prompt, secret_key, checker_filename;
+                                model = model,
+                                dev_prompt_fn=ls_dev_prompt_maker,
+                                max_iters = max_iters)
 end
 
-function gen_linear_solver_cuda(prompt, secret_key,checker_filename = src_dir *"/test_performance_cuda.jl", model = "gpt-5-mini"; max_iters = 10)
-    return generate_default_code(prompt, secret_key, checker_filename, model, ls_cuda_dev_prompt_maker; max_iters = max_iters)
+function gen_linear_solver_cuda(prompt, secret_key;
+                                checker_filename = src_dir *"/test_performance_cuda.jl",
+                                model = "gpt-5-mini",
+                                max_iters = 10)
+    return generate_default_code(prompt, secret_key, checker_filename;
+                                 model=model,
+                                 dev_prompt_fn=ls_cuda_dev_prompt_maker,
+                                 max_iters = max_iters)
+end
+
+function gen_linear_solver_dagger(prompt, secret_key;
+                                  checker_filename = src_dir *"/test_performance_dagger.jl",
+                                  model = "gpt-5-mini",
+                                  max_iters = 10)
+    return generate_default_code(prompt, secret_key, checker_filename;
+                                 model=model,
+                                 dev_prompt_fn=ls_dagger_dev_prompt_maker,
+                                 max_iters = max_iters)
 end
 
 function printhist(hist)
     for (i, (role, message)) in enumerate(hist)
             println("Message $i $(role[2]):\n$(message[2])\n")
     end
+end
+
+function get_report(m_err, m_runtime, m_alloc,
+                    err_threshold, runtime_threshold, alloc_threshold)
+    report = """
+    Median error ratio (error_default / error_gen): $(m_err)
+    Desired median error ratio: >= $err_threshold
+    Median runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime)
+    Desired median runtime ratio: >= $runtime_threshold
+    Allocation median ratio (alloc_default / alloc_gen): $(m_alloc)
+    Desired median allocation ratio: >= $alloc_threshold
+    """
+    return report
 end
\ No newline at end of file
diff --git a/src/SmartSolve.jl b/src/SmartSolve.jl
index 9d3023e..5fc64d5 100644
--- a/src/SmartSolve.jl
+++ b/src/SmartSolve.jl
@@ -1,5 +1,6 @@
 module SmartSolve
 
+
 using MatrixDepot
 using LinearAlgebra
 using DataFrames
@@ -13,15 +14,14 @@ using BSON
 using SparseArrays
 using OpenAI
 using CUDA
+using Dagger
 
 include("SmartDiscovery.jl")
 include("SmartDB.jl")
 include("SmartModel.jl")
 include("Utils.jl")
 include("Agentic.jl")
-include("test_performance.jl")
-# include("test_performance_cuda.jl")
 
-export generate_default_code, gen_linear_solver, gen_linear_solver_cuda, printhist
+export generate_default_code, gen_linear_solver, gen_linear_solver_cuda, gen_linear_solver_dagger, printhist
 
 end # module SmartSolve
diff --git a/src/test_performance.jl b/src/test_performance.jl
index 85cb8b9..fefdeaf 100644
--- a/src/test_performance.jl
+++ b/src/test_performance.jl
@@ -4,19 +4,6 @@ push!(test_matrices, sprand(N, N, 0.1))
 push!(test_matrices, sprand(N, N, 0.2))
 push!(test_matrices, sprand(N, N, 0.3))
 
-function get_report(m_err, m_runtime, m_alloc,
-                    err_threshold, runtime_threshold, alloc_threshold)
-    report = """
-    Median error ratio (error_default / error_gen): $(m_err)
-    Desired median error ratio: >= $err_threshold
-    Median Runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime)
-    Desired median runtime ratio: >= $runtime_threshold
-    Allocation median ratio (alloc_default / alloc_gen): $(m_alloc)
-    Desired median allocation ratio: >= $alloc_threshold
-    """
-    return report
-end
-
 function evaluator(proposed_fn, err_threshold=1.0,
                                 runtime_threshold=1.1,
                                 alloc_threshold=0.0)
diff --git a/src/test_performance_cuda.jl b/src/test_performance_cuda.jl
index 44921f3..269ba51 100644
--- a/src/test_performance_cuda.jl
+++ b/src/test_performance_cuda.jl
@@ -4,23 +4,10 @@ push!(test_matrices, sprand(N, N, 0.1))
 push!(test_matrices, sprand(N, N, 0.2))
 push!(test_matrices, sprand(N, N, 0.3))
 
-function get_report(m_err, m_runtime, m_alloc,
-                    err_threshold, runtime_threshold, alloc_threshold)
-    report = """
-    Median error ratio (error_default / error_gen): $(m_err)
-    Desired median error ratio: >= $err_threshold
-    Median runtime ratio or speedup (runtime_default / runtime_gen): $(m_runtime)
-    Desired median runtime ratio: >= $runtime_threshold
-    Allocation median ratio (alloc_default / alloc_gen): $(m_alloc)
-    Desired median allocation ratio: >= $alloc_threshold
-    """
-    return report
-end
-
-function evaluator_cuda(proposed_fn;
-                        err_threshold::Float64 = 1.0,
-                        runtime_threshold::Float64 = 1.1,
-                        alloc_threshold::Float64 = 0.0)
+function evaluator( proposed_fn;
+                    err_threshold::Float64 = 1.0,
+                    runtime_threshold::Float64 = 1.1,
+                    alloc_threshold::Float64 = 0.0)
 
     error_ratios  = Float64[]
     runtime_ratios = Float64[]
diff --git a/src/test_performance_dagger.jl b/src/test_performance_dagger.jl
new file mode 100644
index 0000000..36dd211
--- /dev/null
+++ b/src/test_performance_dagger.jl
@@ -0,0 +1,60 @@
+function evaluator( proposed_fn;
+                    err_threshold::Float64 = 1.0,
+                    runtime_threshold::Float64 = 1.1,
+                    alloc_threshold::Float64 = 0.0)
+    N = 2000 #10_000
+    error_ratios   = Float64[]
+    runtime_ratios = Float64[]
+    alloc_ratios   = Float64[]
+    for _ in 1:3
+
+        # SPD Matrix
+        A = randn(N, N)
+        A = A*A' + N*I
+
+        # Right-hand side
+        b = randn(N)
+
+        # SPD Matrix and right hand side on GPU (CUDA)
+        A_cuda = CuArray(A)
+        b_cuda = CuArray(b)
+
+        # SPD Matrix and right-hand side on GPU (Dagger Distributed)
+        A_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+            distribute(A, Blocks(N÷4, N÷4))
+        end
+        b_d = Dagger.with_options(scope=Dagger.scope(;cuda_gpu=1)) do
+            distribute(b, Blocks(N÷4))
+        end
+        
+        # --- Solve once to ensure kernels are compiled (warm-up) ---
+        x_default = cholesky(A_cuda) \ b_cuda
+        x_gen = Base.invokelatest(proposed_fn, A_d, b_d)
+        CUDA.synchronize()
+
+        # --- Error ratios (all on GPU, scalars on CPU) ---
+        err_default = norm(A_cuda * x_default - b_cuda)
+        err_gen     = norm(A_d * x_gen - b_d)
+        push!(error_ratios, err_default / err_gen)
+        # --- Runtime ratios (GPU) ---
+        b_default = @benchmark begin
+            cholesky($A_cuda) \ $b_cuda
+            CUDA.synchronize()
+        end
+        b_gen = @benchmark begin
+            Base.invokelatest($proposed_fn, $A_d, $b_d)
+        end
+        push!(runtime_ratios, median(b_default.times) / median(b_gen.times))
+        push!(alloc_ratios,   median(b_default.allocs) / median(b_gen.allocs))
+    end
+    m_err     = median(error_ratios)
+    m_runtime = median(runtime_ratios)
+    m_alloc   = median(alloc_ratios)
+    report = get_report(m_err, m_runtime, m_alloc,
+                        err_threshold, runtime_threshold, alloc_threshold)
+    println(report)
+    ok = (m_err     >= err_threshold)      &&  # 1.0 => no worse error
+         (m_runtime >= runtime_threshold)  &&  # 1.1 => at least 10% faster
+         (m_alloc   >= alloc_threshold)        # 0.0 => no alloc requirement
+    return ok, report
+end
\ No newline at end of file