From 09bf2c20ed34d91a51e469576c1cf86ca9777d9c Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 29 Apr 2026 18:51:17 +0200
Subject: [PATCH 1/2] dflash: split target/draft StepGraphs to fix gallocr
 realloc per spec-decode step
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #55: every spec-decode iteration calls build_target_step_tree
(target verify, ~3127 graph nodes) and build_draft_step (draft forward,
~186 graph nodes) on the SAME StepGraph, sharing one ggml_gallocr.
ggml_gallocr_needs_realloc compares galloc->n_nodes to graph->n_nodes,
so every call sees a mismatch left over from the previous call's
opposite topology, forcing ggml_gallocr_reserve to re-walk the entire
graph (CPU cost) and often cudaFree+cudaMalloc the activation buffer
(GPU driver cost). Reporter on Windows/RTX 4090 sees the "graph has
different number of nodes" debug log fire every step and decode tok/s
dropping from 90 @ 16k context to 55 @ 32k context.

Fix: introduce target_sg and draft_sg, each with its own ggml_gallocr.
Target verify settles into the 3127-node graph topology, draft into the
186-node topology, and neither bounces. Existing prefill / target-verify
call sites keep their `sg` references via a StepGraph & sg = target_sg
alias; only the draft block (~10 calls) swaps `sg.X` for `draft_sg.X`.
Daemon-mode reset and migrate-cache sites destroy both StepGraphs.

Verified with one-line instrumentation patch on ggml_gallocr_alloc_graph
(unconditionally fprintf to stderr at each "needs_realloc returns true"
site, removing the #ifndef NDEBUG gate the upstream messages are
silenced by in Release builds). HE prompt 00 + ddtree-budget=22 +
n_gen=256 over 26 spec-decode steps:

Before: 56 needs_realloc events (alternating n_nodes 186 ↔ 3127),
        14 cudaFree+cudaMalloc events.
After:  3 needs_realloc events (initial only: 0 -> 3127, 0 -> 3079,
        0 -> 186), 0 cudaFree+cudaMalloc events during decode.

bench_he.py (RTX 3090, --n-gen 128, --ddtree-budget 22, 3-run mean):
  main:     86.72 tok/s
  this fix: 84.99 tok/s

Within bench-noise on Linux/CUDA 12.6 because cudaMalloc is cheap on
this stack — the saved per-step cost is small. The reporter's stack
(Windows/CUDA 13/RTX 4090) has a slower stream-allocator where the
saved cost should translate into measurable tok/s recovery; that needs
verification on the reporter's box.
---
 dflash/test/test_dflash.cpp | 42 ++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 022c5a7b..7b869cd9 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -1213,7 +1213,15 @@ int main(int argc, char ** argv) {
         std::fflush(stdout);
     }
 
-    StepGraph sg;
+    // Two StepGraphs: target verify (huge graph ~3000 nodes) and draft forward
+    // (small graph ~200 nodes) have very different topologies. Sharing one
+    // ggml_gallocr made every call to one path see needs_realloc=true after a
+    // call to the other (n_nodes mismatch), forcing a graph re-walk and often
+    // a cudaMalloc on every spec-decode iteration (issue #55). Splitting the
+    // gallocrs lets each settle into a steady state and stop reallocating.
+    StepGraph target_sg;
+    StepGraph draft_sg;
+    StepGraph & sg = target_sg; // alias for prefill / target-verify call sites
 
     bool daemon_first_iter = true;
     while (true) {
@@ -1229,7 +1237,8 @@ int main(int argc, char ** argv) {
             // Rebuild cache + step graph between requests so KV / SSM / conv /
             // target_feat ring start fresh. Weights stay resident.
             if (!daemon_first_iter) {
-                step_graph_destroy(sg);
+                step_graph_destroy(target_sg);
+                step_graph_destroy(draft_sg);
                 free_target_cache(cache);
                 if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache,
                                          /*prefill_only=*/true)) {
@@ -1413,7 +1422,8 @@ int main(int argc, char ** argv) {
             // Promote prefill-only cache to full decode cache
             auto t_mig0 = std::chrono::steady_clock::now();
-            step_graph_destroy(sg);
+            step_graph_destroy(target_sg);
+            step_graph_destroy(draft_sg);
             if (!migrate_prefill_cache(w, max_ctx, max_verify_tokens, backend, cache)) {
                 std::fprintf(stderr, "cache migration: %s\n", dflash27b_last_error());
                 return 1;
             }
@@ -1494,7 +1504,8 @@ int main(int argc, char ** argv) {
             // Promote prefill-only cache to full decode cache with rollback tensors.
             // Copies KV, SSM/conv state, and target_feat device→device (~1 ms).
             auto t_mig0 = std::chrono::steady_clock::now();
-            step_graph_destroy(sg);
+            step_graph_destroy(target_sg);
+            step_graph_destroy(draft_sg);
             if (!migrate_prefill_cache(w, max_ctx, max_verify_tokens, backend, cache)) {
                 std::fprintf(stderr, "cache migration: %s\n", dflash27b_last_error());
                 return 1;
             }
@@ -1552,14 +1563,16 @@ int main(int argc, char ** argv) {
             const int draft_ctx = std::min(committed, DRAFT_CTX_MAX);
             const int draft_start = committed - draft_ctx;
 
-            // 2) Draft forward
-            if (!build_draft_step(sg, dw, w, backend, /*ctx_len=*/draft_ctx)) {
+            // 2) Draft forward — uses draft_sg so its gallocr settles into a
+            // small-graph shape (≈200 nodes) instead of bouncing with the target
+            // verify path's huge-graph shape (≈3000 nodes) (issue #55).
+            if (!build_draft_step(draft_sg, dw, w, backend, /*ctx_len=*/draft_ctx)) {
                 std::fprintf(stderr, "draft build failed\n"); return 1;
             }
             auto T_draft_build = sync_us();
             tt_draft_build += std::chrono::duration(T_draft_build - T0).count();
 
-            ggml_backend_tensor_set(sg.inp_embed, noise_embed_buf.data(), 0,
+            ggml_backend_tensor_set(draft_sg.inp_embed, noise_embed_buf.data(), 0,
                                     sizeof(float) * noise_embed_buf.size());
 
             // target_hidden_cat: copy the draft-window slice of cache.target_feat
@@ -1576,13 +1589,13 @@ int main(int argc, char ** argv) {
             dflash27b_launch_bf16_to_f32(
                 (const char *)cache.target_feat->data + (size_t)slot0 * elt_feat * fc_in,
-                sg.target_hidden_cat->data,
+                draft_sg.target_hidden_cat->data,
                 (size_t)pre_n * fc_in, nullptr);
             if (post_n > 0) {
                 dflash27b_launch_bf16_to_f32(
                     (const char *)cache.target_feat->data,
-                    (char *)sg.target_hidden_cat->data + (size_t)pre_n * fc_in * sizeof(float),
+                    (char *)draft_sg.target_hidden_cat->data + (size_t)pre_n * fc_in * sizeof(float),
                     (size_t)post_n * fc_in, nullptr);
             }
@@ -1591,17 +1604,17 @@ int main(int argc, char ** argv) {
             for (int i = 0; i < q_len; i++) pos_q_buf[i] = draft_ctx + i;
             for (int i = 0; i < draft_ctx + q_len; i++) pos_k_buf[i] = i;
-            ggml_backend_tensor_set(sg.positions, pos_q_buf.data(), 0, sizeof(int32_t) * q_len);
-            ggml_backend_tensor_set(sg.positions_k, pos_k_buf.data(), 0, sizeof(int32_t) * (draft_ctx + q_len));
+            ggml_backend_tensor_set(draft_sg.positions, pos_q_buf.data(), 0, sizeof(int32_t) * q_len);
+            ggml_backend_tensor_set(draft_sg.positions_k, pos_k_buf.data(), 0, sizeof(int32_t) * (draft_ctx + q_len));
             auto T_draft_set = sync_us();
             tt_draft_set += std::chrono::duration(T_draft_set - T_draft_copy).count();
 
-            auto st = ggml_backend_graph_compute(backend, sg.gf);
+            auto st = ggml_backend_graph_compute(backend, draft_sg.gf);
             if (st != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "draft compute %d\n", (int)st); return 1; }
             auto T_draft_compute = sync_us();
             tt_draft_compute += std::chrono::duration(T_draft_compute - T_draft_set).count();
 
-            ggml_backend_tensor_get(sg.logits, draft_logits_buf.data(), 0,
+            ggml_backend_tensor_get(draft_sg.logits, draft_logits_buf.data(), 0,
                                     sizeof(float) * vocab * q_len);
             for (int i = 0; i < q_len; i++) {
                 draft_tok[i] = argmax_f32(draft_logits_buf.data() + (size_t)i * vocab, vocab);
             }
@@ -2363,7 +2376,8 @@ int main(int argc, char ** argv) {
     } // end while(true)
 
-    step_graph_destroy(sg);
+    step_graph_destroy(target_sg);
+    step_graph_destroy(draft_sg);
     free_target_cache(cache);
     free_draft_weights(dw);
     free_target_weights(w);

From c69740dbd6b7641b97eaed31ce90ed1085c71ffe Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 30 Apr 2026 15:07:22 +0200
Subject: [PATCH 2/2] dflash: preserve daemon cache reuse with split step
 graphs

---
 dflash/src/internal.h                         |  5 +++
 dflash/src/qwen35_target_graph.cpp            | 25 +++++++++++++++
 .../test_daemon_reset_merge_resolution.py     | 31 +++++++++++++++++++
 dflash/test/test_dflash.cpp                   | 20 ++++++------
 4 files changed, 70 insertions(+), 11 deletions(-)
 create mode 100644 dflash/test/test_daemon_reset_merge_resolution.py

diff --git a/dflash/src/internal.h b/dflash/src/internal.h
index ce2fe381..fa77b0f6 100644
--- a/dflash/src/internal.h
+++ b/dflash/src/internal.h
@@ -255,6 +255,11 @@ bool create_target_cache(const TargetWeights & w,
 
 void free_target_cache(TargetCache & c);
 
+// Zero all state tensors (KV, SSM, conv, target_feat, rollback) in place
+// without freeing/reallocating GPU buffers. Used by daemon mode between
+// requests to avoid the ~5 s overhead of full cache destruction + recreation.
+void reset_target_cache(TargetCache & c);
+
 // Reallocate a prefill-only cache with full rollback tensors, copying all live
 // state (KV, SSM, conv, target_feat) device-to-device. Frees the old cache.
 bool migrate_prefill_cache(const TargetWeights & w,
diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp
index 25ff39c1..98e09d58 100644
--- a/dflash/src/qwen35_target_graph.cpp
+++ b/dflash/src/qwen35_target_graph.cpp
@@ -245,14 +245,39 @@ void free_target_cache(TargetCache & c) {
     c.cur_pos = 0;
 }
 
+void reset_target_cache(TargetCache & c) {
+    c.cur_pos = 0;
+    std::vector<uint8_t> zeros(1 * 1024 * 1024, 0);
+    ggml_context * ctx_list[] = { c.base_ctx, c.rollback_ctx };
+    for (int ci = 0; ci < 2; ci++) {
+        ggml_context * ctx = ctx_list[ci];
+        if (!ctx) continue;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr;
+             t = ggml_get_next_tensor(ctx, t)) {
+            size_t nb = ggml_nbytes(t);
+            size_t off = 0;
+            while (off < nb) {
+                size_t chunk = std::min(nb - off, zeros.size());
+                ggml_backend_tensor_set(t, zeros.data(), off, chunk);
+                off += chunk;
+            }
+        }
+    }
+}
+
 // Attach rollback tensors to an existing prefill cache without touching the
 // base tensors (KV, SSM, conv, target_feat) that prefill already populated.
 // No D2D copies — the base tensors stay right where the graph wrote them.
+// If rollback tensors are already present (e.g. daemon mode second request),
+// this is a no-op.
 bool migrate_prefill_cache(const TargetWeights & w,
                            int max_ctx,
                            int max_verify_tokens,
                            ggml_backend_t backend,
                            TargetCache & cache) {
+    // Already migrated (e.g. daemon mode second+ request after reset_target_cache).
+    if (cache.rollback_ctx) return true;
+
     const int n_delta = (int)cache.ssm_state.size(); // 48
     if (max_verify_tokens <= 0) {
         max_verify_tokens = DFLASH27B_DRAFT_BLOCK_SIZE;
diff --git a/dflash/test/test_daemon_reset_merge_resolution.py b/dflash/test/test_daemon_reset_merge_resolution.py
new file mode 100644
index 00000000..4f35cc62
--- /dev/null
+++ b/dflash/test/test_daemon_reset_merge_resolution.py
@@ -0,0 +1,31 @@
+import re
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SOURCE = ROOT / "test" / "test_dflash.cpp"
+
+
+class DaemonResetMergeResolutionTest(unittest.TestCase):
+    def test_daemon_reset_reuses_cache_and_frees_both_transient_graphs(self):
+        source = SOURCE.read_text()
+        match = re.search(
+            r"if \(!daemon_first_iter\) \{\n(?P<body>.*?)\n\s+\}\n\s+daemon_first_iter = false;",
+            source,
+            re.S,
+        )
+        self.assertIsNotNone(match, "daemon reset block not found")
+
+        body = match.group("body")
+        self.assertIn("step_graph_free(target_sg);", body)
+        self.assertIn("step_graph_free(draft_sg);", body)
+        self.assertIn("reset_target_cache(cache);", body)
+        self.assertNotIn("step_graph_destroy(target_sg);", body)
+        self.assertNotIn("step_graph_destroy(draft_sg);", body)
+        self.assertNotIn("free_target_cache(cache);", body)
+        self.assertNotIn("create_target_cache(", body)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 7b869cd9..66957fa7 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -1234,18 +1234,16 @@ int main(int argc, char ** argv) {
             prompt_file_str = ppath;
             prompt_path = prompt_file_str.c_str();
 
-            // Rebuild cache + step graph between requests so KV / SSM / conv /
-            // target_feat ring start fresh. Weights stay resident.
+            // Reset cache state between requests. On the first request the
+            // cache was promoted from prefill-only to full (with rollback
+            // tensors) by migrate_prefill_cache. On subsequent requests we
+            // just zero all state tensors in place and drop transient graph
+            // descriptors for both target/draft graphs; persistent gallocr
+            // buffers stay resident.
             if (!daemon_first_iter) {
-                step_graph_destroy(target_sg);
-                step_graph_destroy(draft_sg);
-                free_target_cache(cache);
-                if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache,
-                                         /*prefill_only=*/true)) {
-                    std::fprintf(stderr, "cache realloc: %s\n", dflash27b_last_error());
-                    stream_emit(-1);
-                    continue;
-                }
+                step_graph_free(target_sg);
+                step_graph_free(draft_sg);
+                reset_target_cache(cache);
             }
             daemon_first_iter = false;
         }
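
Note on the allocator behaviour both patches lean on: ggml_gallocr_alloc_graph() runs
the needs_realloc check described in the PATCH 1/2 message (galloc->n_nodes vs
graph->n_nodes), so an allocator that alternately sees the ~3127-node verify graph and
the ~186-node draft graph re-reserves on every call, while an allocator that always sees
the same topology reserves once and then reuses its buffer. A minimal sketch of that
per-topology ownership, using illustrative names (StepGraphSketch, step_graph_alloc)
rather than the actual StepGraph layout in test_dflash.cpp:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Illustrative sketch, not the real StepGraph from test_dflash.cpp:
    // one graph topology owns one gallocr, so the node count the gallocr
    // reserved for keeps matching the graph it is asked to allocate.
    struct StepGraphSketch {
        ggml_gallocr_t galloc = nullptr; // per-topology activation allocator
        ggml_cgraph *  gf     = nullptr; // rebuilt each step with the same shape
    };

    static bool step_graph_alloc(StepGraphSketch & g, ggml_backend_t backend) {
        if (!g.galloc) {
            g.galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
        }
        // First call reserves the activation buffer; later calls with an
        // unchanged topology skip the re-reserve (and the cudaMalloc behind it).
        return ggml_gallocr_alloc_graph(g.galloc, g.gf);
    }

In this sketch, giving target_sg and draft_sg their own StepGraphSketch each is what
keeps the reserved node count matching from one spec-decode step to the next.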