From 09bf2c20ed34d91a51e469576c1cf86ca9777d9c Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Wed, 29 Apr 2026 18:51:17 +0200
Subject: [PATCH 1/2] dflash: split target/draft StepGraphs to fix gallocr
 realloc per spec-decode step
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue #55: every spec-decode iteration calls build_target_step_tree
(target verify, ~3127 graph nodes) and build_draft_step (draft forward,
~186 graph nodes) on the SAME StepGraph, sharing one ggml_gallocr.
ggml_gallocr_needs_realloc compares galloc->n_nodes to graph->n_nodes,
so every call sees a mismatch left over from the previous call's
opposite topology, forcing ggml_gallocr_reserve to re-walk the entire
graph (CPU cost) and often cudaFree+cudaMalloc the activation buffer
(GPU driver cost). Reporter on Windows/RTX 4090 sees the "graph has
different number of nodes" debug log fire every step and decode tok/s
dropping from 90 @ 16k context to 55 @ 32k context.

Fix: introduce target_sg and draft_sg, each with its own ggml_gallocr.
Target verify settles into the 3127-node graph topology, draft into the
186-node topology, and neither bounces. Existing prefill / target-verify
call sites keep their `sg` references via a StepGraph & sg = target_sg
alias; only the draft block (~10 calls) swaps `sg.X` for `draft_sg.X`.
Daemon-mode reset and migrate-cache sites destroy both StepGraphs.

Verified with one-line instrumentation patch on ggml_gallocr_alloc_graph
(unconditionally fprintf to stderr at each "needs_realloc returns true"
site, removing the #ifndef NDEBUG gate the upstream messages are
silenced by in Release builds). HE prompt 00 + ddtree-budget=22 +
n_gen=256 over 26 spec-decode steps:

Before: 56 needs_realloc events (alternating n_nodes 186 ↔ 3127),
        14 cudaFree+cudaMalloc events.
After:  3 needs_realloc events (initial only: 0 -> 3127, 0 -> 3079,
        0 -> 186), 0 cudaFree+cudaMalloc events during decode.

bench_he.py (RTX 3090, --n-gen 128, --ddtree-budget 22, 3-run mean):
  main:     86.72 tok/s
  this fix: 84.99 tok/s

Within bench-noise on Linux/CUDA 12.6 because cudaMalloc is cheap on
this stack — the saved per-step cost is small. The reporter's stack
(Windows/CUDA 13/RTX 4090) has a slower stream-allocator where the
saved cost should translate into measurable tok/s recovery; that needs
verification on the reporter's box.
---
 dflash/test/test_dflash.cpp | 42 ++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 022c5a7b..7b869cd9 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -1213,7 +1213,15 @@ int main(int argc, char ** argv) {
         std::fflush(stdout);
     }
 
-    StepGraph sg;
+    // Two StepGraphs: target verify (huge graph ~3000 nodes) and draft forward
+    // (small graph ~200 nodes) have very different topologies. Sharing one
+    // ggml_gallocr made every call to one path see needs_realloc=true after a
+    // call to the other (n_nodes mismatch), forcing a graph re-walk and often
+    // a cudaMalloc on every spec-decode iteration (issue #55). Splitting the
+    // gallocrs lets each settle into a steady state and stop reallocating.
+    StepGraph target_sg;
+    StepGraph draft_sg;
+    StepGraph & sg = target_sg; // alias for prefill / target-verify call sites
 
     bool daemon_first_iter = true;
     while (true) {
@@ -1229,7 +1237,8 @@ int main(int argc, char ** argv) {
             // Rebuild cache + step graph between requests so KV / SSM / conv /
             // target_feat ring start fresh. Weights stay resident.
             if (!daemon_first_iter) {
-                step_graph_destroy(sg);
+                step_graph_destroy(target_sg);
+                step_graph_destroy(draft_sg);
                 free_target_cache(cache);
                 if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache,
                                          /*prefill_only=*/true)) {
@@ -1413,7 +1422,8 @@ int main(int argc, char ** argv) {
             // Promote prefill-only cache to full decode cache
             auto t_mig0 = std::chrono::steady_clock::now();
-            step_graph_destroy(sg);
+            step_graph_destroy(target_sg);
+            step_graph_destroy(draft_sg);
             if (!migrate_prefill_cache(w, max_ctx, max_verify_tokens, backend, cache)) {
                 std::fprintf(stderr, "cache migration: %s\n", dflash27b_last_error());
                 return 1;
             }
@@ -1494,7 +1504,8 @@ int main(int argc, char ** argv) {
             // Promote prefill-only cache to full decode cache with rollback tensors.
             // Copies KV, SSM/conv state, and target_feat device→device (~1 ms).
             auto t_mig0 = std::chrono::steady_clock::now();
-            step_graph_destroy(sg);
+            step_graph_destroy(target_sg);
+            step_graph_destroy(draft_sg);
             if (!migrate_prefill_cache(w, max_ctx, max_verify_tokens, backend, cache)) {
                 std::fprintf(stderr, "cache migration: %s\n", dflash27b_last_error());
                 return 1;
             }
@@ -1552,14 +1563,16 @@ int main(int argc, char ** argv) {
             const int draft_ctx = std::min(committed, DRAFT_CTX_MAX);
             const int draft_start = committed - draft_ctx;
 
-            // 2) Draft forward
-            if (!build_draft_step(sg, dw, w, backend, /*ctx_len=*/draft_ctx)) {
+            // 2) Draft forward — uses draft_sg so its gallocr settles into a
+            // small-graph shape (≈200 nodes) instead of bouncing with the target
+            // verify path's huge-graph shape (≈3000 nodes) (issue #55).
+            if (!build_draft_step(draft_sg, dw, w, backend, /*ctx_len=*/draft_ctx)) {
                 std::fprintf(stderr, "draft build failed\n"); return 1;
             }
             auto T_draft_build = sync_us();
             tt_draft_build += std::chrono::duration(T_draft_build - T0).count();
 
-            ggml_backend_tensor_set(sg.inp_embed, noise_embed_buf.data(), 0,
+            ggml_backend_tensor_set(draft_sg.inp_embed, noise_embed_buf.data(), 0,
                                     sizeof(float) * noise_embed_buf.size());
 
             // target_hidden_cat: copy the draft-window slice of cache.target_feat
@@ -1576,13 +1589,13 @@ int main(int argc, char ** argv) {
             dflash27b_launch_bf16_to_f32(
                 (const char *)cache.target_feat->data + (size_t)slot0 * elt_feat * fc_in,
-                sg.target_hidden_cat->data,
+                draft_sg.target_hidden_cat->data,
                 (size_t)pre_n * fc_in, nullptr);
             if (post_n > 0) {
                 dflash27b_launch_bf16_to_f32(
                     (const char *)cache.target_feat->data,
-                    (char *)sg.target_hidden_cat->data + (size_t)pre_n * fc_in * sizeof(float),
+                    (char *)draft_sg.target_hidden_cat->data + (size_t)pre_n * fc_in * sizeof(float),
                     (size_t)post_n * fc_in, nullptr);
             }
@@ -1591,17 +1604,17 @@ int main(int argc, char ** argv) {
             for (int i = 0; i < q_len; i++) pos_q_buf[i] = draft_ctx + i;
             for (int i = 0; i < draft_ctx + q_len; i++) pos_k_buf[i] = i;
-            ggml_backend_tensor_set(sg.positions, pos_q_buf.data(), 0, sizeof(int32_t) * q_len);
-            ggml_backend_tensor_set(sg.positions_k, pos_k_buf.data(), 0, sizeof(int32_t) * (draft_ctx + q_len));
+            ggml_backend_tensor_set(draft_sg.positions, pos_q_buf.data(), 0, sizeof(int32_t) * q_len);
+            ggml_backend_tensor_set(draft_sg.positions_k, pos_k_buf.data(), 0, sizeof(int32_t) * (draft_ctx + q_len));
             auto T_draft_set = sync_us();
             tt_draft_set += std::chrono::duration(T_draft_set - T_draft_copy).count();
 
-            auto st = ggml_backend_graph_compute(backend, sg.gf);
+            auto st = ggml_backend_graph_compute(backend, draft_sg.gf);
             if (st != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "draft compute %d\n", (int)st); return 1; }
             auto T_draft_compute = sync_us();
             tt_draft_compute += std::chrono::duration(T_draft_compute - T_draft_set).count();
 
-            ggml_backend_tensor_get(sg.logits, draft_logits_buf.data(), 0,
+            ggml_backend_tensor_get(draft_sg.logits, draft_logits_buf.data(), 0,
                                     sizeof(float) * vocab * q_len);
             for (int i = 0; i < q_len; i++) {
                 draft_tok[i] = argmax_f32(draft_logits_buf.data() + (size_t)i * vocab, vocab);
             }
@@ -2363,7 +2376,8 @@ int main(int argc, char ** argv) {
     } // end while(true)
 
-    step_graph_destroy(sg);
+    step_graph_destroy(target_sg);
+    step_graph_destroy(draft_sg);
     free_target_cache(cache);
     free_draft_weights(dw);
     free_target_weights(w);

From c69740dbd6b7641b97eaed31ce90ed1085c71ffe Mon Sep 17 00:00:00 2001
From: dusterbloom <32869278+dusterbloom@users.noreply.github.com>
Date: Thu, 30 Apr 2026 15:07:22 +0200
Subject: [PATCH 2/2] dflash: preserve daemon cache reuse with split step
 graphs

---
 dflash/src/internal.h                         |  5 +++
 dflash/src/qwen35_target_graph.cpp            | 25 +++++++++++++++
 .../test_daemon_reset_merge_resolution.py     | 31 +++++++++++++++++++
 dflash/test/test_dflash.cpp                   | 20 ++++++------
 4 files changed, 70 insertions(+), 11 deletions(-)
 create mode 100644 dflash/test/test_daemon_reset_merge_resolution.py

diff --git a/dflash/src/internal.h b/dflash/src/internal.h
index ce2fe381..fa77b0f6 100644
--- a/dflash/src/internal.h
+++ b/dflash/src/internal.h
@@ -255,6 +255,11 @@ bool create_target_cache(const TargetWeights & w,
 
 void free_target_cache(TargetCache & c);
 
+// Zero all state tensors (KV, SSM, conv, target_feat, rollback) in place
+// without freeing/reallocating GPU buffers. Used by daemon mode between
+// requests to avoid the ~5 s overhead of full cache destruction + recreation.
+void reset_target_cache(TargetCache & c);
+
 // Reallocate a prefill-only cache with full rollback tensors, copying all live
 // state (KV, SSM, conv, target_feat) device-to-device. Frees the old cache.
 bool migrate_prefill_cache(const TargetWeights & w,
diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp
index 25ff39c1..98e09d58 100644
--- a/dflash/src/qwen35_target_graph.cpp
+++ b/dflash/src/qwen35_target_graph.cpp
@@ -245,14 +245,39 @@ void free_target_cache(TargetCache & c) {
     c.cur_pos = 0;
 }
 
+void reset_target_cache(TargetCache & c) {
+    c.cur_pos = 0;
+    std::vector<uint8_t> zeros(1 * 1024 * 1024, 0);
+    ggml_context * ctx_list[] = { c.base_ctx, c.rollback_ctx };
+    for (int ci = 0; ci < 2; ci++) {
+        ggml_context * ctx = ctx_list[ci];
+        if (!ctx) continue;
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr;
+             t = ggml_get_next_tensor(ctx, t)) {
+            size_t nb = ggml_nbytes(t);
+            size_t off = 0;
+            while (off < nb) {
+                size_t chunk = std::min(nb - off, zeros.size());
+                ggml_backend_tensor_set(t, zeros.data(), off, chunk);
+                off += chunk;
+            }
+        }
+    }
+}
+
 // Attach rollback tensors to an existing prefill cache without touching the
 // base tensors (KV, SSM, conv, target_feat) that prefill already populated.
 // No D2D copies — the base tensors stay right where the graph wrote them.
+// If rollback tensors are already present (e.g. daemon mode second request),
+// this is a no-op.
 bool migrate_prefill_cache(const TargetWeights & w,
                            int max_ctx,
                            int max_verify_tokens,
                            ggml_backend_t backend,
                            TargetCache & cache) {
+    // Already migrated (e.g. daemon mode second+ request after reset_target_cache).
+    if (cache.rollback_ctx) return true;
+
     const int n_delta = (int)cache.ssm_state.size(); // 48
     if (max_verify_tokens <= 0) {
         max_verify_tokens = DFLASH27B_DRAFT_BLOCK_SIZE;
diff --git a/dflash/test/test_daemon_reset_merge_resolution.py b/dflash/test/test_daemon_reset_merge_resolution.py
new file mode 100644
index 00000000..4f35cc62
--- /dev/null
+++ b/dflash/test/test_daemon_reset_merge_resolution.py
@@ -0,0 +1,31 @@
+import re
+import unittest
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+SOURCE = ROOT / "test" / "test_dflash.cpp"
+
+
+class DaemonResetMergeResolutionTest(unittest.TestCase):
+    def test_daemon_reset_reuses_cache_and_frees_both_transient_graphs(self):
+        source = SOURCE.read_text()
+        match = re.search(
+            r"if \(!daemon_first_iter\) \{\n(?P<body>.*?)\n\s+\}\n\s+daemon_first_iter = false;",
+            source,
+            re.S,
+        )
+        self.assertIsNotNone(match, "daemon reset block not found")
+
+        body = match.group("body")
+        self.assertIn("step_graph_free(target_sg);", body)
+        self.assertIn("step_graph_free(draft_sg);", body)
+        self.assertIn("reset_target_cache(cache);", body)
+        self.assertNotIn("step_graph_destroy(target_sg);", body)
+        self.assertNotIn("step_graph_destroy(draft_sg);", body)
+        self.assertNotIn("free_target_cache(cache);", body)
+        self.assertNotIn("create_target_cache(", body)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 7b869cd9..66957fa7 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -1234,18 +1234,16 @@ int main(int argc, char ** argv) {
             prompt_file_str = ppath;
             prompt_path = prompt_file_str.c_str();
 
-            // Rebuild cache + step graph between requests so KV / SSM / conv /
-            // target_feat ring start fresh. Weights stay resident.
+            // Reset cache state between requests. On the first request the
+            // cache was promoted from prefill-only to full (with rollback
+            // tensors) by migrate_prefill_cache. On subsequent requests we
+            // just zero all state tensors in place and drop transient graph
+            // descriptors for both target/draft graphs; persistent gallocr
+            // buffers stay resident.
             if (!daemon_first_iter) {
-                step_graph_destroy(target_sg);
-                step_graph_destroy(draft_sg);
-                free_target_cache(cache);
-                if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache,
-                                         /*prefill_only=*/true)) {
-                    std::fprintf(stderr, "cache realloc: %s\n", dflash27b_last_error());
-                    stream_emit(-1);
-                    continue;
-                }
+                step_graph_free(target_sg);
+                step_graph_free(draft_sg);
+                reset_target_cache(cache);
             }
             daemon_first_iter = false;
         }
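
Note on the allocator behaviour both patches lean on: ggml_gallocr_alloc_graph() runs
the needs_realloc check described in the PATCH 1/2 message (galloc->n_nodes vs
graph->n_nodes), so an allocator that alternately sees the ~3127-node verify graph and
the ~186-node draft graph re-reserves on every call, while an allocator that always sees
the same topology reserves once and then reuses its buffer. A minimal sketch of that
per-topology ownership, using illustrative names (StepGraphSketch, step_graph_alloc)
rather than the actual StepGraph layout in test_dflash.cpp:

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Illustrative sketch, not the real StepGraph from test_dflash.cpp:
    // one graph topology owns one gallocr, so the node count the gallocr
    // reserved for keeps matching the graph it is asked to allocate.
    struct StepGraphSketch {
        ggml_gallocr_t galloc = nullptr; // per-topology activation allocator
        ggml_cgraph *  gf     = nullptr; // rebuilt each step with the same shape
    };

    static bool step_graph_alloc(StepGraphSketch & g, ggml_backend_t backend) {
        if (!g.galloc) {
            g.galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
        }
        // First call reserves the activation buffer; later calls with an
        // unchanged topology skip the re-reserve (and the cudaMalloc behind it).
        return ggml_gallocr_alloc_graph(g.galloc, g.gf);
    }

In this sketch, giving target_sg and draft_sg their own StepGraphSketch each is what
keeps the reserved node count matching from one spec-decode step to the next.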