# Gemma4 support: pFlash + DFlash + chunked prefill, daemon mode, server routing (#131)
**Open** — dusterbloom wants to merge 41 commits into `Luce-Org:main` from `dusterbloom:feature/gemma4-support` (base: `main`).

## Commits (41)
All 41 commits are authored by dusterbloom:

- `4fe62b5` feat: add Gemma4 target + draft model support (26B-A4B MoE & 31B dense)
- `85a1196` test: add Gemma4 TDD smoke tests (all GREEN)
- `978ca01` fix: use correct capture_layer_ids from DFlash draft config.json
- `3335ee2` feat: implement draft KV cache for Gemma4 DFlash speculative decoding
- `7ce68ac` perf: chunked batched prefill for Gemma4 target (12-16x speedup)
- `1ef975f` refactor: update tokenize_prompt.py for Gemma4 with CSV output mode
- `133017d` feat: add Q8_0 quantization script for Gemma4 DFlash draft model
- `1386690` fix: correct Gemma4 DFlash decode — BOS, EOS, and SWA mask
- `33b6e9d` feat: add GGUF draft loader for Gemma4 DFlash + parameterize quantize…
- `d2a2c04` feat: implement Gemma4 pFlash prefill — layer-by-layer block-sparse a…
- `9588c97` fix: add pFlash CLI flags, --tokens-file, and prevent draft KV overflow
- `c15f93a` refactor: remove standalone ggml_turbo_wht calls — rotation now fused…
- `f2c36bc` feat: SWA-aware KV cache allocation with ring-buffer for 64K+ context
- `f2261bd` fix: draft KV ring-buffer wrap instead of crash on overflow
- `333f4e0` perf: hybrid pFlash prefill — batched SWA groups + GRAPH_CHUNK=32K
- `488190e` feat: wire pFlash into Gemma4 chunked prefill via ggml_flash_attn_sparse
- `1017dac` feat: gate pFlash dispatch on supported KV types + buffer-NULL guards
- `5b6ba1b` fix: SWA mask coordinate frame — chunks 2+ were silently corrupted
- `9097311` feat: add daemon mode to test_gemma4_dflash + server.py routing
- `8fa5cd0` chore: point submodule to dusterbloom fork on feature/tq3-kv-cache
- `5fb516d` fix: address 11 P2 review violations + draft KV rolling window
- `3b4a4cb` Merge remote-tracking branch 'origin/main' into feature/gemma4-support
- `8ff5c77` chore: bump submodule for S-buffer probe instrumentation
- `19def9c` fix(gemma4): disable SWA ring opt + add 256-align snap for multi-chunk
- `d68e7c4` feat(gemma4): non-monotonic SWA ring restores VRAM savings
- `ce4da35` feat(gemma4): narrow asymmetric KV (TQ3 → Q8 on captured full-attn)
- `2cb6ec6` chore: bump submodule for TQ3 → f16 dequant + MMA fast path
- `cf76b73` fix(test): auto-prefer Q8 GGUF drafter over BF16 safetensors
- `7eea84b` feat(test): expose --draft-max and --ignore-eos for DFlash dTree tuning
- `1115064` feat(mtp): Phase 2 — load_gemma4_mtp_assistant() loader + 7-assertion…
- `d4659ca` feat(mtp): Phase 3a — build_mtp_step_graph() + 6-assertion shape test
- `05e36e4` feat(mtp): Phase 3b — wire --draft-method {none,dflash,mtp}, byte-ide…
- `138de4d` fix(mtp): h_prev capture site, assistant rope_freqs, KQ scale = 1.0
- `30b2b50` fix(mtp): GQA block-broadcast + KQ mask + SWA-aware KV wrap
- `c56879c` fix(mtp): preserve TQ3_0 into FA + 256-pad K view + shared mask acros…
- `7b62c07` fix(gemma4): allocate+fill SWA mask for n_tokens==1 decode + bump lla…
- `f1f811e` fix(mtp): always provide FA mask for head_dim>=512 (any K type)
- `323e0f4` docs(bench): gemma4 context-scaling plan + prompt corpus + reproducib…
- `b441587` docs(gemma4): debugging journey blog — three fixes, prompt-distributi…
- `e65eefb` docs(gemma4): amend journey blog with corrected pflash + dense ladder
- `bf8653e` docs(bench): scientific harness — 24-cell dense×MoE × code×creative ×…
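Commits `f2c36bc` and `f2261bd` replace a hard crash on draft-KV overflow with a ring buffer: positions past the cache capacity wrap around and overwrite the oldest slots, keeping a rolling window live. A rough illustration of that position-to-slot mapping (class and method names here are hypothetical, not from this PR); with cap=2096 over a 49,904-token prefill it is consistent with the "skipped 47808 early tokens" line in the benchmark logs further down:

```python
# Hypothetical sketch of a ring-buffer draft KV slot mapper: instead of
# crashing when the cache overflows its capacity, positions wrap and
# overwrite the oldest slots (a rolling window).
class DraftKVRing:
    def __init__(self, capacity: int):
        self.capacity = capacity

    def slot(self, pos: int) -> int:
        # Logical position -> physical slot; wraps instead of overflowing.
        return pos % self.capacity

    def live_window(self, pos: int) -> range:
        # Positions still resident once we have written up to `pos`.
        start = max(0, pos + 1 - self.capacity)
        return range(start, pos + 1)

ring = DraftKVRing(capacity=2096)
print(ring.slot(2095))  # last slot before the wrap: 2095
print(ring.slot(2096))  # wraps back to slot 0
```

With a 49,904-token prefill (final position 49903) this window keeps the last 2096 positions and skips the first 47,808 — matching the `[draft] KV prefill done` log line below.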
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```diff
@@ -1,7 +1,7 @@
 [submodule "dflash/deps/llama.cpp"]
 	path = dflash/deps/llama.cpp
-	url = https://github.com/Luce-Org/llama.cpp-dflash-ggml.git
-	branch = luce-dflash
+	url = https://github.com/dusterbloom/llama-cpp-turboquant-cuda.git
+	branch = feature/tq3-kv-cache
 [submodule "dflash/deps/Block-Sparse-Attention"]
 	path = dflash/deps/Block-Sparse-Attention
 	url = https://github.com/mit-han-lab/Block-Sparse-Attention.git
```
(One large diff is not rendered.)
New file (+57 lines):

# Matrix v2 at 64k — all fixes in. 2026-05-09T23:48:10+02:00

=== V1_none starting at 23:48:10 ===
V1_none rc=0
=== V2_mtp starting at 23:50:30 ===
V2_mtp rc=0
=== V3_dflash_dm8 starting at 23:52:54 ===
V3_dflash_dm8 rc=0

## Per-cell stats

### V1_none
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
[prefill] 49904 tokens in 85278.6 ms (585.2 tok/s) [chunked+pflash, chunk_size=1024] (last sampled token: 100)
[stats] generated=256 decode_ms=37108.4 tok/s=6.90 first_tok_ms=145.88
[stats] prefill=49904 tokens context_used=50160/65536
[mem] VRAM used=21.25 GB total=24.00 GB
```

### V2_mtp
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
[prefill] 49904 tokens in 85189.9 ms (585.8 tok/s) [chunked+pflash, chunk_size=1024] (last sampled token: 100)
[mtp] steps=256 accepted=5 accept_rate=0.02
[stats] generated=256 decode_ms=40432.7 tok/s=6.33 first_tok_ms=164.94
[stats] prefill=49904 tokens context_used=50160/65536
[mem] VRAM used=21.70 GB total=24.00 GB
```

### V3_dflash_dm8
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
[draft] KV cache allocated: 2096 slots
[prefill] 49904 tokens in 85184.4 ms (585.8 tok/s) [chunked+pflash, chunk_size=1024] (last sampled token: 100)
[draft] KV prefill done: 2096 positions materialized (skipped 47808 early tokens, cap=2096)
[stats] generated=256 decode_ms=27753.8 tok/s=9.22 first_tok_ms=257.06
[stats] prefill=49904 tokens context_used=50160/65536
[spec] draft_steps=112 total_accepted=256 avg_accept=2.29
[mem] VRAM used=23.59 GB total=24.00 GB
```

## Decoded text comparison (first 80 generated tokens)

### V1_none
first_80_decoded: 'swe relentless<unused0>os<bos><unused94><pad><unk><unused6>ock<bos><blockquote><unused0>8<unused6>ublic<unused63>thought<unused94>thought\n<unused95>### Summary of Themes and Characters\n\nThis text consists of several fragmented scenes (likely from a play or a series of dramatic sketches) focusing on the political instability of Rome and the personal conflicts of its leaders.\n\n#### **Major Themes**\n\n* **Pride vs. Humility:** The'

### V2_mtp
first_80_decoded: 'swe absorber<unused3>os<unused2><unused94><unused94>thought\n<unused95>### Summary of Themes and Characters\n\nThe provided text consists of several fragmented scenes (likely from a composite or modified version of Shakespearean-style plays, including elements of *Coriolanus* and *Richard III*). The narrative focuses on the intersection of military glory, political instability, and the volatility of public favor.\n\n#### Major Themes\n\n'

### V3_dflash_dm8
first_80_decoded: 'swe Bras<mask>os<unused2><unused94><unused94>thought\n<unused95>### Summary of Themes and Characters\n\nThe provided text is a fragmented collection of scenes (likely from a composite or modified version of Shakespearean-style plays, blending elements of *Coriolanus* and *Richard III*). It depicts a world of political instability, violent ambition, and the volatile relationship between the ruling elite and the common people.'

DONE
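The `[spec]` and `[stats]` summary lines above follow directly from two ratios: average acceptance is total accepted tokens over draft steps, and decode throughput is generated tokens over decode wall time. A small check against the V1/V3 cells (numbers taken from the logs above):

```python
# Reproduce the summary arithmetic from the Matrix v2 cells above.
def avg_accept(total_accepted: int, draft_steps: int) -> float:
    # Accepted target tokens per speculative draft step.
    return round(total_accepted / draft_steps, 2)

def decode_tok_s(generated: int, decode_ms: float) -> float:
    # Decode-phase throughput in tokens per second.
    return round(generated / (decode_ms / 1000.0), 2)

print(avg_accept(256, 112))        # 2.29, as in V3's [spec] line
print(decode_tok_s(256, 27753.8))  # 9.22 tok/s for V3_dflash_dm8
print(decode_tok_s(256, 37108.4))  # 6.9 tok/s for V1_none
```

The ~2.29 tokens accepted per draft step is what lifts V3's decode rate to 9.22 tok/s against V1's 6.90 baseline, at the cost of the extra draft KV VRAM.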
New file (+71 lines):

# 64k drafter A/B with TQ3 + pFlash (dense 31B) — 2026-05-09T23:05:51+02:00
Prompt: long_50k.txt (~50k tokens), ctx=65536, n_predict=256

=== T1_none ===
T1_none rc=0
=== T2_mtp ===
T2_mtp rc=0
=== T3_dflash ===
T3_dflash rc=143

## Per-cell stats

### T1_none
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
[prefill] 49904 tokens in 87859.2 ms (568.0 tok/s) [chunked+pflash, chunk_size=1024] (last sampled token: 100)
[stats] generated=256 decode_ms=37952.4 tok/s=6.75 first_tok_ms=150.63
[stats] prefill=49904 tokens context_used=50160/65536
[mem] VRAM used=21.40 GB total=24.00 GB
```

### T2_mtp
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
[prefill] 49904 tokens in 87919.8 ms (567.6 tok/s) [chunked+pflash, chunk_size=1024] (last sampled token: 100)
[mtp-step 8] accept_rate=0.00
532 81179 108 818 3847 1816 10594 529 [mtp-step 16] accept_rate=0.00
3131 89144 18583 568 19609 699 496 22907 [mtp-step 24] accept_rate=0.00
653 12269 3567 529 36951 508 236772 3061 [mtp-step 32] accept_rate=0.00
10772 236764 2440 4820 529 808 236780 6886 [mtp-step 40] accept_rate=0.03
40707 605 236829 532 808 40421 8488 236829 [mtp-step 48] accept_rate=0.02
769 669 22323 21132 580 506 18074 529 [mtp-step 56] accept_rate=0.02
7820 27877 236764 5255 32202 236764 532 506 [mtp-step 64] accept_rate=0.05
43866 529 1237 4664 236761 108 2595 18787 [mtp-step 72] accept_rate=0.04
137944 108 236829 139 1018 203460 532 19839 [mtp-step 80] accept_rate=0.04
4499 53121 669 6082 12160 84022 2101 506 [mtp-step 88] accept_rate=0.03
16625 1534 33641 532 125860 236761 102301 605 [mtp-step 96] accept_rate=0.03
2481 81341 568 236780 6886 40707 605 236768 [mtp-step 104] accept_rate=0.03
563 496 24240 1933 31451 236764 840 914 [mtp-step 112] accept_rate=0.04
125688 573 506 3364 1331 532 914 45208 [mtp-step 120] accept_rate=0.03
531 623 1674 2737 236775 1091 2080 531 [mtp-step 128] accept_rate=0.03
914 124466 236761 4923 21077 3590 1515 496 [mtp-step 136] accept_rate=0.03
623 45513 236775 528 506 6114 529 914 [mtp-step 144] accept_rate=0.03
22816 532 496 179267 531 506 11838 236761 [mtp-step 152] accept_rate=0.03
107 236829 139 1018 818 6285 26633 529 [mtp-step 160] accept_rate=0.03
506 623 13666 4637 1083 1018 669 1816 [mtp-step 168] accept_rate=0.02
46235 506 214696 4135 529 506 3364 1331 [mtp-step 176] accept_rate=0.02
```

### T3_dflash
```
[cache] narrow asymmetric: forced Q8_0 on 2 captured full-attn layer(s) (remaining 8 full-attn keep TQ3)
[cache] kv types: SWA=tq3_0, full=tq3_0
```

## First 80 generated tokens (decoded)

### T1_none
raw extracted (first 80): [49904, 87859, 2, 568, 0, 100, 0, 3, 13, 134, 2, 895, 5, 308, 13, 206, 376, 45518, 100, 45518, 107, 101, 10354, 25252, 529, 137944, 532, 81179, 108, 818, 3847, 1816, 10594, 529, 3131, 89144, 18583, 568, 19609, 699, 496, 22907, 653, 12269, 3567, 529, 36951, 508, 236772, 3061, 10772, 236764, 2440, 4820, 529, 808, 236780, 6886, 40707, 605, 236829, 532, 808, 40421, 8488, 236829, 769, 669, 22323, 21132, 580, 506, 18074, 529, 7820, 27877, 236764, 5255, 32202, 236764]
decoded (first 80): 'sweולם<bos> (<pad><unused94><pad><unk><unused7>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<bos> Y[multimodal]F<unused7><code>�thought<unused94>thought\n<unused95>### Summary of Themes and Characters\n\nThe provided text consists of several fragmented scenes (likely from a composite or modified version of Shakespearean-style plays, including elements of *Coriolanus* and *Richard III*). The narrative focuses on the intersection of military glory, political instability,'

### T2_mtp
raw extracted (first 80): [49904, 87919, 8, 567, 6, 100, 100, 45518, 107, 101, 10354, 25252, 529, 137944, 532, 81179, 108, 818, 3847, 1816, 10594, 529, 3131, 89144, 18583, 568, 19609, 699, 496, 22907, 653, 12269, 3567, 529, 36951, 508, 236772, 3061, 10772, 236764, 2440, 4820, 529, 808, 236780, 6886, 40707, 605, 236829, 532, 808, 40421, 8488, 236829, 769, 669, 22323, 21132, 580, 506, 18074, 529, 7820, 27877, 236764, 5255, 32202, 236764, 532, 506, 43866, 529, 1237, 4664, 236761, 108, 2595, 18787, 137944, 108]
decoded (first 80): 'swe Tahun<unused2>ation<unused0><unused94><unused94>thought\n<unused95>### Summary of Themes and Characters\n\nThe provided text consists of several fragmented scenes (likely from a composite or modified version of Shakespearean-style plays, including elements of *Coriolanus* and *Richard III*). The narrative focuses on the intersection of military glory, political instability, and the volatility of public favor.\n\n#### Major Themes\n\n'

### T3_dflash: no [prefill] marker

DONE
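The T2_mtp cell above interleaves raw drafted-token dumps with `[mtp-step N] accept_rate=…` telemetry markers, which is exactly the pollution a reviewer flags in the P2 comment on token extraction. A minimal sketch of pulling only the accept-rate series out of such a log (format assumed from the excerpt above):

```python
# Extract only the accept_rate telemetry from an mtp log, ignoring the
# interleaved raw token-id dumps. Log format assumed from the T2_mtp
# excerpt above: "[mtp-step N] accept_rate=X".
import re

def accept_rates(log: str) -> list[float]:
    return [float(m)
            for m in re.findall(r"\[mtp-step \d+\] accept_rate=([0-9.]+)", log)]

sample = "[mtp-step 8] accept_rate=0.00\n532 81179 108\n[mtp-step 16] accept_rate=0.03"
print(accept_rates(sample))  # [0.0, 0.03]
```

A harness that separates telemetry from generated tokens this way would avoid the misleading "first 80 generated tokens" extraction the review comment describes.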
New file (+6 lines):

# Matrix v3 with SWA mask fix — 2026-05-09T22:17:43+02:00
=== N1_none_q8_tq3 (K=q8_0 V=tq3_0 draft=none) ===
N1_none_q8_tq3 rc=0
=== N2_none_q8_q8 (K=q8_0 V=q8_0 draft=none) ===
N2_none_q8_q8 rc=0
=== N3_mtp_q8_tq3 (K=q8_0 V=tq3_0 draft=mtp) ===

Review comment (P2): the third matrix entry is missing its rc/result line, leaving the summary incomplete and ambiguous.
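This matrix carries the SWA mask fix from commit `5b6ba1b`, whose message pinpoints a coordinate-frame bug: in chunked prefill, chunk c's queries start at absolute position c·chunk_size, so a mask built in chunk-local coordinates silently corrupts every chunk after the first. A toy illustration of the correct construction (function names hypothetical, not the PR's actual mask code):

```python
# Hypothetical illustration of the SWA-mask coordinate-frame pitfall:
# sliding-window visibility must be computed in ABSOLUTE positions,
# so each chunk's queries need the chunk_idx * chunk_size offset.
def swa_allowed(q_abs: int, k_abs: int, window: int) -> bool:
    # Causal sliding window: key visible iff not in the future and
    # within `window` positions of the query.
    return k_abs <= q_abs and q_abs - k_abs < window

def chunk_mask(chunk_idx: int, chunk_size: int, n_kv: int, window: int):
    rows = []
    for q_local in range(chunk_size):
        q_abs = chunk_idx * chunk_size + q_local  # the crucial offset
        rows.append([swa_allowed(q_abs, k, window) for k in range(n_kv)])
    return rows

# Tiny example: chunk 1 of size 4, 8 keys total, window 3.
# Its first query sits at absolute position 4 and sees keys 2..4 only;
# with q_abs mistakenly taken as 0 it would see only key 0.
m = chunk_mask(1, 4, 8, 3)
```

Dropping the offset reproduces the "chunks 2+ silently corrupted" symptom: the mask is well-formed, just aimed at the wrong window.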
New file (+6 lines):

```
tokenizer: google/gemma-3-27b-it
chat_template: yes (EvalPlus canonical instruction + opening code-fence)
source: HumanEval/2 (truncate_number)
token_count: 139
first_20: [105, 2364, 107, 9366, 2847, 496, 1265, 236772, 66436, 17856, 8948, 600, 64744, 506, 2269, 2608, 528, 496, 127532, 3393]
last_5: [236787, 107, 2717, 6719, 107]
```
New file (+1 line) — CSV token ids for the HumanEval/2 prompt:

```
105,2364,107,9366,2847,496,1265,236772,66436,17856,8948,600,64744,506,2269,2608,528,496,127532,3393,3355,236787,107,2717,109,2063,102267,236779,5640,236769,5640,236787,6803,236768,3921,6803,236787,107,140,12234,17770,496,4414,18224,1523,1548,236764,625,740,577,81153,1131,107,140,624,11995,912,568,65020,11995,7100,1082,2238,1548,236768,532,70208,107,140,236769,989,1749,912,2462,7100,1082,236743,236770,769,108,140,13293,506,20632,912,529,506,1548,236761,107,140,22539,102267,236779,5640,236769,236800,236761,236810,236768,107,140,236771,236761,236810,107,140,12234,108,2717,106,107,105,4368,107,43760,563,496,17856,8948,607,496,1265,236772,66436,1292,600,64744,506,2608,532,16349,7041,7713,236787,107,2717,6719,107
```
New file (+11 lines):

```
file: long_2k.txt
tool: HuggingFace transformers AutoTokenizer, model=google/gemma-3-27b-it (local cache)
tokenizer_vocab_size: 262144
gguf_vocab_size: 262144 (verified via gguf.GGUFReader)
chat_template_applied: yes
bos_prepended_in_csv: no (driver prepends BOS=2 automatically)
token_count: 2611
first_20_ids: [105, 2364, 107, 85305, 691, 6534, 531, 974, 1401, 20718, 529, 8116, 684, 1116, 12198, 580, 506, 4856, 236764, 532]
last_5_ids: [106, 107, 105, 4368, 107]
source_text:
  Alice in Wonderland Chapter I 'Down the Rabbit-Hole' in full. Source: Project Gutenberg https://www.gutenberg.org/cache/epub/11/pg11.txt (public domain). 2611 tokens, within [2048, 3072] target range.
```
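Per the metadata above, the token CSVs carry comma-separated ids with no BOS, and the driver prepends BOS=2 itself. A hedged sketch of how a consumer might load such a file (the function name is hypothetical; the PR's actual driver code is not shown here):

```python
# Load a one-line token-id CSV as emitted by tokenize_prompt.py's CSV
# mode, prepending BOS=2 the way the driver is documented to do.
import csv
import io

def load_tokens(csv_text: str, bos_id: int = 2) -> list[int]:
    row = next(csv.reader(io.StringIO(csv_text)))
    return [bos_id] + [int(tok) for tok in row]

print(load_tokens("105,2364,107"))  # [2, 105, 2364, 107]
```

Keeping BOS out of the CSV and adding it in the driver means the same file works whether or not a particular harness wants the BOS token counted.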
Review comment (P2): benchmark token extraction in the benchmark summary is polluted by telemetry/debug numbers, making the reported "first 80 generated tokens" unreliable and misleading.