Commits
31 commits
0213af0
docs: replace huggingface-cli with hf
easel Apr 24, 2026
96d71ec
build: add pyproject.toml, switch to uv for Python deps
easel Apr 24, 2026
549ee7d
build: use native CUDA architecture instead of hardcoded sm_86
easel Apr 24, 2026
dcc6c5c
results: add RTX 5090 Laptop benchmark numbers
easel Apr 24, 2026
50208c4
feat(dflash): integrate TQ3_0 KV cache type
mraxai Apr 24, 2026
79aecbf
chore(deps): bump llama.cpp submodule to luce-dflash merge tip
davide221 Apr 24, 2026
e7010e9
docs: promote TQ3_0 to default KV cache across scripts and READMEs
davide221 Apr 24, 2026
bff3375
docs(dflash): bump long-context example from 128K to 256K
davide221 Apr 24, 2026
19d9851
docs(dflash): drop stale "model reload per turn" limit
davide221 Apr 24, 2026
34916da
docs(dflash): drop Q5_K_M / Q6_K target roadmap bullet
davide221 Apr 24, 2026
eec9265
docs: advertise TurboQuant TQ3_0 KV cache and 256K ceiling in root RE…
davide221 Apr 24, 2026
3865829
docs: clarify Blackwell/GB10 support alongside RTX 3090 reference
Apr 24, 2026
bf2275a
Update README.md
davide221 Apr 24, 2026
81701a8
docs: add RTX 5090 / sm_120 to requirements and build comment
easel Apr 25, 2026
a33988f
results: add TQ3_0 short-context numbers for RTX 5090 Laptop
easel Apr 27, 2026
11163b4
fix(dflash): auto-detect GPU arch to avoid sm_120a on consumer Blackwell
easel Apr 27, 2026
c725758
docs(dflash): RTX 5090 Laptop benchmark results and long-context swee…
easel Apr 27, 2026
e6dc0cd
docs(dflash): revert hf/uv-run command changes to huggingface-cli/pyt…
easel Apr 27, 2026
1a86289
refactor(server): proper Python package with tool-calling, tests, uv …
easel Apr 27, 2026
b96ea60
fix(server): honor tool_choice in chat completions
easel Apr 28, 2026
a066f68
test(server): add streaming coverage for tool_choice forwarding
easel Apr 28, 2026
98649e4
bench(server): code-agent throughput benchmark vs OpenAI-compat endpo…
easel Apr 28, 2026
25cb6f7
feat(server): auto-detect target GGUF in models/
easel Apr 28, 2026
09d1179
chore: refresh uv.lock dependency markers
easel Apr 28, 2026
8b7ad4e
bench(server): add overall_tok_s, fix TTFT for thinking models
easel Apr 29, 2026
24423ab
bench(server): add --replay mode for representative agentic prompts
easel Apr 29, 2026
8cb6434
fix(bench): include overall_tok_s in error-path ProbResult, catch HTT…
easel Apr 29, 2026
f2abf1e
fix(server): restore 16384 default max-ctx (regressed by package refa…
easel Apr 29, 2026
3cb6ec8
bench(server): replace synthetic agent loop with Claude Code transcri…
easel Apr 29, 2026
3c8169f
Add safe coding-agent server profile
easel Apr 29, 2026
a4f416d
Merge remote-tracking branch 'origin/main' into feat/setup-results-uv
easel Apr 29, 2026
35 changes: 35 additions & 0 deletions dflash/CMakeLists.txt
@@ -42,7 +42,42 @@ if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/CMakeLists.txt")
"deps/llama.cpp submodule missing. Run: "
"git submodule update --init --recursive")
endif()

# ─── CUDA architecture auto-detection ──────────────────────────────
# CUDA 12.8+ resolves "native" on Blackwell to sm_120a (DGX Spark variant)
# which is forward-incompatible with consumer Blackwell (RTX 5090, SM 12.0).
# Query nvidia-smi at configure time to get the exact SM so ggml-cuda
# compiles for the physical GPU rather than an incompatible variant.
if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
find_program(_DFLASH_NVIDIA_SMI nvidia-smi)
if(_DFLASH_NVIDIA_SMI)
execute_process(
COMMAND "${_DFLASH_NVIDIA_SMI}"
--query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE _dflash_gpu_caps
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _dflash_smi_rc
)
if(_dflash_smi_rc EQUAL 0 AND _dflash_gpu_caps)
string(REGEX MATCH "^[0-9]+\\.[0-9]+" _dflash_cap "${_dflash_gpu_caps}")
string(REPLACE "." "" _dflash_arch "${_dflash_cap}")
if(_dflash_arch)
set(CMAKE_CUDA_ARCHITECTURES "${_dflash_arch}" CACHE STRING
"CUDA architectures (auto-detected from nvidia-smi: ${_dflash_cap})" FORCE)
message(STATUS "dflash27b: GPU compute_cap ${_dflash_cap} → CUDA_ARCHITECTURES=${_dflash_arch}")
# Consumer Blackwell (SM 12.x, no FP4 tensor cores) must skip
# ggml's sm_120→sm_120a replacement or kernels will fault.
if(_dflash_arch MATCHES "^12[0-9]$")
set(GGML_CUDA_BLACKWELL_CONSUMER ON CACHE BOOL
"Skip sm_12X→sm_12Xa for consumer Blackwell (no FP4)" FORCE)
endif()
endif()
endif()
endif()
endif()
set(GGML_CUDA_FA_ALL_QUANTS ON CACHE BOOL "Compile fattn kernels for all quant pairs (needed for asymmetric KV-quant)" FORCE)

add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)

# ─── dflash27b static library ──────────────────────────────────────
14 changes: 11 additions & 3 deletions dflash/README.md
@@ -166,8 +166,16 @@ python3 examples/chat.py

# OpenAI-compatible HTTP server (drop-in for Open WebUI / LM Studio / Cline)
python3 -m venv .venv
.venv/bin/pip install fastapi uvicorn transformers jinja2
.venv/bin/python scripts/server.py --port 8000 --daemon
.venv/bin/pip install -e .
.venv/bin/dflash-server --profile agent-code-text --port 8000

# Safe text-only coding-agent profile (default)
# - max_ctx=48000, max_prompt_tokens=42000
# - DFLASH27B_PREFILL_UBATCH=384, DFLASH27B_LAYER_PREFILL=0
# - optional KV cache passthrough: --cache-type-k q4_0 --cache-type-v q8_0
# - vision/multimodal payloads rejected
# - prefix cache intentionally disabled; see docs/prefix-cache-design.md
.venv/bin/dflash-server --profile agent-code-text --prefill-ubatch 256 --cache-type-k q4_0 --cache-type-v q8_0

# Reproduce paper numbers
python3 scripts/bench_llm.py # HE + GSM8K + Math500
@@ -182,7 +190,7 @@ DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \
--fast-rollback --ddtree --ddtree-budget=16 --max-ctx=4096 # align_up(prompt + n_gen + 64, 256); raise up to 262144 for long prompts
```
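The `--max-ctx` comment above encodes a simple sizing rule. A minimal sketch of that arithmetic (the 3,500-token prompt is illustrative, not from the benchmark):

```python
# Context sizing rule from the comment above: the context must cover
# align_up(prompt_tokens + n_gen + 64, 256).
def align_up(x: int, multiple: int = 256) -> int:
    return ((x + multiple - 1) // multiple) * multiple

# e.g. a 3,500-token prompt with n_gen=256 needs align_up(3500 + 256 + 64, 256) = 3840,
# so the default --max-ctx=4096 is enough; longer prompts need a larger --max-ctx.
print(align_up(3500 + 256 + 64))  # 3840
```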

**Requirements:** NVIDIA sm_86+ GPU (3090, A10, A40, 4090) or Jetson AGX Thor sm_110, CUDA 12+ (CUDA 13+ required for Thor), 24 GB VRAM, ~80 GB disk.
**Requirements:** NVIDIA sm_86+ GPU (3090, A10, A40, 4090), sm_120 (RTX 5090), or Jetson AGX Thor sm_110, CUDA 12+ (CUDA 13+ required for sm_120/Thor), 24 GB VRAM, ~80 GB disk.

## How it works

99 changes: 97 additions & 2 deletions dflash/RESULTS.md
@@ -1,19 +1,60 @@
# Luce DFlash benchmark results

Single RTX 3090 24 GB, CUDA 12, driver 535.
Target: `unsloth/Qwen3.5-27B-GGUF` (Q4_K_M, ~16 GB).
Draft: `z-lab/Qwen3.5-27B-DFlash` (BF16, 3.46 GB).
Concurrency = 1, greedy decoding, `n_gen=256`.
Reproduce with `python3 scripts/bench_llm.py` (samples 10 prompts/dataset, seed=42).

## Headline — AR vs Luce DFlash at concurrency 1

### RTX 3090 24 GB desktop (sm_86) — CUDA 12, driver 535

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 37.78 | **129.52** | 8.31 | **3.43×** |
| Math500 | 37.71 | **110.51** | 7.04 | **2.93×** |
| GSM8K | 37.65 | **96.15** | 6.14 | **2.55×** |

### RTX 5090 Laptop 24 GB (sm_120) — CUDA 13.2, driver 581.80

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 23.96 | **87.30** | 8.49 | **3.64×** |
| GSM8K | 23.77 | **70.92** | 6.92 | **2.98×** |
| Math500 | 23.77 | **72.97** | 7.15 | **3.07×** |

AR throughput is lower than on the 3090 (~24 vs ~38 tok/s) due to laptop power limits and lower memory bandwidth. The DFlash speedup ratio holds; HumanEval actually improves to 3.64× at AL 8.49, consistent with the draft having been distilled on Qwen3.5 hidden states, which transfer across quantisation targets.

### RTX 5090 Laptop — TQ3_0 KV cache (`DFLASH27B_KV_TQ3=1`)

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 24.09 | 78.91 | 7.76 | 3.28× |
| GSM8K | 23.95 | 64.75 | 6.40 | 2.70× |
| Math500 | 24.01 | 67.85 | 6.57 | 2.83× |

TQ3_0 (3.5 bpv) costs ~0.7 AL and ~10 tok/s vs the default KV format at short contexts. The memory saving (9.7× vs F16, vs 8× for Q4_0) is the point — TQ3 enables longer contexts on the same VRAM budget, not higher short-context throughput.

### RTX 5090 Laptop — Long-context sweep: Q4_0 vs TQ3_0

`ddtree_budget=16`, `n_gen=128`, layer-segmented prefill for prompts > 8 K (`DFLASH27B_LAYER_PREFILL=1`).
KV sizes are actual quantized sizes (not F16 equivalent).

| Ctx | KV | Prefill | Decode tok/s | AL | KV size |
|:-----:|:-----:|:--------:|:------------:|:-----:|:-------:|
| 32K | Q4_0 | 54.3 s | 64.8 | 11.64 | 0.61 GB |
| 32K | TQ3_0 | 59.8 s | 60.1 | 10.67 | 0.47 GB |
| 64K | Q4_0 | 142.0 s | 48.6 | 11.64 | 1.21 GB |
| 64K | TQ3_0 | 151.5 s | 46.5 | 10.67 | 0.94 GB |
| 128K | Q4_0 | OOM | — | — | 2.42 GB |
| 128K | TQ3_0 | 436.1 s | 24.6 | 10.67 | 1.88 GB |

At 32K–64K: TQ3_0 costs ~5–7% decode throughput and ~8% AL vs Q4_0 while saving ~22% KV memory.
At 128K: Q4_0 exhausts available VRAM (model ~17 GB + draft ~3.5 GB + SSM compute buffer ~1.2 GB
leaves ~2.3 GB free; Q4_0 KV needs 2.42 GB → OOM). TQ3_0 (1.88 GB KV) fits and decodes at 24.6 tok/s.
At 256K: TQ3_0 KV grows to 3.76 GB, also exceeding the VRAM budget — timed out after 60 min.
TQ3_0 is the enabling KV format for 128K on this hardware; 256K is not reachable on 24 GB.

AR = autoregressive target-only decode via `test_generate`.
DFlash = block-diffusion draft + DDTree budget 22 verify + fast rollback.
AL = mean committed tokens per draft/verify step (acceptance length).
@@ -25,7 +66,61 @@ Datasets pulled live via HuggingFace `datasets`:

## Per-prompt numbers (seed 42)

### HumanEval (10 samples)
### RTX 5090 Laptop

#### HumanEval (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:-----:|
| 01| 84 | 23.99 | 91.55 | 8.83 |
| 02| 138 | 24.12 | 87.75 | 8.53 |
| 03| 134 | 23.90 | 95.30 | 9.14 |
| 04| 120 | 23.97 | 96.23 | 9.14 |
| 05| 172 | 24.00 | 87.56 | 8.53 |
| 06| 118 | 23.96 | 66.36 | 6.40 |
| 07| 51 | 23.96 | 85.35 | 8.26 |
| 08| 141 | 23.94 | **100.43** | **9.85** |
| 09| 125 | 23.94 | **103.38** | **10.67** |
| 10| 95 | 23.78 | 59.04 | 5.57 |
| **mean** | | **23.96** | **87.30** | **8.49** |

#### GSM8K (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:----:|
| 01| 45 | 23.75 | 72.46 | 6.92 |
| 02| 111 | 23.89 | 60.99 | 5.95 |
| 03| 49 | 23.87 | 88.37 | 8.53 |
| 04| 70 | 23.68 | 57.84 | 5.45 |
| 05| 102 | 23.94 | 80.51 | 7.76 |
| 06| 118 | 23.86 | 66.60 | 6.40 |
| 07| 113 | 23.93 | 79.68 | 8.12 |
| 08| 50 | 23.16 | 66.76 | 6.74 |
| 09| 43 | 23.81 | 72.02 | 7.11 |
| 10| 96 | 23.85 | 63.93 | 6.24 |
| **mean** | | **23.77** | **70.92** | **6.92** |

#### Math500 (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:----:|
| 01| 257 | 23.94 | 72.87 | 7.11 |
| 02| 53 | 24.03 | 74.69 | 7.31 |
| 03| 40 | 23.34 | 81.72 | 8.00 |
| 04| 50 | 23.77 | 88.77 | 8.83 |
| 05| 117 | 23.49 | 63.59 | 6.40 |
| 06| 76 | 23.89 | 64.93 | 6.40 |
| 07| 43 | 23.59 | 68.49 | 6.74 |
| 08| 79 | 23.81 | 63.08 | 6.10 |
| 09| 52 | 23.92 | 60.94 | 5.82 |
| 10| 57 | 23.93 | 90.61 | 8.83 |
| **mean** | | **23.77** | **72.97** | **7.15** |

---

### RTX 3090 Desktop

#### HumanEval (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:-----:|
2 changes: 1 addition & 1 deletion dflash/deps/llama.cpp
166 changes: 166 additions & 0 deletions dflash/docs/prefix-cache-design.md
@@ -0,0 +1,166 @@
# DFlash Prefix Cache Intended Design

Status: design note, not implemented.

## Goal

Improve repeated coding-agent turns by avoiding full prefill when a later request can safely reuse token state from an earlier request.

The immediate serving default remains the safe text-only `agent-code-text` profile with prefix cache disabled. Prefix caching must be proven through token-level and chat-template equivalence tests before it becomes a default server behavior.

## Non-goals for v1

- No multi-slot trie cache.
- No cross-process persistence.
- No concurrent generations on one GPU worker.
- No arbitrary rollback to a shorter longest common prefix.
- No support for layer-segmented prefill reuse.
- No default-on OpenAI/Anthropic chat integration until real chat-template round trips are proven.

## Terminology

- `RUN`: current behavior. Reset daemon KV/SSM/conv state and prefill the full prompt.
- `RUN_PREFIX`: experimental behavior. Reuse daemon state only if the new prompt exactly extends the resident token sequence.
- `RESET`: destroy/recreate request state and invalidate resident metadata.
- `resident state`: daemon-side model state plus metadata describing the exact token sequence represented by that state.

## Required daemon protocol

The daemon should accept both the legacy protocol and an explicit protocol:

```text
/tmp/prompt.bin 512 # legacy, maps to RUN
RUN /tmp/prompt.bin 512 # reset + full prefill
RUN_PREFIX /tmp/prompt.bin 512 # exact-extension reuse attempt
RESET # clear resident state
```

Old clients must continue to work unchanged.
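A minimal sketch of how these command lines could be classified, e.g. in a protocol test. Only the line formats come from the block above; the parser itself is an assumption:

```python
# Hypothetical classifier for daemon command lines; the explicit forms and
# the legacy form are exactly the ones listed in the protocol above.
def parse_command(line: str) -> tuple[str, str | None, int]:
    parts = line.split()
    if parts == ["RESET"]:
        return ("RESET", None, 0)
    if len(parts) == 3 and parts[0] in ("RUN", "RUN_PREFIX"):
        return (parts[0], parts[1], int(parts[2]))
    if len(parts) == 2:  # legacy "<prompt.bin> <n_gen>" maps to RUN
        return ("RUN", parts[0], int(parts[1]))
    raise ValueError(f"unrecognized daemon command: {line!r}")

assert parse_command("/tmp/prompt.bin 512") == ("RUN", "/tmp/prompt.bin", 512)
assert parse_command("RUN_PREFIX /tmp/prompt.bin 512") == ("RUN_PREFIX", "/tmp/prompt.bin", 512)
assert parse_command("RESET") == ("RESET", None, 0)
```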

## Required daemon state model

Prefix reuse needs more than a vector of resident tokens. The daemon should track a state object similar to:

```cpp
enum class DaemonCommandKind {
Run,
RunPrefix,
Reset,
};

struct DaemonCommand {
DaemonCommandKind kind;
std::string prompt_path;
int n_gen = 0;
};

struct DaemonResidentState {
bool valid = false;
std::vector<int32_t> tokens;
int committed = 0;
int32_t last_tok = -1;
bool cache_decode_ready = false;
};
```

The state must make cache lifecycle explicit:

- whether cache is prefill-only or decode-ready;
- whether `migrate_prefill_cache()` has already happened;
- what token position `committed` represents;
- what token `last_tok` should seed decode with;
- whether any error invalidated reuse.

## v1 reuse rule

`RUN_PREFIX` v1 may reuse only exact extensions:

```cpp
reuse_len = lcp_tokens(resident.tokens, prompt);
can_reuse = resident.valid
&& reuse_len == resident.tokens.size()
&& reuse_len <= prompt.size();
```

If `can_reuse` is false, the daemon must fall back to `RUN` semantics and log a miss.

This is intentionally not a general longest-prefix cache. It avoids rollback in v1.
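The same rule with concrete hit/miss cases, as a Python sketch mirroring the C++ above (token values are illustrative; the non-empty check stands in for `resident.valid`):

```python
# v1 exact-extension rule: reuse only when the resident tokens are a
# prefix of the new prompt, i.e. the prompt extends them without edits.
def can_reuse(resident: list[int], prompt: list[int]) -> bool:
    lcp = 0
    while lcp < min(len(resident), len(prompt)) and resident[lcp] == prompt[lcp]:
        lcp += 1
    return len(resident) > 0 and lcp == len(resident)

assert can_reuse([1, 2, 3], [1, 2, 3, 4, 5])   # exact extension -> reuse
assert not can_reuse([1, 2, 3], [1, 2, 9, 4])  # diverges at token 3 -> miss, fall back to RUN
assert not can_reuse([1, 2, 3, 4], [1, 2])     # prompt shorter than resident -> miss
```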

## Generated-token caveat

Do not assume `resident.tokens = prompt + generated_tokens` is reusable for OpenAI/Anthropic chat traffic.

The Python server may:

- stop streaming when a stop token is observed;
- decode with `skip_special_tokens=True`;
- parse/normalize tool-call markup;
- omit hidden raw tokens from the visible assistant message.

Therefore the client’s next templated prompt may not exactly extend the daemon’s raw generated token sequence.

For chat-serving integration, prove token round-trip first:

1. Tokenize request A.
2. Run generation.
3. Build request B exactly as a real client would from visible assistant/tool output.
4. Tokenize request B.
5. Check whether request B starts with the daemon’s intended resident tokens.

If that assertion fails, Python must not use `RUN_PREFIX` for chat traffic.
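A sketch of that round-trip check with the HF tokenizer. The repo id is the draft model from RESULTS.md and `run_daemon` is a hypothetical hook returning the raw ids the daemon committed; the final prefix assertion is the point:

```python
from transformers import AutoTokenizer

# Tokenizer repo is illustrative; the server should use the tokenizer it already loads.
tok = AutoTokenizer.from_pretrained("z-lab/Qwen3.5-27B-DFlash")

# 1-2. Tokenize request A and run generation; raw ids come back from the daemon.
messages_a = [{"role": "user", "content": "Write a function that reverses a list."}]
tokens_a = tok.apply_chat_template(messages_a, tokenize=True, add_generation_prompt=True)
generated_ids = run_daemon(tokens_a)   # hypothetical hook: raw ids as the daemon saw them
resident = tokens_a + generated_ids    # what RUN_PREFIX would try to extend

# 3-4. Build request B exactly as a real client would, from the visible text only.
visible = tok.decode(generated_ids, skip_special_tokens=True)
messages_b = messages_a + [
    {"role": "assistant", "content": visible},
    {"role": "user", "content": "Now add type hints."},
]
tokens_b = tok.apply_chat_template(messages_b, tokenize=True, add_generation_prompt=True)

# 5. RUN_PREFIX is only safe for chat traffic if this holds.
assert tokens_b[: len(resident)] == resident, "chat template does not round-trip; use RUN"
```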

## Prefill scope

Prefix reuse v1 should support only token-segmented prefill.

If `DFLASH27B_LAYER_PREFILL=1` and `RUN_PREFIX` is requested, the daemon should either:

- fall back to `RUN` and log `reason=layer-prefill`; or
- return a clear unsupported error during development.

Layer-segmented prefill allocates full-prompt activation buffers and should not be part of the first reuse implementation.

## Observability

Every daemon request should log one stable parseable prefix-cache line:

```text
[prefix-cache] mode=disabled reason=run prompt_tokens=12000
[prefix-cache] mode=miss reason=not-extension reuse_tokens=0 suffix_tokens=13023 prompt_tokens=13023
[prefix-cache] mode=reuse reuse_tokens=12345 suffix_tokens=678 prompt_tokens=13023
```

Failures in daemon mode must emit `-1` to the stream fd and invalidate resident state.
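A small sketch of consuming these lines for monitoring. The field names come from the formats above; everything else (the regex, the reuse-fraction metric) is an assumption:

```python
import re

# Extract the key=value fields from one "[prefix-cache] ..." log line.
PREFIX_CACHE_LINE = re.compile(r"\[prefix-cache\]\s+(.*)")

def parse_prefix_cache_line(line: str) -> dict | None:
    m = PREFIX_CACHE_LINE.search(line)
    if not m:
        return None
    return dict(kv.split("=", 1) for kv in m.group(1).split())

sample = "[prefix-cache] mode=reuse reuse_tokens=12345 suffix_tokens=678 prompt_tokens=13023"
info = parse_prefix_cache_line(sample)
assert info["mode"] == "reuse"
# Fraction of this prompt served from resident state.
print(int(info["reuse_tokens"]) / int(info["prompt_tokens"]))  # ~0.948
```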

## Validation milestones

1. Protocol compatibility:
- legacy `<prompt.bin> <n_gen>` still works;
- `RUN` matches legacy behavior;
- `RESET` clears state.

2. Token-file equivalence:
- let B = A + suffix;
- compare `RUN B` against `RESET; RUN_PREFIX A; RUN_PREFIX B` under deterministic decoding;
   - outputs must match exactly (see the sketch after this list).

3. Chat-template equivalence:
- perform the same comparison through `/v1/chat/completions` and `/v1/messages`;
- include stop-token behavior, tool-call parsing, and `skip_special_tokens` behavior;
- only then consider Python-side `--prefix-cache` wiring.

4. Default-on decision:
- prefix cache may become default only after real coding-agent traces show reliable reuse and exact output equivalence.
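A sketch of the milestone-2 comparison. The `daemon` callable that sends a command plus token file and returns generated ids is hypothetical; only the equality requirement comes from the milestone above:

```python
# Milestone 2: "RUN B" must equal "RESET; RUN_PREFIX A; RUN_PREFIX B"
# under deterministic decoding, where B = A + suffix.
def check_token_file_equivalence(daemon, tokens_a, suffix, n_gen=128):
    tokens_b = tokens_a + suffix                     # exact extension of A

    daemon("RESET")
    baseline = daemon("RUN", tokens_b, n_gen)        # full prefill, no reuse

    daemon("RESET")
    daemon("RUN_PREFIX", tokens_a, n_gen)            # warm resident state with A
    reused = daemon("RUN_PREFIX", tokens_b, n_gen)   # should extend A's resident state

    assert reused == baseline, "prefix reuse changed the output; reuse is not safe"
```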

## Relationship to `agent-code-text`

The safe `agent-code-text` profile does not depend on prefix caching.

Default profile behavior should remain:

- text-only;
- 48K max context;
- 42K prompt admission limit;
- one serialized request per daemon;
- conservative prefill ubatch;
- prefix cache disabled.
32 changes: 32 additions & 0 deletions dflash/pyproject.toml
@@ -0,0 +1,32 @@
[project]
name = "dflash"
version = "0.1.0"
description = "Python scripts for dflash inference, benchmarking, and serving"
requires-python = ">=3.10"
dependencies = [
"transformers",
"numpy",
"gguf",
"fastapi",
"uvicorn[standard]",
"jinja2",
"pytest",
"httpx",
"datasets",
]

[project.optional-dependencies]
oracle = ["torch"]

[project.scripts]
dflash-server = "dflash.server:main"

[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[tool.uv]
package = true

[tool.setuptools.packages.find]
where = ["src"]