Commits
31 commits
0213af0
docs: replace huggingface-cli with hf
easel Apr 24, 2026
96d71ec
build: add pyproject.toml, switch to uv for Python deps
easel Apr 24, 2026
549ee7d
build: use native CUDA architecture instead of hardcoded sm_86
easel Apr 24, 2026
dcc6c5c
results: add RTX 5090 Laptop benchmark numbers
easel Apr 24, 2026
50208c4
feat(dflash): integrate TQ3_0 KV cache type
mraxai Apr 24, 2026
79aecbf
chore(deps): bump llama.cpp submodule to luce-dflash merge tip
davide221 Apr 24, 2026
e7010e9
docs: promote TQ3_0 to default KV cache across scripts and READMEs
davide221 Apr 24, 2026
bff3375
docs(dflash): bump long-context example from 128K to 256K
davide221 Apr 24, 2026
19d9851
docs(dflash): drop stale "model reload per turn" limit
davide221 Apr 24, 2026
34916da
docs(dflash): drop Q5_K_M / Q6_K target roadmap bullet
davide221 Apr 24, 2026
eec9265
docs: advertise TurboQuant TQ3_0 KV cache and 256K ceiling in root RE…
davide221 Apr 24, 2026
3865829
docs: clarify Blackwell/GB10 support alongside RTX 3090 reference
Apr 24, 2026
bf2275a
Update README.md
davide221 Apr 24, 2026
81701a8
docs: add RTX 5090 / sm_120 to requirements and build comment
easel Apr 25, 2026
a33988f
results: add TQ3_0 short-context numbers for RTX 5090 Laptop
easel Apr 27, 2026
11163b4
fix(dflash): auto-detect GPU arch to avoid sm_120a on consumer Blackwell
easel Apr 27, 2026
c725758
docs(dflash): RTX 5090 Laptop benchmark results and long-context swee…
easel Apr 27, 2026
e6dc0cd
docs(dflash): revert hf/uv-run command changes to huggingface-cli/pyt…
easel Apr 27, 2026
1a86289
refactor(server): proper Python package with tool-calling, tests, uv …
easel Apr 27, 2026
b96ea60
fix(server): honor tool_choice in chat completions
easel Apr 28, 2026
a066f68
test(server): add streaming coverage for tool_choice forwarding
easel Apr 28, 2026
98649e4
bench(server): code-agent throughput benchmark vs OpenAI-compat endpo…
easel Apr 28, 2026
25cb6f7
feat(server): auto-detect target GGUF in models/
easel Apr 28, 2026
09d1179
chore: refresh uv.lock dependency markers
easel Apr 28, 2026
8b7ad4e
bench(server): add overall_tok_s, fix TTFT for thinking models
easel Apr 29, 2026
24423ab
bench(server): add --replay mode for representative agentic prompts
easel Apr 29, 2026
8cb6434
fix(bench): include overall_tok_s in error-path ProbResult, catch HTT…
easel Apr 29, 2026
f2abf1e
fix(server): restore 16384 default max-ctx (regressed by package refa…
easel Apr 29, 2026
3cb6ec8
bench(server): replace synthetic agent loop with Claude Code transcri…
easel Apr 29, 2026
3c8169f
Add safe coding-agent server profile
easel Apr 29, 2026
a4f416d
Merge remote-tracking branch 'origin/main' into feat/setup-results-uv
easel Apr 29, 2026
35 changes: 35 additions & 0 deletions dflash/CMakeLists.txt
@@ -42,7 +42,42 @@ if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/CMakeLists.txt")
"deps/llama.cpp submodule missing. Run: "
"git submodule update --init --recursive")
endif()

# ─── CUDA architecture auto-detection ──────────────────────────────
# CUDA 12.8+ resolves "native" on Blackwell to sm_120a (DGX Spark variant)
# which is forward-incompatible with consumer Blackwell (RTX 5090, SM 12.0).
# Query nvidia-smi at configure time to get the exact SM so ggml-cuda
# compiles for the physical GPU rather than an incompatible variant.
if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
find_program(_DFLASH_NVIDIA_SMI nvidia-smi)
if(_DFLASH_NVIDIA_SMI)
execute_process(
COMMAND "${_DFLASH_NVIDIA_SMI}"
--query-gpu=compute_cap --format=csv,noheader
OUTPUT_VARIABLE _dflash_gpu_caps
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _dflash_smi_rc
)
if(_dflash_smi_rc EQUAL 0 AND _dflash_gpu_caps)
string(REGEX MATCH "^[0-9]+\\.[0-9]+" _dflash_cap "${_dflash_gpu_caps}")
string(REPLACE "." "" _dflash_arch "${_dflash_cap}")
if(_dflash_arch)
set(CMAKE_CUDA_ARCHITECTURES "${_dflash_arch}" CACHE STRING
"CUDA architectures (auto-detected from nvidia-smi: ${_dflash_cap})" FORCE)
message(STATUS "dflash27b: GPU compute_cap ${_dflash_cap} → CUDA_ARCHITECTURES=${_dflash_arch}")
# Consumer Blackwell (SM 12.x, no FP4 tensor cores) must skip
# ggml's sm_120→sm_120a replacement or kernels will fault.
if(_dflash_arch MATCHES "^12[0-9]$")
set(GGML_CUDA_BLACKWELL_CONSUMER ON CACHE BOOL
"Skip sm_12X→sm_12Xa for consumer Blackwell (no FP4)" FORCE)
endif()
endif()
endif()
endif()
endif()
set(GGML_CUDA_FA_ALL_QUANTS ON CACHE BOOL "Compile fattn kernels for all quant pairs (needed for asymmetric KV-quant)" FORCE)

add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)

# ─── dflash27b static library ──────────────────────────────────────
14 changes: 11 additions & 3 deletions dflash/README.md
@@ -166,8 +166,16 @@ python3 examples/chat.py

# OpenAI-compatible HTTP server (drop-in for Open WebUI / LM Studio / Cline)
python3 -m venv .venv
.venv/bin/pip install fastapi uvicorn transformers jinja2
.venv/bin/python scripts/server.py --port 8000 --daemon
.venv/bin/pip install -e .
.venv/bin/dflash-server --profile agent-code-text --port 8000

# Safe text-only coding-agent profile (default)
# - max_ctx=48000, max_prompt_tokens=42000
# - DFLASH27B_PREFILL_UBATCH=384, DFLASH27B_LAYER_PREFILL=0
# - optional KV cache passthrough: --cache-type-k q4_0 --cache-type-v q8_0
# - vision/multimodal payloads rejected
# - prefix cache intentionally disabled; see docs/prefix-cache-design.md
.venv/bin/dflash-server --profile agent-code-text --prefill-ubatch 256 --cache-type-k q4_0 --cache-type-v q8_0

# Reproduce paper numbers
python3 scripts/bench_llm.py # HE + GSM8K + Math500
@@ -182,7 +190,7 @@ DFLASH27B_KV_TQ3=1 DFLASH27B_PREFILL_UBATCH=16 \
--fast-rollback --ddtree --ddtree-budget=16 --max-ctx=4096 # align_up(prompt + n_gen + 64, 256); raise up to 262144 for long prompts
```
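The `--max-ctx` comment above encodes a simple sizing rule. A minimal sketch of that arithmetic (the 3,500-token prompt is illustrative, not from the benchmark):

```python
# Context sizing rule from the comment above: the context must cover
# align_up(prompt_tokens + n_gen + 64, 256).
def align_up(x: int, multiple: int = 256) -> int:
    return ((x + multiple - 1) // multiple) * multiple

# e.g. a 3,500-token prompt with n_gen=256 needs align_up(3500 + 256 + 64, 256) = 3840,
# so the default --max-ctx=4096 is enough; longer prompts need a larger --max-ctx.
print(align_up(3500 + 256 + 64))  # 3840
```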

**Requirements:** NVIDIA sm_86+ GPU (3090, A10, A40, 4090) or Jetson AGX Thor sm_110, CUDA 12+ (CUDA 13+ required for Thor), 24 GB VRAM, ~80 GB disk.
**Requirements:** NVIDIA sm_86+ GPU (3090, A10, A40, 4090), sm_120 (RTX 5090), or Jetson AGX Thor sm_110, CUDA 12+ (CUDA 13+ required for sm_120/Thor), 24 GB VRAM, ~80 GB disk.

## How it works

99 changes: 97 additions & 2 deletions dflash/RESULTS.md
@@ -1,19 +1,60 @@
# Luce DFlash benchmark results

Single RTX 3090 24 GB, CUDA 12, driver 535.
Target: `unsloth/Qwen3.5-27B-GGUF` (Q4_K_M, ~16 GB).
Draft: `z-lab/Qwen3.5-27B-DFlash` (BF16, 3.46 GB).
Concurrency = 1, greedy decoding, `n_gen=256`.
Reproduce with `python3 scripts/bench_llm.py` (samples 10 prompts/dataset, seed=42).

## Headline — AR vs Luce DFlash at concurrency 1

### RTX 3090 24 GB desktop (sm_86) — CUDA 12, driver 535

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 37.78 | **129.52** | 8.31 | **3.43×** |
| Math500 | 37.71 | **110.51** | 7.04 | **2.93×** |
| GSM8K | 37.65 | **96.15** | 6.14 | **2.55×** |

### RTX 5090 Laptop 24 GB (sm_120) — CUDA 13.2, driver 581.80

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 23.96 | **87.30** | 8.49 | **3.64×** |
| GSM8K | 23.77 | **70.92** | 6.92 | **2.98×** |
| Math500 | 23.77 | **72.97** | 7.15 | **3.07×** |

AR throughput is lower than on the 3090 (~24 vs ~38 tok/s) due to laptop power limits and lower memory bandwidth. The DFlash speedup ratio holds; HumanEval actually improves to 3.64× at AL 8.49, consistent with the draft having been distilled on Qwen3.5 hidden states, which transfer across quantisation targets.

### RTX 5090 Laptop — TQ3_0 KV cache (`DFLASH27B_KV_TQ3=1`)

| Task | AR tok/s | DFlash tok/s | AL | Speedup |
|-----------|:--------:|:------------:|:----:|:-------:|
| HumanEval | 24.09 | 78.91 | 7.76 | 3.28× |
| GSM8K | 23.95 | 64.75 | 6.40 | 2.70× |
| Math500 | 24.01 | 67.85 | 6.57 | 2.83× |

TQ3_0 (3.5 bpv) costs ~0.7 AL and ~10 tok/s vs the default KV format at short contexts. The memory saving (9.7× vs F16, vs 8× for Q4_0) is the point — TQ3 enables longer contexts on the same VRAM budget, not higher short-context throughput.

### RTX 5090 Laptop — Long-context sweep: Q4_0 vs TQ3_0

`ddtree_budget=16`, `n_gen=128`, layer-segmented prefill for prompts > 8 K (`DFLASH27B_LAYER_PREFILL=1`).
KV sizes are actual quantized sizes (not F16 equivalent).

| Ctx | KV | Prefill | Decode tok/s | AL | KV size |
|:-----:|:-----:|:--------:|:------------:|:-----:|:-------:|
| 32K | Q4_0 | 54.3 s | 64.8 | 11.64 | 0.61 GB |
| 32K | TQ3_0 | 59.8 s | 60.1 | 10.67 | 0.47 GB |
| 64K | Q4_0 | 142.0 s | 48.6 | 11.64 | 1.21 GB |
| 64K | TQ3_0 | 151.5 s | 46.5 | 10.67 | 0.94 GB |
| 128K | Q4_0 | OOM | — | — | 2.42 GB |
| 128K | TQ3_0 | 436.1 s | 24.6 | 10.67 | 1.88 GB |

At 32K–64K: TQ3_0 costs ~5–7% decode throughput and ~8% AL vs Q4_0 while saving ~22% KV memory.
At 128K: Q4_0 exhausts available VRAM (model ~17 GB + draft ~3.5 GB + SSM compute buffer ~1.2 GB
leaves ~2.3 GB free; Q4_0 KV needs 2.42 GB → OOM). TQ3_0 (1.88 GB KV) fits and decodes at 24.6 tok/s.
At 256K: TQ3_0 KV grows to 3.76 GB, also exceeding the VRAM budget — timed out after 60 min.
TQ3_0 is the enabling KV format for 128K on this hardware; 256K is not reachable on 24 GB.

AR = autoregressive target-only decode via `test_generate`.
DFlash = block-diffusion draft + DDTree budget 22 verify + fast rollback.
AL = mean committed tokens per draft/verify step (acceptance length).
@@ -25,7 +66,61 @@ Datasets pulled live via HuggingFace `datasets`:

## Per-prompt numbers (seed 42)

### HumanEval (10 samples)
### RTX 5090 Laptop

#### HumanEval (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:-----:|
| 01| 84 | 23.99 | 91.55 | 8.83 |
| 02| 138 | 24.12 | 87.75 | 8.53 |
| 03| 134 | 23.90 | 95.30 | 9.14 |
| 04| 120 | 23.97 | 96.23 | 9.14 |
| 05| 172 | 24.00 | 87.56 | 8.53 |
| 06| 118 | 23.96 | 66.36 | 6.40 |
| 07| 51 | 23.96 | 85.35 | 8.26 |
| 08| 141 | 23.94 | **100.43** | **9.85** |
| 09| 125 | 23.94 | **103.38** | **10.67** |
| 10| 95 | 23.78 | 59.04 | 5.57 |
| **mean** | | **23.96** | **87.30** | **8.49** |

#### GSM8K (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:----:|
| 01| 45 | 23.75 | 72.46 | 6.92 |
| 02| 111 | 23.89 | 60.99 | 5.95 |
| 03| 49 | 23.87 | 88.37 | 8.53 |
| 04| 70 | 23.68 | 57.84 | 5.45 |
| 05| 102 | 23.94 | 80.51 | 7.76 |
| 06| 118 | 23.86 | 66.60 | 6.40 |
| 07| 113 | 23.93 | 79.68 | 8.12 |
| 08| 50 | 23.16 | 66.76 | 6.74 |
| 09| 43 | 23.81 | 72.02 | 7.11 |
| 10| 96 | 23.85 | 63.93 | 6.24 |
| **mean** | | **23.77** | **70.92** | **6.92** |

#### Math500 (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:----:|
| 01| 257 | 23.94 | 72.87 | 7.11 |
| 02| 53 | 24.03 | 74.69 | 7.31 |
| 03| 40 | 23.34 | 81.72 | 8.00 |
| 04| 50 | 23.77 | 88.77 | 8.83 |
| 05| 117 | 23.49 | 63.59 | 6.40 |
| 06| 76 | 23.89 | 64.93 | 6.40 |
| 07| 43 | 23.59 | 68.49 | 6.74 |
| 08| 79 | 23.81 | 63.08 | 6.10 |
| 09| 52 | 23.92 | 60.94 | 5.82 |
| 10| 57 | 23.93 | 90.61 | 8.83 |
| **mean** | | **23.77** | **72.97** | **7.15** |

---

### RTX 3090 Desktop

#### HumanEval (10 samples)

| # | n_tok | AR | DFlash | AL |
|:-:|:-----:|:-----:|:------:|:-----:|
2 changes: 1 addition & 1 deletion dflash/deps/llama.cpp
166 changes: 166 additions & 0 deletions dflash/docs/prefix-cache-design.md
@@ -0,0 +1,166 @@
# DFlash Prefix Cache Intended Design

Status: design note, not implemented.

## Goal

Improve repeated coding-agent turns by avoiding full prefill when a later request can safely reuse token state from an earlier request.

The immediate serving default remains the safe text-only `agent-code-text` profile with prefix cache disabled. Prefix caching must be proven through token-level and chat-template equivalence tests before it becomes a default server behavior.

## Non-goals for v1

- No multi-slot trie cache.
- No cross-process persistence.
- No concurrent generations on one GPU worker.
- No arbitrary rollback to a shorter longest common prefix.
- No support for layer-segmented prefill reuse.
- No default-on OpenAI/Anthropic chat integration until real chat-template round trips are proven.

## Terminology

- `RUN`: current behavior. Reset daemon KV/SSM/conv state and prefill the full prompt.
- `RUN_PREFIX`: experimental behavior. Reuse daemon state only if the new prompt exactly extends the resident token sequence.
- `RESET`: destroy/recreate request state and invalidate resident metadata.
- `resident state`: daemon-side model state plus metadata describing the exact token sequence represented by that state.

## Required daemon protocol

The daemon should accept both the legacy protocol and an explicit protocol:

```text
/tmp/prompt.bin 512 # legacy, maps to RUN
RUN /tmp/prompt.bin 512 # reset + full prefill
RUN_PREFIX /tmp/prompt.bin 512 # exact-extension reuse attempt
RESET # clear resident state
```

Old clients must continue to work unchanged.
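A minimal sketch of how these command lines could be classified, e.g. in a protocol test. Only the line formats come from the block above; the parser itself is an assumption:

```python
# Hypothetical classifier for daemon command lines; the explicit forms and
# the legacy form are exactly the ones listed in the protocol above.
def parse_command(line: str) -> tuple[str, str | None, int]:
    parts = line.split()
    if parts == ["RESET"]:
        return ("RESET", None, 0)
    if len(parts) == 3 and parts[0] in ("RUN", "RUN_PREFIX"):
        return (parts[0], parts[1], int(parts[2]))
    if len(parts) == 2:  # legacy "<prompt.bin> <n_gen>" maps to RUN
        return ("RUN", parts[0], int(parts[1]))
    raise ValueError(f"unrecognized daemon command: {line!r}")

assert parse_command("/tmp/prompt.bin 512") == ("RUN", "/tmp/prompt.bin", 512)
assert parse_command("RUN_PREFIX /tmp/prompt.bin 512") == ("RUN_PREFIX", "/tmp/prompt.bin", 512)
assert parse_command("RESET") == ("RESET", None, 0)
```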

## Required daemon state model

Prefix reuse needs more than a vector of resident tokens. The daemon should track a state object similar to:

```cpp
enum class DaemonCommandKind {
Run,
RunPrefix,
Reset,
};

struct DaemonCommand {
DaemonCommandKind kind;
std::string prompt_path;
int n_gen = 0;
};

struct DaemonResidentState {
bool valid = false;
std::vector<int32_t> tokens;
int committed = 0;
int32_t last_tok = -1;
bool cache_decode_ready = false;
};
```

The state must make cache lifecycle explicit:

- whether cache is prefill-only or decode-ready;
- whether `migrate_prefill_cache()` has already happened;
- what token position `committed` represents;
- what token `last_tok` should seed decode with;
- whether any error invalidated reuse.

## v1 reuse rule

`RUN_PREFIX` v1 may reuse only exact extensions:

```cpp
reuse_len = lcp_tokens(resident.tokens, prompt);
can_reuse = resident.valid
&& reuse_len == resident.tokens.size()
&& reuse_len <= prompt.size();
```

If `can_reuse` is false, the daemon must fall back to `RUN` semantics and log a miss.

This is intentionally not a general longest-prefix cache. It avoids rollback in v1.
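The same rule with concrete hit/miss cases, as a Python sketch mirroring the C++ above (token values are illustrative; the non-empty check stands in for `resident.valid`):

```python
# v1 exact-extension rule: reuse only when the resident tokens are a
# prefix of the new prompt, i.e. the prompt extends them without edits.
def can_reuse(resident: list[int], prompt: list[int]) -> bool:
    lcp = 0
    while lcp < min(len(resident), len(prompt)) and resident[lcp] == prompt[lcp]:
        lcp += 1
    return len(resident) > 0 and lcp == len(resident)

assert can_reuse([1, 2, 3], [1, 2, 3, 4, 5])   # exact extension -> reuse
assert not can_reuse([1, 2, 3], [1, 2, 9, 4])  # diverges at token 3 -> miss, fall back to RUN
assert not can_reuse([1, 2, 3, 4], [1, 2])     # prompt shorter than resident -> miss
```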

## Generated-token caveat

Do not assume `resident.tokens = prompt + generated_tokens` is reusable for OpenAI/Anthropic chat traffic.

The Python server may:

- stop streaming when a stop token is observed;
- decode with `skip_special_tokens=True`;
- parse/normalize tool-call markup;
- omit hidden raw tokens from the visible assistant message.

Therefore the client’s next templated prompt may not exactly extend the daemon’s raw generated token sequence.

For chat-serving integration, prove token round-trip first:

1. Tokenize request A.
2. Run generation.
3. Build request B exactly as a real client would from visible assistant/tool output.
4. Tokenize request B.
5. Check whether request B starts with the daemon’s intended resident tokens.

If that assertion fails, Python must not use `RUN_PREFIX` for chat traffic.
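A sketch of that round-trip check with the HF tokenizer. The repo id is the draft model from RESULTS.md and `run_daemon` is a hypothetical hook returning the raw ids the daemon committed; the final prefix assertion is the point:

```python
from transformers import AutoTokenizer

# Tokenizer repo is illustrative; the server should use the tokenizer it already loads.
tok = AutoTokenizer.from_pretrained("z-lab/Qwen3.5-27B-DFlash")

# 1-2. Tokenize request A and run generation; raw ids come back from the daemon.
messages_a = [{"role": "user", "content": "Write a function that reverses a list."}]
tokens_a = tok.apply_chat_template(messages_a, tokenize=True, add_generation_prompt=True)
generated_ids = run_daemon(tokens_a)   # hypothetical hook: raw ids as the daemon saw them
resident = tokens_a + generated_ids    # what RUN_PREFIX would try to extend

# 3-4. Build request B exactly as a real client would, from the visible text only.
visible = tok.decode(generated_ids, skip_special_tokens=True)
messages_b = messages_a + [
    {"role": "assistant", "content": visible},
    {"role": "user", "content": "Now add type hints."},
]
tokens_b = tok.apply_chat_template(messages_b, tokenize=True, add_generation_prompt=True)

# 5. RUN_PREFIX is only safe for chat traffic if this holds.
assert tokens_b[: len(resident)] == resident, "chat template does not round-trip; use RUN"
```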

## Prefill scope

Prefix reuse v1 should support only token-segmented prefill.

If `DFLASH27B_LAYER_PREFILL=1` and `RUN_PREFIX` is requested, the daemon should either:

- fall back to `RUN` and log `reason=layer-prefill`; or
- return a clear unsupported error during development.

Layer-segmented prefill allocates full-prompt activation buffers and should not be part of the first reuse implementation.

## Observability

Every daemon request should log one stable parseable prefix-cache line:

```text
[prefix-cache] mode=disabled reason=run prompt_tokens=12000
[prefix-cache] mode=miss reason=not-extension reuse_tokens=0 suffix_tokens=13023 prompt_tokens=13023
[prefix-cache] mode=reuse reuse_tokens=12345 suffix_tokens=678 prompt_tokens=13023
```

Failures in daemon mode must emit `-1` to the stream fd and invalidate resident state.
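A small sketch of consuming these lines for monitoring. The field names come from the formats above; everything else (the regex, the reuse-fraction metric) is an assumption:

```python
import re

# Extract the key=value fields from one "[prefix-cache] ..." log line.
PREFIX_CACHE_LINE = re.compile(r"\[prefix-cache\]\s+(.*)")

def parse_prefix_cache_line(line: str) -> dict | None:
    m = PREFIX_CACHE_LINE.search(line)
    if not m:
        return None
    return dict(kv.split("=", 1) for kv in m.group(1).split())

sample = "[prefix-cache] mode=reuse reuse_tokens=12345 suffix_tokens=678 prompt_tokens=13023"
info = parse_prefix_cache_line(sample)
assert info["mode"] == "reuse"
# Fraction of this prompt served from resident state.
print(int(info["reuse_tokens"]) / int(info["prompt_tokens"]))  # ~0.948
```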

## Validation milestones

1. Protocol compatibility:
- legacy `<prompt.bin> <n_gen>` still works;
- `RUN` matches legacy behavior;
- `RESET` clears state.

2. Token-file equivalence:
- let B = A + suffix;
- compare `RUN B` against `RESET; RUN_PREFIX A; RUN_PREFIX B` under deterministic decoding;
   - outputs must match exactly (see the sketch after this list).

3. Chat-template equivalence:
- perform the same comparison through `/v1/chat/completions` and `/v1/messages`;
- include stop-token behavior, tool-call parsing, and `skip_special_tokens` behavior;
- only then consider Python-side `--prefix-cache` wiring.

4. Default-on decision:
- prefix cache may become default only after real coding-agent traces show reliable reuse and exact output equivalence.
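A sketch of the milestone-2 comparison. The `daemon` callable that sends a command plus token file and returns generated ids is hypothetical; only the equality requirement comes from the milestone above:

```python
# Milestone 2: "RUN B" must equal "RESET; RUN_PREFIX A; RUN_PREFIX B"
# under deterministic decoding, where B = A + suffix.
def check_token_file_equivalence(daemon, tokens_a, suffix, n_gen=128):
    tokens_b = tokens_a + suffix                     # exact extension of A

    daemon("RESET")
    baseline = daemon("RUN", tokens_b, n_gen)        # full prefill, no reuse

    daemon("RESET")
    daemon("RUN_PREFIX", tokens_a, n_gen)            # warm resident state with A
    reused = daemon("RUN_PREFIX", tokens_b, n_gen)   # should extend A's resident state

    assert reused == baseline, "prefix reuse changed the output; reuse is not safe"
```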

## Relationship to `agent-code-text`

The safe `agent-code-text` profile does not depend on prefix caching.

Default profile behavior should remain:

- text-only;
- 48K max context;
- 42K prompt admission limit;
- one serialized request per daemon;
- conservative prefill ubatch;
- prefix cache disabled.
32 changes: 32 additions & 0 deletions dflash/pyproject.toml
@@ -0,0 +1,32 @@
[project]
name = "dflash"
version = "0.1.0"
description = "Python scripts for dflash inference, benchmarking, and serving"
requires-python = ">=3.10"
dependencies = [
"transformers",
"numpy",
"gguf",
"fastapi",
"uvicorn[standard]",
"jinja2",
"pytest",
"httpx",
"datasets",
]

[project.optional-dependencies]
oracle = ["torch"]

[project.scripts]
dflash-server = "dflash.server:main"

[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[tool.uv]
package = true

[tool.setuptools.packages.find]
where = ["src"]