diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index 21f96573..d76c8693 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -29,6 +29,7 @@ endif() # Hardcoded for CUDA. No libllama, no BLAS, no Metal, no Vulkan. set(GGML_CUDA ON CACHE BOOL "" FORCE) +set(GGML_CUDA_GRAPHS ON CACHE BOOL "" FORCE) set(GGML_BACKEND_DL OFF CACHE BOOL "" FORCE) set(GGML_METAL OFF CACHE BOOL "" FORCE) set(GGML_VULKAN OFF CACHE BOOL "" FORCE) @@ -137,6 +138,26 @@ if(DFLASH27B_TESTS) target_include_directories(test_generate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_link_libraries(test_generate PRIVATE dflash27b ggml ggml-cuda) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_moe_draft.cpp") + add_executable(smoke_load_moe_draft test/smoke_load_moe_draft.cpp) + target_include_directories(smoke_load_moe_draft PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_load_moe_draft PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_moe_target_forward.cpp") + add_executable(smoke_moe_target_forward test/smoke_moe_target_forward.cpp) + target_include_directories(smoke_moe_target_forward PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_moe_target_forward PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_moe_ffn.cpp") + add_executable(smoke_moe_ffn test/smoke_moe_ffn.cpp) + target_include_directories(smoke_moe_ffn PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_moe_ffn PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_moe_target.cpp") + add_executable(smoke_load_moe_target test/smoke_load_moe_target.cpp) + target_include_directories(smoke_load_moe_target PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_load_moe_target PRIVATE dflash27b ggml ggml-cuda) + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_dflash.cpp") add_executable(test_dflash test/test_dflash.cpp) target_include_directories(test_dflash PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) diff --git a/dflash/deps/llama.cpp b/dflash/deps/llama.cpp index b6ffab4a..13722831 160000 --- a/dflash/deps/llama.cpp +++ b/dflash/deps/llama.cpp @@ -1 +1 @@ -Subproject commit b6ffab4a9d3ee7dc2bd39354c86f6bb11ab15420 +Subproject commit 13722831748a762108ad19fdf9112eb58f48378b diff --git a/dflash/src/gguf_target_loader.cpp b/dflash/src/gguf_target_loader.cpp index d6584fd5..d23dadbc 100644 --- a/dflash/src/gguf_target_loader.cpp +++ b/dflash/src/gguf_target_loader.cpp @@ -202,7 +202,8 @@ bool load_target_gguf(const std::string & path, return false; } - // Validate arch + the dimensions we hardcode everywhere. 
+ // Validate arch — accept both qwen35 (dense) and qwen35moe (MoE) + std::string arch; { int64_t arch_id = gguf_find_key(gctx, "general.architecture"); if (arch_id < 0) { @@ -210,48 +211,80 @@ bool load_target_gguf(const std::string & path, gguf_free(gctx); return false; } - const char * arch = gguf_get_val_str(gctx, arch_id); - if (std::string(arch) != "qwen35") { - set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35)"); + arch = gguf_get_val_str(gctx, arch_id); + if (arch != "qwen35" && arch != "qwen35moe") { + set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35 or qwen35moe)"); gguf_free(gctx); return false; } } std::string err; - const uint32_t n_embd = get_u32_or(gctx, "qwen35.embedding_length", 0); - const uint32_t n_ff = get_u32_or(gctx, "qwen35.feed_forward_length", 0); - const uint32_t n_layer= get_u32_or(gctx, "qwen35.block_count", 0); - const uint32_t n_head = get_u32_or(gctx, "qwen35.attention.head_count",0); - const uint32_t n_headkv=get_u32_or(gctx, "qwen35.attention.head_count_kv",0); - const uint32_t kl = get_u32_or(gctx, "qwen35.attention.key_length", 0); - const uint32_t vl = get_u32_or(gctx, "qwen35.attention.value_length", 0); - const uint32_t fai = get_u32_or(gctx, "qwen35.full_attention_interval",0); - const uint32_t ssm_conv = get_u32_or(gctx, "qwen35.ssm.conv_kernel", 0); - const uint32_t ssm_inner = get_u32_or(gctx, "qwen35.ssm.inner_size", 0); - const uint32_t ssm_state = get_u32_or(gctx, "qwen35.ssm.state_size", 0); - const uint32_t ssm_dt = get_u32_or(gctx, "qwen35.ssm.time_step_rank",0); - const uint32_t ssm_grp = get_u32_or(gctx, "qwen35.ssm.group_count", 0); - - if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 || - kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 || - ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 || - ssm_dt != 48 || ssm_grp != 16) { - char buf[512]; - std::snprintf(buf, sizeof(buf), - "unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " - "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", - n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, - ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); - set_last_error(buf); - gguf_free(gctx); - return false; + const uint32_t n_embd = get_u32_or(gctx, (arch + ".embedding_length").c_str(), 0); + const uint32_t n_ff = get_u32_or(gctx, (arch + ".feed_forward_length").c_str(), 0); + const uint32_t n_layer= get_u32_or(gctx, (arch + ".block_count").c_str(), 0); + const uint32_t n_head = get_u32_or(gctx, (arch + ".attention.head_count").c_str(),0); + const uint32_t n_headkv=get_u32_or(gctx, (arch + ".attention.head_count_kv").c_str(),0); + const uint32_t kl = get_u32_or(gctx, (arch + ".attention.key_length").c_str(), 0); + const uint32_t vl = get_u32_or(gctx, (arch + ".attention.value_length").c_str(), 0); + const uint32_t fai = get_u32_or(gctx, (arch + ".full_attention_interval").c_str(),0); + const uint32_t ssm_conv = get_u32_or(gctx, (arch + ".ssm.conv_kernel").c_str(), 0); + const uint32_t ssm_inner = get_u32_or(gctx, (arch + ".ssm.inner_size").c_str(), 0); + const uint32_t ssm_state = get_u32_or(gctx, (arch + ".ssm.state_size").c_str(), 0); + const uint32_t ssm_dt = get_u32_or(gctx, (arch + ".ssm.time_step_rank").c_str(),0); + const uint32_t ssm_grp = get_u32_or(gctx, (arch + ".ssm.group_count").c_str(), 0); + + // MoE fields (zero for dense qwen35) + const uint32_t n_expert = get_u32_or(gctx, (arch + ".expert_count").c_str(), 0); + const uint32_t n_expert_used = get_u32_or(gctx, (arch 
+ ".expert_used_count").c_str(), 0); + const uint32_t expert_ff = get_u32_or(gctx, (arch + ".expert_feed_forward_length").c_str(), 0); + const uint32_t shared_ff = get_u32_or(gctx, (arch + ".expert_shared_feed_forward_length").c_str(), 0); + + const bool is_moe = (arch == "qwen35moe"); + + if (is_moe) { + // Validate qwen35moe hparams: 40 layers, 2048 hidden, 16 heads, 2 kv heads, + // 256 experts, 8 active, 512 expert ff, 512 shared ff + if (n_embd != 2048 || n_layer != 40 || n_head != 16 || n_headkv != 2 || + kl != 256 || vl != 256 || fai != 4 || + ssm_conv != 4 || ssm_inner != 4096 || ssm_state != 128 || + ssm_dt != 32 || ssm_grp != 16 || + n_expert != 256 || n_expert_used != 8 || + expert_ff != 512 || shared_ff != 512) { + char buf[512]; + std::snprintf(buf, sizeof(buf), + "unexpected qwen35moe hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "kl=%u vl=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u} " + "n_expert=%u n_expert_used=%u expert_ff=%u shared_ff=%u", + n_embd, n_layer, n_head, n_headkv, kl, vl, fai, + ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp, + n_expert, n_expert_used, expert_ff, shared_ff); + set_last_error(buf); + gguf_free(gctx); + return false; + } + } else { + // Validate qwen35 dense hparams (unchanged) + if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 || + kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 || + ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 || + ssm_dt != 48 || ssm_grp != 16) { + char buf[512]; + std::snprintf(buf, sizeof(buf), + "unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", + n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, + ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); + set_last_error(buf); + gguf_free(gctx); + return false; + } } // rope dimension_sections (array of 4 uint32) int rope_sections[4] = {0, 0, 0, 0}; { - int64_t rid = gguf_find_key(gctx, "qwen35.rope.dimension_sections"); + int64_t rid = gguf_find_key(gctx, (arch + ".rope.dimension_sections").c_str()); if (rid >= 0) { size_t n = gguf_get_arr_n(gctx, rid); if (n >= 4) { @@ -265,7 +298,7 @@ bool load_target_gguf(const std::string & path, out.backend = backend; out.n_layer = (int)n_layer; out.n_embd = (int)n_embd; - out.n_ff = (int)n_ff; + out.n_ff = is_moe ? 0 : (int)n_ff; out.n_head = (int)n_head; out.n_head_kv = (int)n_headkv; out.n_embd_head_k = (int)kl; @@ -277,6 +310,10 @@ bool load_target_gguf(const std::string & path, out.ssm_d_state= (int)ssm_state; out.ssm_dt_rank= (int)ssm_dt; out.ssm_n_group= (int)ssm_grp; + out.n_expert = (int)n_expert; + out.n_expert_used = (int)n_expert_used; + out.expert_ff_dim = (int)expert_ff; + out.shared_ff_dim = (int)shared_ff; out.layers.assign((size_t)n_layer, TargetLayer{}); // ── 2. 
Wire our layer pointers to tensors inside meta_ctx ───────── @@ -303,15 +340,48 @@ bool load_target_gguf(const std::string & path, // Always-present tensors L.attn_norm = fnd("attn_norm.weight"); L.attn_post_norm = fnd("post_attention_norm.weight"); + + // Dense FFN tensors (qwen35) or MoE tensors (qwen35moe) L.w_gate = fnd("ffn_gate.weight"); L.w_up = fnd("ffn_up.weight"); L.w_down = fnd("ffn_down.weight"); - if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) { - char b[128]; - std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il); - set_last_error(b); - gguf_free(gctx); - return false; + + // MoE FFN tensors (qwen35moe only; null for dense) + L.ffn_gate_inp = fnd("ffn_gate_inp.weight"); + L.ffn_up_exps = fnd("ffn_up_exps.weight"); + L.ffn_gate_exps = fnd("ffn_gate_exps.weight"); + L.ffn_down_exps = fnd("ffn_down_exps.weight"); + L.ffn_up_shexp = fnd("ffn_up_shexp.weight"); + L.ffn_gate_shexp = fnd("ffn_gate_shexp.weight"); + L.ffn_down_shexp = fnd("ffn_down_shexp.weight"); + L.ffn_gate_inp_shexp = fnd("ffn_gate_inp_shexp.weight"); + + if (!is_moe) { + // Dense model: gate/up/down required + if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) { + char b[128]; + std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + } else { + // MoE model: attn_norm + post_norm + expert tensors required + if (!L.attn_norm || !L.attn_post_norm) { + char b[128]; + std::snprintf(b, sizeof(b), "layer %d: missing attn_norm/post_norm", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!L.ffn_gate_inp || !L.ffn_up_exps || !L.ffn_gate_exps || !L.ffn_down_exps || + !L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp || !L.ffn_gate_inp_shexp) { + char b[256]; + std::snprintf(b, sizeof(b), "layer %d: missing MoE tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } } // Full-attention tensors (only on layers where (il+1)%fai == 0, diff --git a/dflash/src/internal.h b/dflash/src/internal.h index 536cdcc6..2ec12ba7 100644 --- a/dflash/src/internal.h +++ b/dflash/src/internal.h @@ -49,10 +49,22 @@ struct TargetLayer { ggml_tensor * attn_norm = nullptr; // [hidden] ggml_tensor * attn_post_norm = nullptr; // [hidden] (post-block norm before FFN) ggml_tensor * ffn_norm = nullptr; // [hidden] + + // Dense FFN (non-null for qwen35 dense target) ggml_tensor * w_gate = nullptr; // [hidden, intermediate] ggml_tensor * w_up = nullptr; // [hidden, intermediate] ggml_tensor * w_down = nullptr; // [intermediate, hidden] + // MoE FFN (non-null for qwen35moe target) + ggml_tensor * ffn_gate_inp = nullptr; // [n_embd, n_expert] — router + ggml_tensor * ffn_up_exps = nullptr; // [expert_ff, n_embd, n_expert] + ggml_tensor * ffn_gate_exps = nullptr; // [expert_ff, n_embd, n_expert] + ggml_tensor * ffn_down_exps = nullptr; // [n_embd, expert_ff, n_expert] + ggml_tensor * ffn_up_shexp = nullptr; // [shared_ff, n_embd] — shared expert + ggml_tensor * ffn_gate_shexp = nullptr; // [shared_ff, n_embd] + ggml_tensor * ffn_down_shexp = nullptr; // [n_embd, shared_ff] + ggml_tensor * ffn_gate_inp_shexp = nullptr; // [n_embd] — shared expert gate + // Full-attention block (non-null for layers where (il+1) % 4 == 0) ggml_tensor * wq = nullptr; // [hidden, q_dim] ggml_tensor * wk = nullptr; // [hidden, kv_dim] @@ -127,6 +139,13 @@ struct TargetWeights { int ssm_d_state = 128; int ssm_dt_rank = 48; int ssm_n_group = 16; + + // MoE-specific (zero for dense models) + 
int n_expert = 0; + int n_expert_used = 0; + int expert_ff_dim = 0; + int shared_ff_dim = 0; + float expert_weights_scale = 1.0f; }; // Load a Q4_K_M target model from a GGUF file on disk. @@ -139,6 +158,24 @@ void free_target_weights(TargetWeights & w); // ─── Draft weights (z-lab DFlash, bf16) ─────────────────────────── +struct DraftHparams { + int n_layer = DFLASH27B_DRAFT_LAYERS; + int hidden = DFLASH27B_TARGET_HIDDEN; + int n_head = DFLASH27B_TARGET_N_HEADS; + int n_kv_head = DFLASH27B_TARGET_N_KV_HEADS; + int head_dim = DFLASH27B_TARGET_HEAD_DIM; + int intermediate = DFLASH27B_TARGET_INTERMEDIATE; + int n_target_layers = DFLASH27B_DRAFT_N_TARGET_LAYERS; + int block_size = DFLASH27B_DRAFT_BLOCK_SIZE; + int mask_token_id = DFLASH27B_DRAFT_MASK_TOKEN_ID; + float rope_theta = DFLASH27B_ROPE_THETA; + float rms_eps = DFLASH27B_RMS_EPS; + float rope_factor = 1.0f; + float rope_beta_fast = 0.0f; + float rope_beta_slow = 0.0f; + int rope_orig_ctx = 0; +}; + struct DraftLayer { ggml_tensor * attn_norm; ggml_tensor * ffn_norm; @@ -158,10 +195,11 @@ struct DraftWeights { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buf = nullptr; - ggml_tensor * fc = nullptr; // [5*hidden, hidden] - ggml_tensor * hidden_norm = nullptr; // [hidden] - std::vector layers; // size = 5 - ggml_tensor * out_norm = nullptr; // [hidden] + DraftHparams hparams; + ggml_tensor * fc = nullptr; + ggml_tensor * hidden_norm = nullptr; + std::vector layers; + ggml_tensor * out_norm = nullptr; }; bool load_draft_safetensors(const std::string & path, @@ -310,6 +348,16 @@ QwenGraphOutputs build_qwen35_graph( TargetCache & cache, const QwenGraphInputs & in); +// MoE FFN forward pass (qwen35moe). Computes expert routing, per-expert +// SwiGLU, shared expert with sigmoid gating, and returns the combined output. +// Shape: [n_embd, n_tokens] f32. +ggml_tensor * build_moe_ffn( + ggml_context * ctx, + ggml_cgraph * gf, + ggml_tensor * cur, + const TargetLayer & L, + const TargetWeights & w); + // Build a single-layer forward graph. Mirrors build_qwen35_graph but processes // only one layer, taking `inp` as the input activation and returning the output. // Used by layer-segmented prefill to iterate layers as the outer loop. 
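For orientation, the routing that the new `build_moe_ffn` declaration above promises (softmax over 256 experts, keep the top 8, renormalize, then add a sigmoid-gated shared expert) can be summarized by a small scalar sketch. This is illustration only, not part of the patch; `route_token` is a hypothetical helper, and the ggml graph in qwen35_target_graph.cpp below is the real implementation.

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// Per-token router weights, mirroring build_moe_ffn steps 2-5:
// softmax over all experts, top-k selection, renormalize the kept weights.
std::vector<std::pair<int, float>> route_token(const std::vector<float> & router_logits,
                                               int n_expert_used) {
    std::vector<float> probs(router_logits.size());
    const float mx = *std::max_element(router_logits.begin(), router_logits.end());
    float sum = 0.0f;
    for (size_t e = 0; e < probs.size(); e++) {
        probs[e] = std::exp(router_logits[e] - mx);
        sum += probs[e];
    }
    for (float & p : probs) p /= sum;                                     // softmax

    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; }); // top-k

    float kept = 0.0f;
    for (int k = 0; k < n_expert_used; k++) kept += probs[idx[k]];
    std::vector<std::pair<int, float>> out;
    for (int k = 0; k < n_expert_used; k++)
        out.emplace_back(idx[k], probs[idx[k]] / kept);                   // renormalized weight
    return out;
    // Layer output = sum_k w_k * down_k(silu(gate_k(x)) * up_k(x))
    //              + sigmoid(g_sh(x)) * down_sh(silu(gate_sh(x)) * up_sh(x))
}
```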
diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp index e09221b5..27d38d95 100644 --- a/dflash/src/qwen35_target_graph.cpp +++ b/dflash/src/qwen35_target_graph.cpp @@ -81,6 +81,13 @@ bool create_target_cache(const TargetWeights & w, max_verify_tokens = DFLASH27B_DRAFT_BLOCK_SIZE; } + const int head_dim = w.n_embd_head_k; + const int n_head_kv = w.n_head_kv; + const int num_v_heads = w.ssm_dt_rank; + const int head_v_dim = w.ssm_d_inner / num_v_heads; + const int conv_kern = w.ssm_d_conv; + const int conv_channels = w.ssm_d_inner + 2 * w.ssm_n_group * w.ssm_d_state; + const int n_full_attn = w.n_layer / w.full_attention_interval; // 16 const int n_delta = w.n_layer - n_full_attn; // 48 @@ -138,9 +145,9 @@ bool create_target_cache(const TargetWeights & w, if (is_attn) { // [head_dim, max_ctx_alloc, n_head_kv] ggml_tensor * K = ggml_new_tensor_3d(out.ctx, kv_k_type, - q35::HEAD_DIM, max_ctx_alloc, q35::N_HEAD_KV); + head_dim, max_ctx_alloc, n_head_kv); ggml_tensor * V = ggml_new_tensor_3d(out.ctx, kv_v_type, - q35::HEAD_DIM, max_ctx_alloc, q35::N_HEAD_KV); + head_dim, max_ctx_alloc, n_head_kv); char name[64]; std::snprintf(name, sizeof(name), "cache_k_%d", il); ggml_set_name(K, name); @@ -317,6 +324,7 @@ static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur, static ggml_tensor * build_full_attn_block( ggml_context * ctx, ggml_cgraph * gf, + const TargetWeights & w, const TargetLayer & L, ggml_tensor * cur, ggml_tensor * positions, @@ -329,35 +337,42 @@ static ggml_tensor * build_full_attn_block( ggml_type kv_k_type, int fa_window = 0 ) { + const int head_dim = w.n_embd_head_k; + const int n_head = w.n_head; + const int n_head_kv = w.n_head_kv; + const int q_dim = n_head * head_dim; + constexpr float eps = 1e-6f; + constexpr float rope_theta = 10000000.0f; + // ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens] ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur); // Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves - QG = ggml_reshape_3d(ctx, QG, q35::HEAD_DIM * 2, q35::N_HEAD, n_tokens); + QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens); // Q half: view at offset 0, stride head_dim*2 // Layout: [head_dim, n_head, n_tokens] ggml_tensor * Q = ggml_view_3d(ctx, QG, - q35::HEAD_DIM, q35::N_HEAD, n_tokens, - ggml_element_size(QG) * q35::HEAD_DIM * 2, // nb1: stride over n_head - ggml_element_size(QG) * q35::HEAD_DIM * 2 * q35::N_HEAD, // nb2: stride over n_tokens + head_dim, n_head, n_tokens, + ggml_element_size(QG) * head_dim * 2, // nb1: stride over n_head + ggml_element_size(QG) * head_dim * 2 * n_head, // nb2: stride over n_tokens /*offset*/ 0); - Q = rms_norm_mul(ctx, Q, L.q_norm, q35::EPS); + Q = rms_norm_mul(ctx, Q, L.q_norm, eps); // Gate half: view at offset head_dim ggml_tensor * gate = ggml_view_3d(ctx, QG, - q35::HEAD_DIM, q35::N_HEAD, n_tokens, - ggml_element_size(QG) * q35::HEAD_DIM * 2, - ggml_element_size(QG) * q35::HEAD_DIM * 2 * q35::N_HEAD, - ggml_element_size(QG) * q35::HEAD_DIM); - gate = ggml_cont_2d(ctx, gate, q35::HEAD_DIM * q35::N_HEAD, n_tokens); // [q_dim, n_tokens] + head_dim, n_head, n_tokens, + ggml_element_size(QG) * head_dim * 2, + ggml_element_size(QG) * head_dim * 2 * n_head, + ggml_element_size(QG) * head_dim); + gate = ggml_cont_2d(ctx, gate, head_dim * n_head, n_tokens); // [q_dim, n_tokens] // ── K and V projections ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens] ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, 
n_tokens] - Kcur = ggml_reshape_3d(ctx, Kcur, q35::HEAD_DIM, q35::N_HEAD_KV, n_tokens); - Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, q35::EPS); - Vcur = ggml_reshape_3d(ctx, Vcur, q35::HEAD_DIM, q35::N_HEAD_KV, n_tokens); + Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens); + Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, eps); + Vcur = ggml_reshape_3d(ctx, Vcur, head_dim, n_head_kv, n_tokens); // ── M-RoPE (multi-axis rotary). n_rot = HEAD_DIM/4 * 4 ? Actually // ggml_rope_multi takes n_dims = the number of dims to rotate; for @@ -368,11 +383,11 @@ static ggml_tensor * build_full_attn_block( Q = ggml_rope_multi(ctx, Q, positions, /*freq_factors=*/nullptr, n_rot, sections, GGML_ROPE_TYPE_MROPE, - /*n_ctx_orig=*/0, q35::ROPE_THETA, 1.0f, + /*n_ctx_orig=*/0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); Kcur = ggml_rope_multi(ctx, Kcur, positions, nullptr, n_rot, sections, GGML_ROPE_TYPE_MROPE, - 0, q35::ROPE_THETA, 1.0f, + 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // ── Write K/V into the persistent cache at slot [kv_start..kv_start+n_tokens) @@ -387,11 +402,11 @@ static ggml_tensor * build_full_attn_block( ggml_tensor * Vcur_T = ggml_permute(ctx, Vcur, 0, 2, 1, 3); // [head_dim, n_tokens, n_head_kv] ggml_tensor * k_slot = ggml_view_3d(ctx, cache_k, - q35::HEAD_DIM, n_tokens, q35::N_HEAD_KV, + head_dim, n_tokens, n_head_kv, cache_k->nb[1], cache_k->nb[2], /*offset*/ cache_k->nb[1] * kv_start); ggml_tensor * v_slot = ggml_view_3d(ctx, cache_v, - q35::HEAD_DIM, n_tokens, q35::N_HEAD_KV, + head_dim, n_tokens, n_head_kv, cache_v->nb[1], cache_v->nb[2], cache_v->nb[1] * kv_start); @@ -432,7 +447,7 @@ static ggml_tensor * build_full_attn_block( // to all keys is trivially causal). For n_tokens>1 the caller must provide // a mask shaped [kv_len, n_tokens] with 0 for attendable positions and // -inf for positions beyond the causal boundary. 
- const float kq_scale = 1.0f / std::sqrt((float)q35::HEAD_DIM); + const float kq_scale = 1.0f / std::sqrt((float)head_dim); ggml_tensor * attn = ggml_flash_attn_ext(ctx, Qfa, Kfa, Vfa, attn_mask, kq_scale, 0.0f, 0.0f); // attn: [head_dim, n_head, n_tokens] (permuted) @@ -443,7 +458,7 @@ static ggml_tensor * build_full_attn_block( attn = ggml_turbo_wht(ctx, attn, 1); } - attn = ggml_reshape_2d(ctx, attn, q35::Q_DIM, n_tokens); + attn = ggml_reshape_2d(ctx, attn, q_dim, n_tokens); // ── Apply the sigmoid gate from the packed Q ggml_tensor * gate_sig = ggml_sigmoid(ctx, gate); @@ -467,6 +482,7 @@ static ggml_tensor * build_full_attn_block( static ggml_tensor * build_delta_net_block( ggml_context * ctx, ggml_cgraph * gf, + const TargetWeights & w, const TargetLayer & L, ggml_tensor * cur, // [hidden, n_tokens] ggml_tensor * conv_state, // [kernel-1, conv_channels] persistent @@ -475,17 +491,21 @@ static ggml_tensor * build_delta_net_block( DeltaNetCapture * cap, // optional: populated on capture_delta_intermediate ggml_tensor * parent_ids // optional [n_tokens] i32; tree mode when non-null ) { - const int d_inner = q35::SSM_D_INNER; - const int head_k_dim = q35::HEAD_K_DIM; // 128 - const int num_k_heads = q35::SSM_N_GROUP; // 16 - const int num_v_heads = q35::SSM_DT_RANK; // 48 - const int head_v_dim = q35::HEAD_V_DIM; // 128 + const int d_inner = w.ssm_d_inner; + const int head_k_dim = w.ssm_d_state; + const int num_k_heads = w.ssm_n_group; + const int num_v_heads = w.ssm_dt_rank; + const int head_v_dim = d_inner / num_v_heads; + const int conv_kern = w.ssm_d_conv; + const int conv_channels = d_inner + 2 * num_k_heads * head_k_dim; + const int hidden = w.n_embd; + constexpr float eps = 1e-6f; const int n_seqs = 1; const int n_seq_tokens = n_tokens; - // ── qkv_mixed = wqkv @ cur [10240, n_tokens] + // ── qkv_mixed = wqkv @ cur [conv_channels, n_tokens] ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur); - qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, q35::CONV_CHANNELS, n_seq_tokens, n_seqs); + qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs); // ── z = wqkv_gate @ cur [inner, n_tokens] ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur); @@ -509,7 +529,7 @@ static ggml_tensor * build_delta_net_block( // ── Fetch conv state [kernel-1, conv_channels] and prepend to qkv_mixed // along the token axis to form the convolution input. ggml_tensor * conv_states_r = ggml_reshape_3d(ctx, conv_state, - q35::SSM_CONV_KERN - 1, q35::CONV_CHANNELS, n_seqs); + conv_kern - 1, conv_channels, n_seqs); // qkv_mixed currently is [conv_channels, n_tokens, n_seqs]; we need // [n_tokens, conv_channels, n_seqs] to concat on dim 0. 
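The hunks above replace the hard-coded q35:: DeltaNet constants with values derived from the loaded hparams. A quick arithmetic check (standalone, not part of the patch) that the derived dimensions come out right for both configurations the loader accepts:

```cpp
#include <cassert>

// head_v_dim    = ssm_d_inner / ssm_dt_rank
// conv_channels = ssm_d_inner + 2 * ssm_n_group * ssm_d_state
int main() {
    // qwen35 (dense): inner=6144, dt_rank=48, n_group=16, d_state=128
    assert(6144 / 48 == 128);               // head_v_dim, same as the old q35::HEAD_V_DIM
    assert(6144 + 2 * 16 * 128 == 10240);   // conv_channels, same as the old q35::CONV_CHANNELS

    // qwen35moe: inner=4096, dt_rank=32, n_group=16, d_state=128
    assert(4096 / 32 == 128);               // head_v_dim stays 128
    assert(4096 + 2 * 16 * 128 == 8192);    // conv_channels shrinks to 8192
    return 0;
}
```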
@@ -529,9 +549,9 @@ static ggml_tensor * build_delta_net_block( // ── Save the last (kernel-1) steps back to conv_state ggml_tensor * last_conv = ggml_view_3d(ctx, conv_input, - q35::SSM_CONV_KERN - 1, q35::CONV_CHANNELS, n_seqs, + conv_kern - 1, conv_channels, n_seqs, conv_input->nb[1], conv_input->nb[2], - (conv_input->ne[0] - (q35::SSM_CONV_KERN - 1)) * ggml_element_size(conv_input)); + (conv_input->ne[0] - (conv_kern - 1)) * ggml_element_size(conv_input)); ggml_build_forward_expand(gf, ggml_cpy(ctx, last_conv, conv_state)); // ── 1D conv + silu @@ -550,7 +570,7 @@ static ggml_tensor * build_delta_net_block( const int64_t v_offset = 2 * num_k_heads * head_k_dim; const size_t elt = ggml_element_size(conv_out); - const size_t row_size = q35::CONV_CHANNELS * elt; + const size_t row_size = conv_channels * elt; ggml_tensor * q_c = ggml_view_4d(ctx, conv_out, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, @@ -572,8 +592,8 @@ static ggml_tensor * build_delta_net_block( v_offset * elt); // L2 norm on Q and K - q_c = ggml_l2_norm(ctx, q_c, q35::EPS); - k_c = ggml_l2_norm(ctx, k_c, q35::EPS); + q_c = ggml_l2_norm(ctx, q_c, eps); + k_c = ggml_l2_norm(ctx, k_c, eps); // Repeat Q and K from num_k_heads to num_v_heads so they match V's layout // (only needed if not using the fused op's broadcast support). @@ -697,7 +717,7 @@ static ggml_tensor * build_delta_net_block( // ── Gated output norm: rms_norm(output) * silu(z_4d) ggml_tensor * z_4d = ggml_reshape_4d(ctx, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); - ggml_tensor * output_n = ggml_rms_norm(ctx, output, q35::EPS); + ggml_tensor * output_n = ggml_rms_norm(ctx, output, eps); output_n = ggml_mul(ctx, output_n, L.ssm_norm); ggml_tensor * z_silu = ggml_silu(ctx, z_4d); output_n = ggml_mul(ctx, output_n, z_silu); @@ -708,7 +728,7 @@ static ggml_tensor * build_delta_net_block( // Output projection ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat); - out = ggml_reshape_2d(ctx, out, q35::N_HEAD * 0 + DFLASH27B_TARGET_HIDDEN, n_seq_tokens * n_seqs); + out = ggml_reshape_2d(ctx, out, hidden, n_seq_tokens * n_seqs); return out; } @@ -748,7 +768,7 @@ static ggml_tensor * build_single_layer( for (int il = 0; il < layer_idx; il++) { if (((il + 1) % w.full_attention_interval) == 0) fa_idx++; } - cur = build_full_attn_block(ctx, gf, L, cur, positions, w.rope_sections, + cur = build_full_attn_block(ctx, gf, w, L, cur, positions, w.rope_sections, cache.attn_k[fa_idx], cache.attn_v[fa_idx], attn_mask, kv_start, n_tokens, cache.kv_k_type, fa_window); @@ -757,7 +777,7 @@ static ggml_tensor * build_single_layer( for (int il = 0; il < layer_idx; il++) { if (((il + 1) % w.full_attention_interval) != 0) dn_idx++; } - cur = build_delta_net_block(ctx, gf, L, cur, + cur = build_delta_net_block(ctx, gf, w, L, cur, cache.conv_state[dn_idx], cache.ssm_state[dn_idx], n_tokens, nullptr, nullptr); } @@ -834,13 +854,18 @@ QwenGraphOutputs build_qwen35_graph( og_early.delta_captures.resize(n_delta); } - // DFlash target layer IDs for feature capture: {1, 16, 31, 46, 61} + // DFlash target layer IDs for feature capture. // HF hidden_states[lid+1] convention — capture AFTER layer 'lid' runs. - static const int CAPTURE_LAYERS[DFLASH27B_DRAFT_N_TARGET_LAYERS] = - { 1, 16, 31, 46, 61 }; + // Evenly spaced across the layers, always 5 capture points. 
+ // Formula: il = 1 + k * (n_layer - 2) / (n_capture - 1) for k=0..4 + int CAPTURE_LAYERS[DFLASH27B_DRAFT_N_TARGET_LAYERS]; + const int capture_step = (w.n_layer - 2) / (DFLASH27B_DRAFT_N_TARGET_LAYERS - 1); + for (int k = 0; k < DFLASH27B_DRAFT_N_TARGET_LAYERS; k++) { + CAPTURE_LAYERS[k] = 1 + k * capture_step; + } const int hidden = w.n_embd; - const float eps = q35::EPS; + constexpr float eps = 1e-6f; for (int il = 0; il < w.n_layer; il++) { const TargetLayer & L = w.layers[il]; @@ -852,7 +877,7 @@ QwenGraphOutputs build_qwen35_graph( ggml_tensor * cur = rms_norm_mul(ctx, inpL, L.attn_norm, eps); if (is_attn) { - cur = build_full_attn_block(ctx, gf, L, cur, in.positions, w.rope_sections, + cur = build_full_attn_block(ctx, gf, w, L, cur, in.positions, w.rope_sections, cache.attn_k[fa_idx], cache.attn_v[fa_idx], in.attn_mask, in.kv_start, n_tokens, cache.kv_k_type, in.fa_window); @@ -869,7 +894,7 @@ QwenGraphOutputs build_qwen35_graph( cap_ptr->ssm_intermediate_states = cache.ssm_intermediate[dn_idx]; cap_ptr->conv_input = cache.conv_input_cache[dn_idx]; } - cur = build_delta_net_block(ctx, gf, L, cur, + cur = build_delta_net_block(ctx, gf, w, L, cur, cache.conv_state[dn_idx], cache.ssm_state[dn_idx], n_tokens, cap_ptr, in.parent_ids); dn_idx++; @@ -882,8 +907,10 @@ QwenGraphOutputs build_qwen35_graph( ggml_tensor * ffn_residual = cur; ggml_tensor * post = rms_norm_mul(ctx, cur, L.attn_post_norm, eps); - // SwiGLU FFN - ggml_tensor * ffn = build_swiglu_ffn(ctx, post, L); + // FFN: MoE or dense SwiGLU depending on model + ggml_tensor * ffn = (w.n_expert > 0) + ? build_moe_ffn(ctx, gf, post, L, w) + : build_swiglu_ffn(ctx, post, L); cur = ggml_add(ctx, ffn, ffn_residual); // ── DFlash layer feature capture ── @@ -936,7 +963,7 @@ QwenGraphOutputs build_qwen35_graph( } // 2. Final norm - ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, q35::EPS); + ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, eps); // 3. LM head ggml_tensor * logits = ggml_mul_mat(ctx, w.output, out); @@ -949,6 +976,97 @@ QwenGraphOutputs build_qwen35_graph( return og; } +ggml_tensor * build_moe_ffn( + ggml_context * ctx, + ggml_cgraph * gf, + ggml_tensor * cur, + const TargetLayer & L, + const TargetWeights & w) { + + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + const int64_t n_expert = w.n_expert; + const int64_t n_expert_used = w.n_expert_used; + + // 1. Router: gate_inp @ cur → [n_expert, n_tokens] + ggml_tensor * logits = ggml_mul_mat(ctx, L.ffn_gate_inp, cur); + + // 2. Softmax gating + ggml_tensor * probs = ggml_soft_max(ctx, logits); + + // 3. Top-k expert selection: [n_expert_used, n_tokens] + ggml_tensor * selected = ggml_argsort_top_k(ctx, probs, (int)n_expert_used); + + // 4. Extract weights for selected experts + probs = ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens); + ggml_tensor * weights = ggml_get_rows(ctx, probs, selected); + + // 5. Normalize weights + weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); + ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); + weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY); + weights = ggml_div(ctx, weights, weights_sum); + weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); + ggml_build_forward_expand(gf, weights); + + // 6. Reshape input for batched expert matmul + cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); + + // 7. 
Per-expert projections via ggml_mul_mat_id + // gate + ggml_tensor * gate = ggml_mul_mat_id(ctx, L.ffn_gate_exps, cur, selected); + // up + ggml_tensor * up = ggml_mul_mat_id(ctx, L.ffn_up_exps, cur, selected); + + // 8. SwiGLU: silu(gate) * up + ggml_tensor * gu = ggml_swiglu_split(ctx, gate, up); + + // 9. Down projection + ggml_tensor * experts = ggml_mul_mat_id(ctx, L.ffn_down_exps, gu, selected); + + // 10. Apply weights + experts = ggml_mul(ctx, experts, weights); + ggml_build_forward_expand(gf, experts); + + // 11. Sum across selected experts + ggml_tensor * moe_out = nullptr; + for (int64_t i = 0; i < n_expert_used; i++) { + ggml_tensor * view = ggml_view_2d(ctx, experts, n_embd, n_tokens, + experts->nb[2], i * experts->nb[1]); + ggml_build_forward_expand(gf, view); + if (i == 0) { + moe_out = view; + } else { + moe_out = ggml_add(ctx, moe_out, view); + ggml_build_forward_expand(gf, moe_out); + } + } + if (n_expert_used == 1) { + moe_out = ggml_cont(ctx, moe_out); + } + + // 12. Shared expert path + ggml_tensor * sh_gate = ggml_mul_mat(ctx, L.ffn_gate_shexp, cur); + ggml_tensor * sh_up = ggml_mul_mat(ctx, L.ffn_up_shexp, cur); + sh_gate = ggml_silu(ctx, sh_gate); + sh_gate = ggml_reshape_2d(ctx, sh_gate, w.expert_ff_dim, n_tokens); + sh_up = ggml_reshape_2d(ctx, sh_up, w.expert_ff_dim, n_tokens); + ggml_tensor * sh_gu = ggml_mul(ctx, sh_gate, sh_up); + ggml_tensor * sh_down = ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu); + + // Shared expert gating (sigmoid) + ggml_tensor * shared_gate = ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, cur); + shared_gate = ggml_sigmoid(ctx, shared_gate); + shared_gate = ggml_reshape_2d(ctx, shared_gate, 1, n_tokens); + sh_down = ggml_reshape_2d(ctx, sh_down, n_embd, n_tokens); + sh_down = ggml_mul(ctx, sh_down, shared_gate); + + // 13. Combine routed + shared + moe_out = ggml_add(ctx, moe_out, sh_down); + + return moe_out; +} + ggml_tensor * build_qwen35_layer( ggml_context * ctx, ggml_cgraph * gf, diff --git a/dflash/src/qwen3_dflash_graph.cpp b/dflash/src/qwen3_dflash_graph.cpp index e454ae3a..9686ada6 100644 --- a/dflash/src/qwen3_dflash_graph.cpp +++ b/dflash/src/qwen3_dflash_graph.cpp @@ -40,15 +40,20 @@ DraftGraphOutputs build_draft_graph( const DraftWeights & w, const DraftGraphInputs & in) { - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; + const int q_len = w.hparams.block_size; const int ctx_len = in.ctx_len; const int total_k = ctx_len + q_len; - const int n_head = DFLASH27B_TARGET_N_HEADS; // 32 - const int n_kv = DFLASH27B_TARGET_N_KV_HEADS; // 8 - const int head_dim = DFLASH27B_TARGET_HEAD_DIM; // 128 - const float eps = DFLASH27B_RMS_EPS; - const float rope_base = DFLASH27B_ROPE_THETA; - (void)ctx_len; // used only via input tensor shapes + const int n_head = w.hparams.n_head; + const int n_kv = w.hparams.n_kv_head; + const int head_dim = w.hparams.head_dim; + const float eps = w.hparams.rms_eps; + const float rope_base = w.hparams.rope_theta; + (void)ctx_len; + + const float freq_scale = 1.0f / w.hparams.rope_factor; + const float ext_factor = (w.hparams.rope_factor > 1.0f) ? 1.0f : 0.0f; + const float mscale = (w.hparams.rope_factor > 1.0f) ? + 1.0f / (0.1f * std::log(w.hparams.rope_factor) + 1.0f) : 1.0f; // ── 1. Feature fusion: target_feat = rms_norm(fc @ target_hidden_cat, hidden_norm) // fc: [5*hidden, hidden] (ggml: ne[0]=5*hidden, ne[1]=hidden) @@ -62,7 +67,7 @@ DraftGraphOutputs build_draft_graph( // ── 2. 
Decoder layers ggml_tensor * h = in.noise_embed; // [hidden, q_len, 1] - for (int il = 0; il < DFLASH27B_DRAFT_LAYERS; il++) { + for (int il = 0; il < w.hparams.n_layer; il++) { const DraftLayer & L = w.layers[il]; // ── 2a. Attention pre-norm @@ -97,14 +102,16 @@ DraftGraphOutputs build_draft_graph( // ── 2d. RoPE (NEOX, theta=10M) // Q: positions_q [q_len] values [ctx_len..ctx_len+q_len-1] // K: positions_k [total_k] values [0..total_k-1] - Q = ggml_rope_ext(ctx, Q, in.positions_q, /*freq_factors=*/nullptr, - head_dim, GGML_ROPE_TYPE_NEOX, /*n_ctx_orig=*/0, - rope_base, /*freq_scale=*/1.0f, - /*ext_factor=*/0.0f, /*attn_factor=*/1.0f, - /*beta_fast=*/0.0f, /*beta_slow=*/0.0f); + Q = ggml_rope_ext(ctx, Q, in.positions_q, nullptr, + head_dim, GGML_ROPE_TYPE_NEOX, w.hparams.rope_orig_ctx, + rope_base, freq_scale, + ext_factor, mscale, + w.hparams.rope_beta_fast, w.hparams.rope_beta_slow); K = ggml_rope_ext(ctx, K, in.positions_k, nullptr, - head_dim, GGML_ROPE_TYPE_NEOX, 0, - rope_base, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + head_dim, GGML_ROPE_TYPE_NEOX, w.hparams.rope_orig_ctx, + rope_base, freq_scale, + ext_factor, mscale, + w.hparams.rope_beta_fast, w.hparams.rope_beta_slow); // ── 2e. Permute into the layout flash_attn_ext wants // q: [n_embd_k=head_dim, n_batch=q_len, n_head, ne3] diff --git a/dflash/src/safetensors_draft.cpp b/dflash/src/safetensors_draft.cpp index 17fbc3f5..862801c3 100644 --- a/dflash/src/safetensors_draft.cpp +++ b/dflash/src/safetensors_draft.cpp @@ -51,6 +51,8 @@ #include #endif +#include +#include #include #include @@ -307,11 +309,109 @@ static void bf16_to_f32_array(const uint16_t * src, float * dst, size_t n) { } } +static bool parse_config_json(const std::string & model_dir, DraftHparams & hp) { + std::string cfg_path = model_dir + "/config.json"; + std::ifstream ifs(cfg_path); + if (!ifs.is_open()) return false; + std::stringstream ss; + ss << ifs.rdbuf(); + std::string json = ss.str(); + + auto find_number = [&](const std::string & key, double def) -> double { + auto pos = json.find("\"" + key + "\""); + if (pos == std::string::npos) return def; + auto colon = json.find(':', pos + key.size() + 2); + if (colon == std::string::npos) return def; + size_t start = colon + 1; + while (start < json.size() && (json[start] == ' ' || json[start] == '\t' || json[start] == '\n' || json[start] == '\r')) start++; + if (start >= json.size()) return def; + char * end = nullptr; + double v = std::strtod(json.c_str() + start, &end); + if (end == json.c_str() + start) return def; + return v; + }; + + auto find_int = [&](const std::string & key, int def) -> int { + return (int)find_number(key, (double)def); + }; + + hp.n_layer = find_int("num_hidden_layers", hp.n_layer); + hp.hidden = find_int("hidden_size", hp.hidden); + hp.n_head = find_int("num_attention_heads", hp.n_head); + hp.n_kv_head = find_int("num_key_value_heads", hp.n_kv_head); + hp.head_dim = find_int("head_dim", -1); + if (hp.head_dim <= 0) hp.head_dim = hp.hidden / hp.n_head; + hp.intermediate = find_int("intermediate_size", hp.intermediate); + hp.block_size = find_int("block_size", hp.block_size); + hp.mask_token_id = find_int("mask_token_id", hp.mask_token_id); + hp.rope_theta = (float)find_number("rope_theta", hp.rope_theta); + hp.rms_eps = (float)find_number("rms_norm_eps", hp.rms_eps); + + { + auto pos = json.find("\"dflash_config\""); + if (pos != std::string::npos) { + hp.mask_token_id = find_int("mask_token_id", hp.mask_token_id); + auto ids_pos = json.find("\"target_layer_ids\"", pos); + if (ids_pos != 
std::string::npos) { + auto bracket = json.find('[', ids_pos); + if (bracket != std::string::npos) { + int count = 0; + auto end_bracket = json.find(']', bracket + 1); + if (end_bracket != std::string::npos) { + std::string arr = json.substr(bracket + 1, end_bracket - bracket - 1); + const char * p = arr.c_str(); + while (*p) { + while (*p && (*p == ' ' || *p == ',' || *p == '\t' || *p == '\n' || *p == '\r')) p++; + if (!*p) break; + char * ep = nullptr; + std::strtol(p, &ep, 10); + if (ep == p) break; + count++; + p = ep; + } + } + if (count > 0) hp.n_target_layers = count; + } + } + } + } + + { + auto pos = json.find("\"rope_scaling\""); + if (pos != std::string::npos) { + hp.rope_factor = (float)find_number("factor", hp.rope_factor); + hp.rope_beta_fast = (float)find_number("beta_fast", hp.rope_beta_fast); + hp.rope_beta_slow = (float)find_number("beta_slow", hp.rope_beta_slow); + hp.rope_orig_ctx = find_int("original_max_position_embeddings", hp.rope_orig_ctx); + } + } + + return true; +} + } // namespace bool load_draft_safetensors(const std::string & path, ggml_backend_t backend, DraftWeights & out) { + // ── 0. Try to load config.json from the model directory ────── + { + std::string model_dir; + auto slash = path.rfind('/'); +#if defined(_WIN32) + auto bslash = path.rfind('\\'); + if (bslash != std::string::npos && (slash == std::string::npos || bslash > slash)) + slash = bslash; +#endif + if (slash != std::string::npos) + model_dir = path.substr(0, slash); + else + model_dir = "."; + if (!parse_config_json(model_dir, out.hparams)) { + out.hparams = DraftHparams{}; + } + } + // ── 1. Open + mmap ──────────────────────────────────────────── Mmap mm; std::string err; @@ -334,9 +434,9 @@ bool load_draft_safetensors(const std::string & path, const uint8_t * blob = (const uint8_t *)mm.addr + 8 + header_len; const size_t blob_len = mm.len - 8 - header_len; - // ── 3. Allocate ggml context big enough for 5 layers × 11 + 3 top ─ - const int n_layers = DFLASH27B_DRAFT_LAYERS; - const int n_tensors = 3 + 11 * n_layers; // with some headroom below + // ── 3. Allocate ggml context ────────────────────────────────── + const int n_layers = out.hparams.n_layer; + const int n_tensors = 3 + 11 * n_layers; ggml_init_params ip{}; ip.mem_size = (size_t)(n_tensors + 16) * ggml_tensor_overhead(); ip.mem_buffer = nullptr; @@ -346,12 +446,12 @@ bool load_draft_safetensors(const std::string & path, out.backend = backend; out.layers.assign(n_layers, DraftLayer{}); - const int64_t HIDDEN = DFLASH27B_TARGET_HIDDEN; // 5120 - const int64_t Q_DIM = DFLASH27B_TARGET_N_HEADS * DFLASH27B_TARGET_HEAD_DIM; // 4096 - const int64_t KV_DIM = DFLASH27B_TARGET_N_KV_HEADS * DFLASH27B_TARGET_HEAD_DIM; // 1024 - const int64_t INTER = DFLASH27B_TARGET_INTERMEDIATE; // 17408 - const int64_t HD = DFLASH27B_TARGET_HEAD_DIM; // 128 - const int64_t FC_IN = DFLASH27B_DRAFT_N_TARGET_LAYERS * HIDDEN; // 25600 + const int64_t HIDDEN = out.hparams.hidden; + const int64_t Q_DIM = (int64_t)out.hparams.n_head * out.hparams.head_dim; + const int64_t KV_DIM = (int64_t)out.hparams.n_kv_head * out.hparams.head_dim; + const int64_t INTER = out.hparams.intermediate; + const int64_t HD = out.hparams.head_dim; + const int64_t FC_IN = (int64_t)out.hparams.n_target_layers * HIDDEN; // ── 4. 
Create named tensors in the context ─────────────────── // diff --git a/dflash/test/smoke_draft_graph.cpp b/dflash/test/smoke_draft_graph.cpp index 16672216..a1d90c5c 100644 --- a/dflash/test/smoke_draft_graph.cpp +++ b/dflash/test/smoke_draft_graph.cpp @@ -52,13 +52,7 @@ int main(int argc, char ** argv) { } const char * path = argv[1]; const int ctx_len = (argc >= 3) ? std::atoi(argv[2]) : 64; - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; // 16 - const int hidden = DFLASH27B_TARGET_HIDDEN; // 5120 - const int fc_in = DFLASH27B_DRAFT_N_TARGET_LAYERS * hidden; // 25600 - std::printf("ctx_len=%d q_len=%d hidden=%d fc_in=%d\n", ctx_len, q_len, hidden, fc_in); - - // ── 1. Backend + weights ggml_backend_t backend = ggml_backend_cuda_init(0); if (!backend) { std::fprintf(stderr, "ggml_backend_cuda_init failed\n"); return 1; } @@ -69,6 +63,12 @@ int main(int argc, char ** argv) { } std::printf("draft loaded\n"); + const int hidden_actual = w.hparams.hidden; + const int q_len_actual = w.hparams.block_size; + const int fc_in_actual = w.hparams.n_target_layers * hidden_actual; + + std::printf("ctx_len=%d q_len=%d hidden=%d fc_in=%d\n", ctx_len, q_len_actual, hidden_actual, fc_in_actual); + // ── 2. Graph context (separate from weights context) const size_t mem_size = 256 * 1024 * 1024; // 256 MB — plenty for nodes ggml_init_params ip{}; @@ -81,10 +81,10 @@ int main(int argc, char ** argv) { // ── 3. Input placeholder tensors // Activations flow as F32 through the graph (CUDA rms_norm requires F32). // Weights stay bf16 — ggml_mul_mat auto-casts. - ggml_tensor * noise_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, hidden, q_len, 1); - ggml_tensor * target_hid = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, fc_in, ctx_len, 1); - ggml_tensor * pos_q = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, q_len); - ggml_tensor * pos_k = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, ctx_len + q_len); + ggml_tensor * noise_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, hidden_actual, q_len_actual, 1); + ggml_tensor * target_hid = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, fc_in_actual, ctx_len, 1); + ggml_tensor * pos_q = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, q_len_actual); + ggml_tensor * pos_k = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, ctx_len + q_len_actual); ggml_set_name(noise_embed, "noise_embed"); ggml_set_name(target_hid, "target_hidden_cat"); ggml_set_name(pos_q, "positions_q"); @@ -122,23 +122,23 @@ int main(int argc, char ** argv) { std::uniform_real_distribution u(-0.02f, 0.02f); { - std::vector data((size_t)hidden * q_len); + std::vector data((size_t)hidden_actual * q_len_actual); for (auto & v : data) v = u(rng); ggml_backend_tensor_set(noise_embed, data.data(), 0, sizeof(float) * data.size()); } { - std::vector data((size_t)fc_in * ctx_len); + std::vector data((size_t)fc_in_actual * ctx_len); for (auto & v : data) v = u(rng); ggml_backend_tensor_set(target_hid, data.data(), 0, sizeof(float) * data.size()); } { - std::vector pq(q_len); - for (int i = 0; i < q_len; i++) pq[i] = ctx_len + i; + std::vector pq(q_len_actual); + for (int i = 0; i < q_len_actual; i++) pq[i] = ctx_len + i; ggml_backend_tensor_set(pos_q, pq.data(), 0, sizeof(int32_t) * pq.size()); } { - std::vector pk(ctx_len + q_len); - for (int i = 0; i < ctx_len + q_len; i++) pk[i] = i; + std::vector pk(ctx_len + q_len_actual); + for (int i = 0; i < ctx_len + q_len_actual; i++) pk[i] = i; ggml_backend_tensor_set(pos_k, pk.data(), 0, sizeof(int32_t) * pk.size()); } @@ -152,8 +152,8 @@ int main(int argc, char ** argv) { // ── 8. 
Read output, check shape + no NaN + print summary stats const size_t n_out_elems = ggml_nelements(go.hidden_states); - if (n_out_elems != (size_t)hidden * q_len) { - std::fprintf(stderr, "out elems mismatch: %zu vs %d\n", n_out_elems, hidden * q_len); + if (n_out_elems != (size_t)hidden_actual * q_len_actual) { + std::fprintf(stderr, "out elems mismatch: %zu vs %d\n", n_out_elems, hidden_actual * q_len_actual); return 1; } std::vector out(n_out_elems); diff --git a/dflash/test/smoke_load_moe_draft.cpp b/dflash/test/smoke_load_moe_draft.cpp new file mode 100644 index 00000000..7125171d --- /dev/null +++ b/dflash/test/smoke_load_moe_draft.cpp @@ -0,0 +1,97 @@ +// Smoke test: load 35B-A3B DFlash draft model from safetensors. +// Validates that DraftHparams are read correctly from config.json, +// tensor shapes match, and all 8 layers are loaded. +// +// Usage: smoke_load_moe_draft + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + DraftWeights w; + if (!load_draft_safetensors(argv[1], backend, w)) { + std::fprintf(stderr, "load_draft_safetensors: %s\n", dflash27b_last_error()); + return 1; + } + + bool ok = true; + auto & hp = w.hparams; + + std::printf("[hparams] n_layer=%d hidden=%d n_head=%d n_kv_head=%d head_dim=%d " + "intermediate=%d n_target_layers=%d block_size=%d\n", + hp.n_layer, hp.hidden, hp.n_head, hp.n_kv_head, hp.head_dim, + hp.intermediate, hp.n_target_layers, hp.block_size); + std::printf("[rope] theta=%.0f factor=%.1f beta_fast=%.1f beta_slow=%.1f orig_ctx=%d\n", + hp.rope_theta, hp.rope_factor, hp.rope_beta_fast, hp.rope_beta_slow, hp.rope_orig_ctx); + + if (hp.n_layer != 8) { std::fprintf(stderr, "FAIL: n_layer=%d expected 8\n", hp.n_layer); ok = false; } + if (hp.hidden != 2048) { std::fprintf(stderr, "FAIL: hidden=%d expected 2048\n", hp.hidden); ok = false; } + if (hp.n_head != 32) { std::fprintf(stderr, "FAIL: n_head=%d expected 32\n", hp.n_head); ok = false; } + if (hp.n_kv_head != 4) { std::fprintf(stderr, "FAIL: n_kv_head=%d expected 4\n", hp.n_kv_head); ok = false; } + if (hp.head_dim != 128) { std::fprintf(stderr, "FAIL: head_dim=%d expected 128\n", hp.head_dim); ok = false; } + if (hp.intermediate != 6144){ std::fprintf(stderr, "FAIL: intermediate=%d expected 6144\n", hp.intermediate); ok = false; } + if (hp.n_target_layers != 5){ std::fprintf(stderr, "FAIL: n_target_layers=%d expected 5\n", hp.n_target_layers); ok = false; } + if (hp.block_size != 16) { std::fprintf(stderr, "FAIL: block_size=%d expected 16\n", hp.block_size); ok = false; } + if (hp.rope_factor != 64.0f){ std::fprintf(stderr, "FAIL: rope_factor=%.1f expected 64.0\n", hp.rope_factor); ok = false; } + if (hp.rope_beta_fast != 32.0f){ std::fprintf(stderr, "FAIL: rope_beta_fast=%.1f expected 32.0\n", hp.rope_beta_fast); ok = false; } + if (hp.rope_beta_slow != 1.0f) { std::fprintf(stderr, "FAIL: rope_beta_slow=%.1f expected 1.0\n", hp.rope_beta_slow); ok = false; } + if (hp.rope_orig_ctx != 4096) { std::fprintf(stderr, "FAIL: rope_orig_ctx=%d expected 4096\n", hp.rope_orig_ctx); ok = false; } + + if ((int)w.layers.size() != hp.n_layer) { + std::fprintf(stderr, "FAIL: layers.size=%zu expected %d\n", w.layers.size(), hp.n_layer); + 
ok = false; + } + + std::printf("[fc] ne=[%lld,%lld]\n", (long long)w.fc->ne[0], (long long)w.fc->ne[1]); + const int64_t fc_in = hp.n_target_layers * hp.hidden; + if (w.fc->ne[1] != hp.hidden || w.fc->ne[0] != fc_in) { + std::fprintf(stderr, "FAIL: fc shape [%lld,%lld] expected [%lld,%lld]\n", + (long long)w.fc->ne[0], (long long)w.fc->ne[1], (long long)fc_in, (long long)hp.hidden); + ok = false; + } + + if (!w.layers.empty()) { + auto & L0 = w.layers[0]; + const int64_t q_dim = hp.n_head * hp.head_dim; + const int64_t kv_dim = hp.n_kv_head * hp.head_dim; + std::printf("[layer0] wq=[%lld,%lld] wk=[%lld,%lld] w_gate=[%lld,%lld]\n", + (long long)L0.wq->ne[0], (long long)L0.wq->ne[1], + (long long)L0.wk->ne[0], (long long)L0.wk->ne[1], + (long long)L0.w_gate->ne[0], (long long)L0.w_gate->ne[1]); + if (L0.wq->ne[1] != q_dim || L0.wq->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: wq shape mismatch\n"); ok = false; + } + if (L0.wk->ne[1] != kv_dim || L0.wk->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: wk shape mismatch\n"); ok = false; + } + if (L0.w_gate->ne[1] != hp.intermediate || L0.w_gate->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: w_gate shape mismatch\n"); ok = false; + } + } + + free_draft_weights(w); + ggml_backend_free(backend); + + if (ok) { + std::printf("OK\n"); + return 0; + } + return 1; +} diff --git a/dflash/test/smoke_load_moe_target.cpp b/dflash/test/smoke_load_moe_target.cpp new file mode 100644 index 00000000..5a381cf4 --- /dev/null +++ b/dflash/test/smoke_load_moe_target.cpp @@ -0,0 +1,194 @@ +// Smoke test for the GGUF target loader with Qwen3.6-35B-A3B MoE model. +// Validates that the loader accepts the "qwen35moe" architecture, reads MoE +// hyperparameters, and wires expert tensors correctly. +// +// Usage: smoke_load_moe_target + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "FAIL: load_target_gguf: %s\n", dflash27b_last_error()); + return 1; + } + std::printf("%s\n", dflash27b_last_error()); + + // Validate MoE-specific hyperparameters + bool ok = true; + + if (w.n_layer != 40) { + std::fprintf(stderr, "FAIL: n_layer=%d expected 40\n", w.n_layer); + ok = false; + } + if (w.n_embd != 2048) { + std::fprintf(stderr, "FAIL: n_embd=%d expected 2048\n", w.n_embd); + ok = false; + } + if (w.n_head != 16) { + std::fprintf(stderr, "FAIL: n_head=%d expected 16\n", w.n_head); + ok = false; + } + if (w.n_head_kv != 2) { + std::fprintf(stderr, "FAIL: n_head_kv=%d expected 2\n", w.n_head_kv); + ok = false; + } + if (w.n_embd_head_k != 256 || w.n_embd_head_v != 256) { + std::fprintf(stderr, "FAIL: head_dim k=%d v=%d expected 256/256\n", + w.n_embd_head_k, w.n_embd_head_v); + ok = false; + } + if (w.full_attention_interval != 4) { + std::fprintf(stderr, "FAIL: fai=%d expected 4\n", w.full_attention_interval); + ok = false; + } + if (w.ssm_d_inner != 4096) { + std::fprintf(stderr, "FAIL: ssm_d_inner=%d expected 4096\n", w.ssm_d_inner); + ok = false; + } + if (w.ssm_dt_rank != 32) { + std::fprintf(stderr, "FAIL: ssm_dt_rank=%d expected 32\n", w.ssm_dt_rank); + ok = false; + 
} + if (w.ssm_d_state != 128) { + std::fprintf(stderr, "FAIL: ssm_d_state=%d expected 128\n", w.ssm_d_state); + ok = false; + } + if (w.ssm_n_group != 16) { + std::fprintf(stderr, "FAIL: ssm_n_group=%d expected 16\n", w.ssm_n_group); + ok = false; + } + + // MoE-specific: expert fields + if (w.n_expert != 256) { + std::fprintf(stderr, "FAIL: n_expert=%d expected 256\n", w.n_expert); + ok = false; + } + if (w.n_expert_used != 8) { + std::fprintf(stderr, "FAIL: n_expert_used=%d expected 8\n", w.n_expert_used); + ok = false; + } + if (w.expert_ff_dim != 512) { + std::fprintf(stderr, "FAIL: expert_ff_dim=%d expected 512\n", w.expert_ff_dim); + ok = false; + } + if (w.shared_ff_dim != 512) { + std::fprintf(stderr, "FAIL: shared_ff_dim=%d expected 512\n", w.shared_ff_dim); + ok = false; + } + + // Count layer types + int n_attn = 0, n_delta = 0; + int n_expert_layers = 0; + for (int il = 0; il < w.n_layer; il++) { + const auto & L = w.layers[il]; + bool attn = L.wq && L.wk && L.wv && L.wo; + bool ssm = L.wqkv && L.wqkv_gate && L.ssm_conv1d; + if (attn) n_attn++; + if (ssm) n_delta++; + if (L.ffn_gate_inp) n_expert_layers++; + } + + std::printf("hparams: n_layer=%d n_embd=%d n_head=%d n_head_kv=%d head_dim=%d/%d fai=%d\n", + w.n_layer, w.n_embd, w.n_head, w.n_head_kv, w.n_embd_head_k, w.n_embd_head_v, + w.full_attention_interval); + std::printf("ssm: conv=%d inner=%d state=%d dt_rank=%d n_group=%d\n", + w.ssm_d_conv, w.ssm_d_inner, w.ssm_d_state, w.ssm_dt_rank, w.ssm_n_group); + std::printf("moe: n_expert=%d n_expert_used=%d expert_ff=%d shared_ff=%d\n", + w.n_expert, w.n_expert_used, w.expert_ff_dim, w.shared_ff_dim); + std::printf("layer counts: full_attn=%d delta_net=%d expert_layers=%d\n", + n_attn, n_delta, n_expert_layers); + + if (n_attn != 10) { + std::fprintf(stderr, "FAIL: expected 10 full-attn layers, got %d\n", n_attn); + ok = false; + } + if (n_delta != 30) { + std::fprintf(stderr, "FAIL: expected 30 delta-net layers, got %d\n", n_delta); + ok = false; + } + if (n_expert_layers != 40) { + std::fprintf(stderr, "FAIL: expected 40 layers with ffn_gate_inp, got %d\n", + n_expert_layers); + ok = false; + } + + // Verify expert tensors exist on layer 0 + { + const auto & L = w.layers[0]; + if (!L.ffn_gate_inp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_inp\n"); + ok = false; + } + if (!L.ffn_up_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_up_exps\n"); + ok = false; + } + if (!L.ffn_gate_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_exps\n"); + ok = false; + } + if (!L.ffn_down_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_down_exps\n"); + ok = false; + } + if (!L.ffn_up_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_up_shexp\n"); + ok = false; + } + if (!L.ffn_gate_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_shexp\n"); + ok = false; + } + if (!L.ffn_down_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_down_shexp\n"); + ok = false; + } + if (!L.ffn_gate_inp_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_inp_shexp\n"); + ok = false; + } + + if (L.ffn_gate_inp) { + std::printf("layer 0 ffn_gate_inp: [%lld, %lld] type=%s\n", + (long long)L.ffn_gate_inp->ne[0], (long long)L.ffn_gate_inp->ne[1], + ggml_type_name(L.ffn_gate_inp->type)); + } + if (L.ffn_up_exps) { + std::printf("layer 0 ffn_up_exps: [%lld, %lld, %lld] type=%s\n", + (long long)L.ffn_up_exps->ne[0], (long long)L.ffn_up_exps->ne[1], + (long long)L.ffn_up_exps->ne[2], ggml_type_name(L.ffn_up_exps->type)); + } + } + + 
free_target_weights(w); + ggml_backend_free(backend); + + if (ok) { + std::printf("OK\n"); + return 0; + } else { + return 1; + } +} diff --git a/dflash/test/smoke_moe_ffn.cpp b/dflash/test/smoke_moe_ffn.cpp new file mode 100644 index 00000000..fa50d53f --- /dev/null +++ b/dflash/test/smoke_moe_ffn.cpp @@ -0,0 +1,128 @@ +// Smoke test: single MoE FFN layer forward pass. +// Loads the 35B-A3B MoE GGUF, picks layer 0, feeds a random vector through +// just the MoE FFN (post-attention-norm → expert routing → shared expert → output), +// and prints the output shape + a spot value. +// +// Usage: smoke_moe_ffn + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + return 1; + } + + if (w.n_expert == 0) { + std::fprintf(stderr, "FAIL: expected MoE model, got dense (n_expert=0)\n"); + return 1; + } + + // Build a simple graph: MoE FFN on a single token + const int n_tokens = 1; + const int n_embd = w.n_embd; + + ggml_context * ctx = nullptr; + { + struct ggml_init_params params = {}; + params.mem_size = 256 * 1024 * 1024; + params.no_alloc = true; + ctx = ggml_init(params); + } + + ggml_cgraph * gf = ggml_new_graph(ctx); + + // Input: random-ish values (just use 1.0 everywhere) + ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_name(inp, "inp"); + + // Apply post-attention norm (RMS norm) + const float eps = 1e-6f; + ggml_tensor * normed = ggml_rms_norm(ctx, inp, eps); + normed = ggml_mul(ctx, normed, w.layers[0].attn_post_norm); + + // MoE FFN + ggml_tensor * moe_out = build_moe_ffn(ctx, gf, normed, w.layers[0], w); + + // Residual add + ggml_tensor * out = ggml_add(ctx, moe_out, inp); + ggml_set_name(out, "out"); + ggml_set_output(out); + + ggml_build_forward_expand(gf, out); + + // Allocate + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "FAIL: graph alloc failed\n"); + return 1; + } + + // Set input data + std::vector inp_data(n_embd * n_tokens, 1.0f); + ggml_backend_tensor_set(inp, inp_data.data(), 0, sizeof(float) * inp_data.size()); + + // Compute + ggml_backend_graph_compute(backend, gf); + + // Read output + std::vector out_data(n_embd * n_tokens); + ggml_backend_tensor_get(out, out_data.data(), 0, sizeof(float) * out_data.size()); + + std::printf("MoE FFN output shape: [%lld, %lld]\n", + (long long)out->ne[0], (long long)out->ne[1]); + std::printf("First 8 values: "); + for (int i = 0; i < 8 && i < (int)out_data.size(); i++) { + std::printf("%.6f ", out_data[i]); + } + std::printf("\n"); + + // Sanity: output should not be all zeros or NaN/Inf + bool ok = true; + bool all_zero = true; + bool has_nan = false; + bool has_inf = false; + for (auto v : out_data) { + if (v != 0.0f) all_zero = false; + if (std::isnan(v)) has_nan = true; + if (std::isinf(v)) has_inf = true; + } + if (all_zero) { std::fprintf(stderr, "FAIL: output is all zeros\n"); ok = false; } + if (has_nan) { 
+    if (has_nan) { std::fprintf(stderr, "FAIL: output contains NaN\n"); ok = false; }
+    if (has_inf) { std::fprintf(stderr, "FAIL: output contains Inf\n"); ok = false; }
+
+    ggml_gallocr_free(alloc);
+    ggml_free(ctx);
+    free_target_weights(w);
+    ggml_backend_free(backend);
+
+    if (ok) {
+        std::printf("OK\n");
+        return 0;
+    }
+    return 1;
+}
diff --git a/dflash/test/smoke_moe_target_forward.cpp b/dflash/test/smoke_moe_target_forward.cpp
new file mode 100644
index 00000000..e1ce5986
--- /dev/null
+++ b/dflash/test/smoke_moe_target_forward.cpp
@@ -0,0 +1,162 @@
+// Smoke test: full MoE target forward pass.
+// Loads Qwen3.6-35B-A3B, creates cache, runs 1-token decode through all 40 layers,
+// validates logits are sane.
+//
+// Usage: smoke_moe_target_forward <model.gguf>
+
+#include "dflash27b.h"
+#include "internal.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cuda.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace dflash27b;
+
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
+        return 2;
+    }
+
+    ggml_backend_t backend = ggml_backend_cuda_init(0);
+    if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
+
+    TargetWeights w;
+    if (!load_target_gguf(argv[1], backend, w)) {
+        std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error());
+        return 1;
+    }
+    std::printf("[target] %s\n", dflash27b_last_error());
+
+    if (w.n_expert == 0) {
+        std::fprintf(stderr, "FAIL: expected MoE model\n");
+        return 1;
+    }
+
+    // Create cache
+    TargetCache cache;
+    const int max_ctx = 512;
+    if (!create_target_cache(w, max_ctx, 0, backend, cache)) {
+        std::fprintf(stderr, "FAIL: create_target_cache: %s\n", dflash27b_last_error());
+        return 1;
+    }
+    std::printf("[cache] attn_k=%zu attn_v=%zu ssm=%zu conv=%zu\n",
+                cache.attn_k.size(), cache.attn_v.size(),
+                cache.ssm_state.size(), cache.conv_state.size());
+
+    // Validate cache dimensions
+    bool ok = true;
+    // 10 full-attn layers (indices 3,7,11,15,19,23,27,31,35,39)
+    if (cache.attn_k.size() != 10) {
+        std::fprintf(stderr, "FAIL: expected 10 attn_k, got %zu\n", cache.attn_k.size());
+        ok = false;
+    }
+    // 30 delta-net layers
+    if (cache.ssm_state.size() != 30) {
+        std::fprintf(stderr, "FAIL: expected 30 ssm_state, got %zu\n", cache.ssm_state.size());
+        ok = false;
+    }
+
+    // Embed a single token
+    const int n_tokens = 1;
+    const int token_id = 1; // arbitrary
+    std::vector<float> embed_data(w.n_embd * n_tokens);
+    if (!w.embedder.embed(&token_id, n_tokens, embed_data.data())) {
+        std::fprintf(stderr, "FAIL: embedder.embed failed\n");
+        return 1;
+    }
+
+    // Build graph
+    ggml_context * ctx = nullptr;
+    {
+        struct ggml_init_params params = {};
+        params.mem_size = 512 * 1024 * 1024;
+        params.no_alloc = true;
+        ctx = ggml_init(params);
+    }
+
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
+
+    ggml_tensor * inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1);
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens);
+    ggml_set_name(inp_embed, "inp_embed");
+    ggml_set_name(positions, "positions");
+    ggml_set_input(inp_embed);
+    ggml_set_input(positions);
+
+    QwenGraphInputs in;
+    in.inp_embed = inp_embed;
+    in.positions = positions;
+    in.n_tokens = n_tokens;
+    in.kv_start = 0;
+    in.attn_mask = nullptr; // single token, no mask needed
+    in.capture_layers = false;
+    in.capture_delta_intermediate = false;
+
+    QwenGraphOutputs og = build_qwen35_graph(ctx, gf, w, cache, in);
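+    // Note (illustrative): with a single token the causal structure is implicit, which is
+    // why no attn_mask is passed above. A multi-token run through this graph would also
+    // have to supply an F16 mask sized [align_up(n_kv, stride_pad), align_up(n_tokens, KQ_MASK_PAD)],
+    // the way the tree-verify builders in test_dflash.cpp do.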
+    ggml_set_output(og.logits);
+
+    ggml_build_forward_expand(gf, og.logits);
+    std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf));
+
+    // Allocate + set input
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+        std::fprintf(stderr, "FAIL: graph alloc failed\n");
+        return 1;
+    }
+    ggml_backend_tensor_set(inp_embed, embed_data.data(), 0, sizeof(float) * embed_data.size());
+    int32_t pos4[4] = { 0, 0, 0, 0 };
+    ggml_backend_tensor_set(positions, pos4, 0, sizeof(int32_t) * 4);
+
+    // Compute
+    if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+        std::fprintf(stderr, "FAIL: graph compute failed\n");
+        return 1;
+    }
+
+    // Read logits
+    std::printf("[logits_tensor] ne=[%lld,%lld,%lld]\n",
+                (long long)og.logits->ne[0], (long long)og.logits->ne[1], (long long)og.logits->ne[2]);
+    const int64_t vocab = og.logits->ne[0];
+    const int64_t logits_count = vocab * og.logits->ne[1];
+    std::vector<float> logits(logits_count);
+    ggml_backend_tensor_get(og.logits, logits.data(), 0, sizeof(float) * logits.size());
+
+    int n_nan = 0, n_inf = 0;
+    float min_l = INFINITY, max_l = -INFINITY;
+    for (int64_t i = 0; i < logits_count; i++) {
+        auto v = logits[i];
+        if (std::isnan(v)) n_nan++;
+        if (std::isinf(v)) n_inf++;
+        if (v < min_l) min_l = v;
+        if (v > max_l) max_l = v;
+    }
+    std::printf("[logits] vocab=%lld nan=%d inf=%d min=%.2f max=%.2f\n",
+                (long long)vocab, n_nan, n_inf, min_l, max_l);
+
+    if (n_nan > 0) { std::fprintf(stderr, "FAIL: logits contain NaN\n"); ok = false; }
+    if (n_inf > 0) { std::fprintf(stderr, "FAIL: logits contain Inf\n"); ok = false; }
+    if (max_l - min_l < 1.0f) { std::fprintf(stderr, "FAIL: logits range too small, possible constant output\n"); ok = false; }
+
+    ggml_gallocr_free(alloc);
+    ggml_free(ctx);
+    free_target_cache(cache);
+    free_target_weights(w);
+    ggml_backend_free(backend);
+
+    if (ok) {
+        std::printf("OK\n");
+        return 0;
+    }
+    return 1;
+}
diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 14b020f8..5a6a904e 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -615,7 +615,7 @@ static bool build_target_step(
     sg.ctx = ggml_init(ip);
     if (!sg.ctx) return false;
 
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
     sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1);
     ggml_set_name(sg.inp_embed, "inp_embed");
     ggml_set_input(sg.inp_embed);
@@ -684,7 +684,7 @@ static bool build_target_step_tree(
     sg.ctx = ggml_init(ip);
     if (!sg.ctx) return false;
 
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
     sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1);
     ggml_set_name(sg.inp_embed, "inp_embed");
     ggml_set_input(sg.inp_embed);
@@ -731,6 +731,91 @@ static bool build_target_step_tree(
     return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
 }
 
+// Reusable tree-verify graph: fixed n_tokens and kv_start so CUDA graphs can
+// replay across steps. Writes K/V to scratch slots [ws..ws+max_n-1] and
+// the attention mask covers [0..ws+max_n). After compute the caller copies
+// K/V from scratch slots to the real committed positions.
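+//
+// Worked example (hypothetical sizes): with max_ctx = 512 and max_n = ddtree_budget + 1 = 9,
+// ws = 512 - 9 = 503. DFS slot i writes its K/V at position 503 + i during verify; after
+// acceptance, slot accepted[d] is copied down to position committed + d, so the next step
+// sees a contiguous committed prefix while the graph itself never changes shape.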
+static bool build_target_step_tree_reusable( + StepGraph & sg, + const TargetWeights & w, + TargetCache & cache, + ggml_backend_t backend, + int max_ctx, + int max_n) +{ + step_graph_free(sg); + + ggml_init_params ip{}; + ip.mem_size = 512 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + sg.ctx = ggml_init(ip); + if (!sg.ctx) return false; + + const int hidden = w.n_embd; + const int write_start = max_ctx - max_n; + + sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, max_n, 1); + ggml_set_name(sg.inp_embed, "inp_embed"); + ggml_set_input(sg.inp_embed); + + sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, 4 * max_n); + ggml_set_name(sg.positions, "positions"); + ggml_set_input(sg.positions); + + const int kv_pad = align_up(max_ctx, g_kq_stride_pad); + const int q_pad = align_up(max_n, KQ_MASK_PAD); + sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad); + ggml_set_name(sg.attn_mask, "attn_mask"); + ggml_set_input(sg.attn_mask); + + sg.parent_ids = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, max_n); + ggml_set_name(sg.parent_ids, "parent_ids"); + ggml_set_input(sg.parent_ids); + + sg.gf = ggml_new_graph_custom(sg.ctx, 16384, false); + + QwenGraphInputs gi{}; + gi.inp_embed = sg.inp_embed; + gi.positions = sg.positions; + gi.attn_mask = sg.attn_mask; + gi.n_tokens = max_n; + gi.kv_start = write_start; + gi.capture_layers = true; + gi.capture_delta_intermediate = true; + gi.parent_ids = sg.parent_ids; + + QwenGraphOutputs go = build_qwen35_graph(sg.ctx, sg.gf, w, cache, gi); + if (!go.logits) return false; + sg.logits = go.logits; + sg.delta_captures = std::move(go.delta_captures); + ggml_set_output(sg.logits); + ggml_build_forward_expand(sg.gf, sg.logits); + + if (!sg.alloc) { + sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + if (!ggml_gallocr_reserve(sg.alloc, sg.gf)) return false; + return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); +} + +static void copy_kv_slots_batch( + const std::vector & cache_tensors, + int src_start, int dst_start, int n_slots, int n_heads) +{ + for (auto * ct : cache_tensors) { + const size_t slot_bytes = ct->nb[1]; + const size_t head_stride = ct->nb[2]; + const char * base = (const char *) ct->data; + for (int s = 0; s < n_slots; s++) { + cudaMemcpy2D( + (void *)(base + (size_t)(dst_start + s) * slot_bytes), head_stride, + (const void *)(base + (size_t)(src_start + s) * slot_bytes), head_stride, + slot_bytes, n_heads, cudaMemcpyDeviceToDevice); + } + } +} + static bool build_draft_step( StepGraph & sg, const DraftWeights & dw, @@ -746,9 +831,9 @@ static bool build_draft_step( sg.ctx = ggml_init(ip); if (!sg.ctx) return false; - const int hidden = DFLASH27B_TARGET_HIDDEN; - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; - const int fc_in = DFLASH27B_DRAFT_N_TARGET_LAYERS * hidden; + const int hidden = tw.n_embd; + const int q_len = dw.hparams.block_size; + const int fc_in = dw.hparams.n_target_layers * hidden; sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, q_len, 1); ggml_set_name(sg.inp_embed, "inp_embed"); @@ -916,10 +1001,10 @@ int main(int argc, char ** argv) { // Profile mode intentionally keeps the intermediate cache tiny (no capture) // so we can go up to n_tokens=128 without OOM. const int max_verify_tokens = profile_scaling - ? DFLASH27B_DRAFT_BLOCK_SIZE + ? dw.hparams.block_size : (ddtree_mode - ? std::max(DFLASH27B_DRAFT_BLOCK_SIZE, ddtree_budget + 1) - : DFLASH27B_DRAFT_BLOCK_SIZE); + ? 
std::max(dw.hparams.block_size, ddtree_budget + 1)
+          : dw.hparams.block_size);
 
     TargetCache cache;
     if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache, /*prefill_only=*/true)) {
@@ -929,7 +1014,7 @@ int main(int argc, char ** argv) {
 
     // ── Profile mode: microbench target forward at varying N ───────────
     if (profile_scaling) {
-        const int hidden_p = DFLASH27B_TARGET_HIDDEN;
+        const int hidden_p = w.n_embd;
         StepGraph psg;
         const int n_values[] = { 1, 4, 8, 12, 16, 20, 24, 32, 48, 64, 96, 128 };
         std::printf("[profile] target forward ms at varying N (kv_start=0, no capture)\n");
@@ -1499,6 +1584,22 @@ int main(int argc, char ** argv) {
     std::vector<int32_t> pos_q_buf(q_len), pos_k_buf(max_ctx + q_len);
     std::vector<int32_t> pos4_buf(4 * q_len);
 
+    // Reusable tree-verify graph for DDTree CUDA graph replay.
+    // Built once with max_n = budget+1 tokens and fixed kv_start.
+    StepGraph sg_tree;
+    const int ddtree_max_n = ddtree_budget + 1;
+    const int tree_write_start = max_ctx - ddtree_max_n;
+    const int tree_kv_pad = align_up(max_ctx, g_kq_stride_pad);
+    const int tree_q_pad = align_up(ddtree_max_n, KQ_MASK_PAD);
+    const int n_head_kv_tree = w.n_head_kv;
+    std::vector<uint16_t> tree_mask_buf((size_t)tree_kv_pad * tree_q_pad, F16_NEG_INF);
+    if (ddtree_mode) {
+        if (!build_target_step_tree_reusable(sg_tree, w, cache, backend,
+                                             max_ctx, ddtree_max_n)) {
+            std::fprintf(stderr, "reusable tree verify build failed\n"); return 1;
+        }
+    }
+
     auto t_gen0 = std::chrono::steady_clock::now();
 
     // Per-phase timing accumulators (microseconds)
@@ -1682,64 +1783,80 @@ int main(int argc, char ** argv) {
             const int N = 1 + tree.n_nodes; // flat size including root
 
-            if (!build_target_step_tree(sg, w, cache, backend,
-                                        /*kv_start=*/committed, /*n_tokens=*/N,
-                                        g_fa_window)) {
-                std::fprintf(stderr, "ddtree verify build failed\n"); return 1;
-            }
+            // Reuse pre-built graph — no rebuild needed
             T_verify_build = sync_us();
             tt_verify_build += std::chrono::duration<double, std::micro>(T_verify_build - T_snap).count();
 
             // Embeddings: [last_tok, tree.token_ids[0..n_nodes-1]]
-            std::vector<int32_t> flat_tokens(N);
+            // Pad to ddtree_max_n with last_tok (harmless — unused by the mask)
+            std::vector<int32_t> flat_tokens(ddtree_max_n, last_tok);
             flat_tokens[0] = last_tok;
             for (int i = 0; i < tree.n_nodes; i++) flat_tokens[1 + i] = tree.token_ids[i];
-            std::vector<float> tree_embed(hidden * N);
-            if (!w.embedder.embed(flat_tokens.data(), N, tree_embed.data())) return 1;
-            ggml_backend_tensor_set(sg.inp_embed, tree_embed.data(), 0,
-                                    sizeof(float) * hidden * N);
-
-            // M-RoPE axis-major positions: committed + depth_of_node.
-            // Slot 0 = root = depth 0 → position `committed`.
-            std::vector<int32_t> pos4(4 * N);
-            for (int i = 0; i < N; i++) {
-                int p = committed + (i == 0 ? 0 : tree.depths[i - 1]);
-                pos4[0 * N + i] = p;
-                pos4[1 * N + i] = p;
-                pos4[2 * N + i] = p;
-                pos4[3 * N + i] = 0;
+            std::vector<float> tree_embed(hidden * ddtree_max_n);
+            if (!w.embedder.embed(flat_tokens.data(), ddtree_max_n, tree_embed.data())) return 1;
+            ggml_backend_tensor_set(sg_tree.inp_embed, tree_embed.data(), 0,
+                                    sizeof(float) * hidden * ddtree_max_n);
+
+            // M-RoPE positions: committed + depth_of_node for real nodes;
+            // pad remaining slots with committed (harmless, masked out).
+            std::vector<int32_t> pos4(4 * ddtree_max_n);
+            for (int i = 0; i < ddtree_max_n; i++) {
+                int p = (i < N)
+                    ? committed + (i == 0 ?
0 : tree.depths[i - 1]) + : committed; + pos4[0 * ddtree_max_n + i] = p; + pos4[1 * ddtree_max_n + i] = p; + pos4[2 * ddtree_max_n + i] = p; + pos4[3 * ddtree_max_n + i] = 0; } - ggml_backend_tensor_set(sg.positions, pos4.data(), 0, sizeof(int32_t) * 4 * N); + ggml_backend_tensor_set(sg_tree.positions, pos4.data(), 0, + sizeof(int32_t) * 4 * ddtree_max_n); - // Ancestor-only attention mask (f16). + // Ancestor-only mask: fill the full fixed-size buffer. + // build_tree_mask writes [q_pad, kv_pad] but with variable kv_pad. + // We build into a temp buffer then copy, OR build directly into + // tree_mask_buf which is pre-sized for max dimensions. const int tree_win_start = (g_fa_window > 0 && committed > g_fa_window) ? (committed - g_fa_window) : 0; - build_tree_mask(tree, /*past_length=*/committed, mask_buf, tree_win_start); - ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0, - sizeof(uint16_t) * mask_buf.size()); - - // parent_ids for tree-mode DeltaNet kernel. - // Slot 0 (root): -1 (reload initial state — matches kernel's skip - // at t==0). Slots 1..N-1: the tree's parent index in the flat array. - std::vector parent_ids(N); + { + // Fill with -inf + tree_mask_buf.assign((size_t)tree_kv_pad * tree_q_pad, F16_NEG_INF); + // Past KV (windowed committed positions): always visible + for (int q = 0; q < N; q++) { + for (int k = tree_win_start; k < committed; k++) { + tree_mask_buf[(size_t)q * tree_kv_pad + k] = F16_ZERO; + } + // Tree region at scratch slots: ancestors only + for (int j = 0; j < N; j++) { + if (tree.visibility[(size_t)q * N + j]) { + tree_mask_buf[(size_t)q * tree_kv_pad + (tree_write_start + j)] = F16_ZERO; + } + } + } + } + ggml_backend_tensor_set(sg_tree.attn_mask, tree_mask_buf.data(), 0, + sizeof(uint16_t) * tree_mask_buf.size()); + + // parent_ids: pad to ddtree_max_n with -1 + std::vector parent_ids(ddtree_max_n, -1); parent_ids[0] = -1; for (int i = 1; i < N; i++) parent_ids[i] = (int32_t)tree.parents[i]; - ggml_backend_tensor_set(sg.parent_ids, parent_ids.data(), 0, - sizeof(int32_t) * N); + ggml_backend_tensor_set(sg_tree.parent_ids, parent_ids.data(), 0, + sizeof(int32_t) * ddtree_max_n); T_verify_set = sync_us(); tt_verify_set += std::chrono::duration(T_verify_set - T_verify_build).count(); - st = ggml_backend_graph_compute(backend, sg.gf); + st = ggml_backend_graph_compute(backend, sg_tree.gf); if (st != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "ddtree verify compute %d\n", (int)st); return 1; } T_verify_compute = sync_us(); tt_verify_compute += std::chrono::duration(T_verify_compute - T_verify_set).count(); - // Read the N verify logits, compute posterior argmax per slot. 
- ggml_backend_tensor_get(sg.logits, verify_logits_buf.data(), 0, + // Read the N verify logits (only first N of ddtree_max_n slots) + ggml_backend_tensor_get(sg_tree.logits, verify_logits_buf.data(), 0, sizeof(float) * vocab * N); std::vector posterior(N); for (int i = 0; i < N; i++) { @@ -1825,10 +1942,10 @@ int main(int argc, char ** argv) { } { - const int n_delta = (int)sg.delta_captures.size(); + const int n_delta = (int)sg_tree.delta_captures.size(); cudaStream_t stream = nullptr; for (int il = 0; il < n_delta; il++) { - const DeltaNetCapture & cap = sg.delta_captures[il]; + const DeltaNetCapture & cap = sg_tree.delta_captures[il]; if (!cap.ssm_intermediate_states || !cap.conv_input) { std::fprintf(stderr, "ddtree rollback: missing capture layer %d\n", il); return 1; @@ -1905,24 +2022,20 @@ int main(int argc, char ** argv) { } } - // target_feat compaction: written in DFS order during verify - // (column kv_start+i = dfs slot i's features). Same logic as - // KV cache: when accepted[d] != d, copy the accepted DFS slot's - // features to the spine slot at d so next iter's draft reads - // the right history. Position→slot uses `% target_feat_cap` - // to account for the ring buffer. + // target_feat compaction: written at scratch slots + // (tree_write_start + dfs_idx) during verify. Copy accepted + // DFS slots' features to the committed spine positions. if (cache.target_feat) { const size_t elt = ggml_element_size(cache.target_feat); - const int fc_in = (int)cache.target_feat->ne[0]; // 5*hidden + const int fc_in = (int)cache.target_feat->ne[0]; const size_t col_stride = cache.target_feat->nb[1]; const int tcap = cache.target_feat_cap; - for (int d = 1; d < commit_n; d++) { + for (int d = 0; d < commit_n; d++) { const int src_dfs = accepted[d]; - if (src_dfs == d) continue; - const int src_slot = (committed + src_dfs) % tcap; - const int dst_slot = (committed + d) % tcap; - const size_t src_off = (size_t)src_slot * col_stride; - const size_t dst_off = (size_t)dst_slot * col_stride; + const int src_feat_slot = (tree_write_start + src_dfs) % tcap; + const int dst_feat_slot = (committed + d) % tcap; + const size_t src_off = (size_t)src_feat_slot * col_stride; + const size_t dst_off = (size_t)dst_feat_slot * col_stride; cudaMemcpyAsync((char *)cache.target_feat->data + dst_off, (const char *)cache.target_feat->data + src_off, (size_t)fc_in * elt, @@ -1930,34 +2043,22 @@ int main(int argc, char ** argv) { } } - // Full-attention KV compaction: the verify wrote K/V at slots - // [committed..committed+N-1] in DFS tree order (slot 0 = root). - // For the next iter's verify to see the correct committed - // prefix, slots [committed..committed+commit_n-1] must hold - // the K/V of the accepted path's committed tokens. For each - // committed position d in 0..commit_n-1, the source K/V is at - // DFS slot accepted[d]. d==0 is always the root (DFS slot 0), - // trivially aligned. For d>=1, copy if accepted[d] != d. + // Full-attention KV compaction: the verify wrote K/V at scratch + // slots [tree_write_start..tree_write_start+N-1]. Copy the + // accepted DFS slots to committed positions. const int n_full_attn = (int)cache.attn_k.size(); for (int d = 0; d < commit_n; d++) { const int src_dfs = accepted[d]; - const int dst_slot = d; - if (src_dfs == dst_slot) continue; // already aligned + const int src_slot = tree_write_start + src_dfs; + const int dst_slot = committed + d; for (int l = 0; l < n_full_attn; l++) { - // Each slot: head_dim * n_kv floats in f16 per tensor. 
ggml_tensor * ck = cache.attn_k[l]; ggml_tensor * cv = cache.attn_v[l]; - const size_t slot_bytes = ck->nb[1]; // stride between slots - const size_t src_off = (size_t)(committed + src_dfs) * slot_bytes; - const size_t dst_off = (size_t)(committed + dst_slot) * slot_bytes; - // Per-head-kv layout: shape [head_dim, max_ctx, n_head_kv]. - // nb[2] is distance between heads; we copy one slot's - // slice per head. For simplicity, do a 2D copy across - // the head dimension. + const size_t slot_bytes = ck->nb[1]; const int n_kv = (int)ck->ne[2]; for (int h = 0; h < n_kv; h++) { - const size_t head_src = src_off + (size_t)h * ck->nb[2]; - const size_t head_dst = dst_off + (size_t)h * ck->nb[2]; + const size_t head_src = (size_t)src_slot * slot_bytes + (size_t)h * ck->nb[2]; + const size_t head_dst = (size_t)dst_slot * slot_bytes + (size_t)h * ck->nb[2]; cudaMemcpyAsync((char *)ck->data + head_dst, (const char *)ck->data + head_src, slot_bytes, cudaMemcpyDeviceToDevice, stream); diff --git a/dflash/test/test_generate.cpp b/dflash/test/test_generate.cpp index 68be0fe8..dfbfdcd8 100644 --- a/dflash/test/test_generate.cpp +++ b/dflash/test/test_generate.cpp @@ -18,6 +18,8 @@ #include "ggml-backend.h" #include "ggml-cuda.h" +#include + #include #include #include @@ -49,21 +51,19 @@ struct StepGraph { ggml_gallocr_t alloc = nullptr; ggml_tensor * inp_embed = nullptr; ggml_tensor * positions = nullptr; + ggml_tensor * attn_mask = nullptr; ggml_tensor * logits = nullptr; + ggml_tensor * argmax_out = nullptr; }; -// Build a fresh single-token forward graph. We rebuild per step so that -// `kv_start` updates drive the correct KV cache slot. The graph is cheap to -// rebuild — all the weights + KV cache stay persistent. -static bool build_step_graph( +static bool build_reusable_graph( StepGraph & sg, const TargetWeights & w, TargetCache & cache, ggml_backend_t backend, - int kv_start + int max_ctx ) { - if (sg.alloc) { ggml_gallocr_free(sg.alloc); sg.alloc = nullptr; } - if (sg.ctx) { ggml_free(sg.ctx); sg.ctx = nullptr; } + if (sg.ctx) { ggml_free(sg.ctx); sg.ctx = nullptr; } ggml_init_params ip{}; ip.mem_size = 256 * 1024 * 1024; @@ -73,30 +73,40 @@ static bool build_step_graph( if (!sg.ctx) return false; const int n_tokens = 1; - const int hidden = DFLASH27B_TARGET_HIDDEN; + const int hidden = w.n_embd; + sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1); sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, 4 * n_tokens); + sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, max_ctx, n_tokens); ggml_set_input(sg.inp_embed); ggml_set_input(sg.positions); + ggml_set_input(sg.attn_mask); sg.gf = ggml_new_graph_custom(sg.ctx, 8192, false); QwenGraphInputs gi{}; gi.inp_embed = sg.inp_embed; gi.positions = sg.positions; - gi.attn_mask = nullptr; // n_tokens==1, no mask needed + gi.attn_mask = sg.attn_mask; gi.n_tokens = n_tokens; - gi.kv_start = kv_start; + gi.kv_start = max_ctx - 1; gi.capture_layers = false; QwenGraphOutputs go = build_qwen35_graph(sg.ctx, sg.gf, w, cache, gi); if (!go.logits) return false; - ggml_set_output(go.logits); - ggml_build_forward_expand(sg.gf, go.logits); + + sg.argmax_out = ggml_argmax(sg.ctx, go.logits); + ggml_set_output(sg.argmax_out); + ggml_build_forward_expand(sg.gf, sg.argmax_out); sg.logits = go.logits; - sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); - return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); + if (!sg.alloc) { + sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + 
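+    // Note: reusing one gallocr (reserve, then alloc) keeps the graph's tensor addresses
+    // stable from step to step; combined with the fixed kv_start/write-slot scheme this is
+    // what allows the backend to capture the step as a CUDA graph on the first run and
+    // replay it afterwards (see the "CUDA graph captures on first step" note in the
+    // generation loop below).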
+    if (!ggml_gallocr_reserve(sg.alloc, sg.gf)) return false;
+    if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) return false;
+
+    return true;
 }
 
 static std::vector<int32_t> read_int32_file(const std::string & path) {
@@ -116,6 +126,22 @@ static bool write_int32_file(const std::string & path, const std::vector<int32_t>
+
+static void copy_kv_slot(
+    const std::vector<ggml_tensor *> & cache_tensors,
+    int src_slot, int dst_slot, int max_ctx, int n_heads
+) {
+    for (auto * ct : cache_tensors) {
+        const size_t pos_bytes = ct->nb[1];
+        const size_t head_stride = ct->nb[2];
+        const char * base = (const char *) ct->data;
+        cudaMemcpy2D(
+            (void *)(base + (size_t)dst_slot * pos_bytes), head_stride,
+            (const void *)(base + (size_t)src_slot * pos_bytes), head_stride,
+            pos_bytes, n_heads, cudaMemcpyDeviceToDevice
+        );
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 5) {
         std::fprintf(stderr,
@@ -144,7 +170,6 @@ int main(int argc, char ** argv) {
 #endif
     };
 
-    // ── Load model and cache
     ggml_backend_t backend = ggml_backend_cuda_init(0);
     if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
 
@@ -173,22 +198,25 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    const int n_full_attn = w.n_layer / w.full_attention_interval;
+    const int n_head_kv = w.n_head_kv;
+
+    StepGraph sg;
+    if (!build_reusable_graph(sg, w, cache, backend, max_ctx)) {
+        std::fprintf(stderr, "build_reusable_graph failed\n");
+        return 1;
+    }
+
+    std::vector<ggml_fp16_t> mask_buf(max_ctx);
     std::vector<int32_t> all_tokens = prompt;
     all_tokens.reserve(prompt.size() + n_gen);
-
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
    std::vector<float> embed_buf(hidden);
+    int32_t argmax_result = 0;
 
-    StepGraph sg;
-
-    // ── Helper: run one step given current token + absolute position
-    auto run_step = [&](int32_t tok, int pos) -> int32_t {
-        if (!build_step_graph(sg, w, cache, backend, pos)) {
-            std::fprintf(stderr, "build_step_graph failed at pos=%d\n", pos);
-            std::exit(1);
-        }
+    const int write_slot = max_ctx - 1;
 
-        // CPU embed
+    auto run_step_reuse = [&](int32_t tok, int pos) -> int32_t {
         int32_t ids[1] = { tok };
         if (!w.embedder.embed(ids, 1, embed_buf.data())) {
            std::fprintf(stderr, "embed failed tok=%d\n", tok);
@@ -197,53 +225,78 @@ int main(int argc, char ** argv) {
         ggml_backend_tensor_set(sg.inp_embed, embed_buf.data(), 0,
                                 sizeof(float) * embed_buf.size());
 
-        // M-RoPE positions: 4 copies of pos
         int32_t p4[4] = { pos, pos, pos, pos };
         ggml_backend_tensor_set(sg.positions, p4, 0, sizeof(int32_t) * 4);
 
+        for (int i = 0; i < max_ctx; i++) {
+            bool attend = (i < pos) || (i == write_slot);
+            mask_buf[i] = ggml_fp32_to_fp16(attend ? 0.0f : -INFINITY);
+        }
+        ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0,
+                                sizeof(ggml_fp16_t) * max_ctx);
+
         auto st = ggml_backend_graph_compute(backend, sg.gf);
         if (st != GGML_STATUS_SUCCESS) {
            std::fprintf(stderr, "compute failed at pos=%d (%d)\n", pos, (int)st);
            std::exit(1);
         }
 
-        // argmax on logits
-        const int vocab = DFLASH27B_TARGET_VOCAB;
-        std::vector<float> logits(vocab);
-        ggml_backend_tensor_get(sg.logits, logits.data(), 0, sizeof(float) * vocab);
-        int best = 0;
-        float bv = logits[0];
-        for (int i = 1; i < vocab; i++) {
-            if (logits[i] > bv) { bv = logits[i]; best = i; }
-        }
-        return best;
+        ggml_backend_tensor_get(sg.argmax_out, &argmax_result, 0, sizeof(int32_t));
+
+        copy_kv_slot(cache.attn_k, write_slot, pos, max_ctx, n_head_kv);
+        copy_kv_slot(cache.attn_v, write_slot, pos, max_ctx, n_head_kv);
+
+        return argmax_result;
     };
 
     // ── Prefill: feed prompt tokens one at a time (decode-only mode).
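+    // With the reusable graph, prefill is still one full forward per prompt token; each
+    // step also copies the freshly written K/V from write_slot back to the token's real
+    // position (copy_kv_slot above), so the committed cache layout matches plain decode.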
- // We throw away the logits for all prompt tokens except the last one. int next = -1; for (int i = 0; i < (int)prompt.size(); i++) { - next = run_step(prompt[i], i); + next = run_step_reuse(prompt[i], i); } std::printf("[prefill] last-token argmax=%d\n", next); - // ── Generation loop + // ── Generation loop (CUDA graph captures on first step) auto t_start = std::chrono::steady_clock::now(); + double total_compute = 0; int gen_start_pos = (int)prompt.size(); for (int g = 0; g < n_gen; g++) { int32_t tok = next; all_tokens.push_back(tok); stream_emit(tok); - next = run_step(tok, gen_start_pos + g); + + auto t0 = std::chrono::steady_clock::now(); + + if (!w.embedder.embed(&tok, 1, embed_buf.data())) return 1; + ggml_backend_tensor_set(sg.inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + int32_t p4[4] = { gen_start_pos + g, gen_start_pos + g, gen_start_pos + g, gen_start_pos + g }; + ggml_backend_tensor_set(sg.positions, p4, 0, sizeof(int32_t) * 4); + for (int i = 0; i < max_ctx; i++) { + bool attend = (i < gen_start_pos + g) || (i == write_slot); + mask_buf[i] = ggml_fp32_to_fp16(attend ? 0.0f : -INFINITY); + } + ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0, sizeof(ggml_fp16_t) * max_ctx); + + ggml_backend_graph_compute(backend, sg.gf); + + ggml_backend_tensor_get(sg.argmax_out, &argmax_result, 0, sizeof(int32_t)); + next = argmax_result; + + copy_kv_slot(cache.attn_k, write_slot, gen_start_pos + g, max_ctx, n_head_kv); + copy_kv_slot(cache.attn_v, write_slot, gen_start_pos + g, max_ctx, n_head_kv); + + auto t1 = std::chrono::steady_clock::now(); + total_compute += std::chrono::duration(t1 - t0).count(); } auto t_end = std::chrono::steady_clock::now(); double secs = std::chrono::duration(t_end - t_start).count(); double tps = n_gen / std::max(1e-9, secs); - // Also push the final next token so downstream sees it all_tokens.push_back(next); std::printf("[gen] %d new tokens in %.3f s -> %.2f tok/s\n", n_gen, secs, tps); + std::printf("[gen] compute=%.3f s (%.1f%% of total)\n", + total_compute, 100.0 * total_compute / std::max(1e-12, secs)); std::printf("[gen] tokens: "); for (int i = 0; i < n_gen; i++) std::printf("%d ", all_tokens[prompt.size() + i]); std::printf("\n");