diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index 21f96573..d76c8693 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -29,6 +29,7 @@ endif() # Hardcoded for CUDA. No libllama, no BLAS, no Metal, no Vulkan. set(GGML_CUDA ON CACHE BOOL "" FORCE) +set(GGML_CUDA_GRAPHS ON CACHE BOOL "" FORCE) set(GGML_BACKEND_DL OFF CACHE BOOL "" FORCE) set(GGML_METAL OFF CACHE BOOL "" FORCE) set(GGML_VULKAN OFF CACHE BOOL "" FORCE) @@ -137,6 +138,26 @@ if(DFLASH27B_TESTS) target_include_directories(test_generate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) target_link_libraries(test_generate PRIVATE dflash27b ggml ggml-cuda) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_moe_draft.cpp") + add_executable(smoke_load_moe_draft test/smoke_load_moe_draft.cpp) + target_include_directories(smoke_load_moe_draft PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_load_moe_draft PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_moe_target_forward.cpp") + add_executable(smoke_moe_target_forward test/smoke_moe_target_forward.cpp) + target_include_directories(smoke_moe_target_forward PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_moe_target_forward PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_moe_ffn.cpp") + add_executable(smoke_moe_ffn test/smoke_moe_ffn.cpp) + target_include_directories(smoke_moe_ffn PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_moe_ffn PRIVATE dflash27b ggml ggml-cuda) + endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_moe_target.cpp") + add_executable(smoke_load_moe_target test/smoke_load_moe_target.cpp) + target_include_directories(smoke_load_moe_target PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) + target_link_libraries(smoke_load_moe_target PRIVATE dflash27b ggml ggml-cuda) + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_dflash.cpp") add_executable(test_dflash test/test_dflash.cpp) target_include_directories(test_dflash PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) diff --git a/dflash/deps/llama.cpp b/dflash/deps/llama.cpp index b6ffab4a..13722831 160000 --- a/dflash/deps/llama.cpp +++ b/dflash/deps/llama.cpp @@ -1 +1 @@ -Subproject commit b6ffab4a9d3ee7dc2bd39354c86f6bb11ab15420 +Subproject commit 13722831748a762108ad19fdf9112eb58f48378b diff --git a/dflash/src/gguf_target_loader.cpp b/dflash/src/gguf_target_loader.cpp index d6584fd5..d23dadbc 100644 --- a/dflash/src/gguf_target_loader.cpp +++ b/dflash/src/gguf_target_loader.cpp @@ -202,7 +202,8 @@ bool load_target_gguf(const std::string & path, return false; } - // Validate arch + the dimensions we hardcode everywhere. 
+ // Validate arch — accept both qwen35 (dense) and qwen35moe (MoE) + std::string arch; { int64_t arch_id = gguf_find_key(gctx, "general.architecture"); if (arch_id < 0) { @@ -210,48 +211,80 @@ bool load_target_gguf(const std::string & path, gguf_free(gctx); return false; } - const char * arch = gguf_get_val_str(gctx, arch_id); - if (std::string(arch) != "qwen35") { - set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35)"); + arch = gguf_get_val_str(gctx, arch_id); + if (arch != "qwen35" && arch != "qwen35moe") { + set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35 or qwen35moe)"); gguf_free(gctx); return false; } } std::string err; - const uint32_t n_embd = get_u32_or(gctx, "qwen35.embedding_length", 0); - const uint32_t n_ff = get_u32_or(gctx, "qwen35.feed_forward_length", 0); - const uint32_t n_layer= get_u32_or(gctx, "qwen35.block_count", 0); - const uint32_t n_head = get_u32_or(gctx, "qwen35.attention.head_count",0); - const uint32_t n_headkv=get_u32_or(gctx, "qwen35.attention.head_count_kv",0); - const uint32_t kl = get_u32_or(gctx, "qwen35.attention.key_length", 0); - const uint32_t vl = get_u32_or(gctx, "qwen35.attention.value_length", 0); - const uint32_t fai = get_u32_or(gctx, "qwen35.full_attention_interval",0); - const uint32_t ssm_conv = get_u32_or(gctx, "qwen35.ssm.conv_kernel", 0); - const uint32_t ssm_inner = get_u32_or(gctx, "qwen35.ssm.inner_size", 0); - const uint32_t ssm_state = get_u32_or(gctx, "qwen35.ssm.state_size", 0); - const uint32_t ssm_dt = get_u32_or(gctx, "qwen35.ssm.time_step_rank",0); - const uint32_t ssm_grp = get_u32_or(gctx, "qwen35.ssm.group_count", 0); - - if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 || - kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 || - ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 || - ssm_dt != 48 || ssm_grp != 16) { - char buf[512]; - std::snprintf(buf, sizeof(buf), - "unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " - "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", - n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, - ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); - set_last_error(buf); - gguf_free(gctx); - return false; + const uint32_t n_embd = get_u32_or(gctx, (arch + ".embedding_length").c_str(), 0); + const uint32_t n_ff = get_u32_or(gctx, (arch + ".feed_forward_length").c_str(), 0); + const uint32_t n_layer= get_u32_or(gctx, (arch + ".block_count").c_str(), 0); + const uint32_t n_head = get_u32_or(gctx, (arch + ".attention.head_count").c_str(),0); + const uint32_t n_headkv=get_u32_or(gctx, (arch + ".attention.head_count_kv").c_str(),0); + const uint32_t kl = get_u32_or(gctx, (arch + ".attention.key_length").c_str(), 0); + const uint32_t vl = get_u32_or(gctx, (arch + ".attention.value_length").c_str(), 0); + const uint32_t fai = get_u32_or(gctx, (arch + ".full_attention_interval").c_str(),0); + const uint32_t ssm_conv = get_u32_or(gctx, (arch + ".ssm.conv_kernel").c_str(), 0); + const uint32_t ssm_inner = get_u32_or(gctx, (arch + ".ssm.inner_size").c_str(), 0); + const uint32_t ssm_state = get_u32_or(gctx, (arch + ".ssm.state_size").c_str(), 0); + const uint32_t ssm_dt = get_u32_or(gctx, (arch + ".ssm.time_step_rank").c_str(),0); + const uint32_t ssm_grp = get_u32_or(gctx, (arch + ".ssm.group_count").c_str(), 0); + + // MoE fields (zero for dense qwen35) + const uint32_t n_expert = get_u32_or(gctx, (arch + ".expert_count").c_str(), 0); + const uint32_t n_expert_used = get_u32_or(gctx, (arch 
+ ".expert_used_count").c_str(), 0); + const uint32_t expert_ff = get_u32_or(gctx, (arch + ".expert_feed_forward_length").c_str(), 0); + const uint32_t shared_ff = get_u32_or(gctx, (arch + ".expert_shared_feed_forward_length").c_str(), 0); + + const bool is_moe = (arch == "qwen35moe"); + + if (is_moe) { + // Validate qwen35moe hparams: 40 layers, 2048 hidden, 16 heads, 2 kv heads, + // 256 experts, 8 active, 512 expert ff, 512 shared ff + if (n_embd != 2048 || n_layer != 40 || n_head != 16 || n_headkv != 2 || + kl != 256 || vl != 256 || fai != 4 || + ssm_conv != 4 || ssm_inner != 4096 || ssm_state != 128 || + ssm_dt != 32 || ssm_grp != 16 || + n_expert != 256 || n_expert_used != 8 || + expert_ff != 512 || shared_ff != 512) { + char buf[512]; + std::snprintf(buf, sizeof(buf), + "unexpected qwen35moe hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "kl=%u vl=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u} " + "n_expert=%u n_expert_used=%u expert_ff=%u shared_ff=%u", + n_embd, n_layer, n_head, n_headkv, kl, vl, fai, + ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp, + n_expert, n_expert_used, expert_ff, shared_ff); + set_last_error(buf); + gguf_free(gctx); + return false; + } + } else { + // Validate qwen35 dense hparams (unchanged) + if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 || + kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 || + ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 || + ssm_dt != 48 || ssm_grp != 16) { + char buf[512]; + std::snprintf(buf, sizeof(buf), + "unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u " + "kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}", + n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai, + ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp); + set_last_error(buf); + gguf_free(gctx); + return false; + } } // rope dimension_sections (array of 4 uint32) int rope_sections[4] = {0, 0, 0, 0}; { - int64_t rid = gguf_find_key(gctx, "qwen35.rope.dimension_sections"); + int64_t rid = gguf_find_key(gctx, (arch + ".rope.dimension_sections").c_str()); if (rid >= 0) { size_t n = gguf_get_arr_n(gctx, rid); if (n >= 4) { @@ -265,7 +298,7 @@ bool load_target_gguf(const std::string & path, out.backend = backend; out.n_layer = (int)n_layer; out.n_embd = (int)n_embd; - out.n_ff = (int)n_ff; + out.n_ff = is_moe ? 0 : (int)n_ff; out.n_head = (int)n_head; out.n_head_kv = (int)n_headkv; out.n_embd_head_k = (int)kl; @@ -277,6 +310,10 @@ bool load_target_gguf(const std::string & path, out.ssm_d_state= (int)ssm_state; out.ssm_dt_rank= (int)ssm_dt; out.ssm_n_group= (int)ssm_grp; + out.n_expert = (int)n_expert; + out.n_expert_used = (int)n_expert_used; + out.expert_ff_dim = (int)expert_ff; + out.shared_ff_dim = (int)shared_ff; out.layers.assign((size_t)n_layer, TargetLayer{}); // ── 2. 
Wire our layer pointers to tensors inside meta_ctx ───────── @@ -303,15 +340,48 @@ bool load_target_gguf(const std::string & path, // Always-present tensors L.attn_norm = fnd("attn_norm.weight"); L.attn_post_norm = fnd("post_attention_norm.weight"); + + // Dense FFN tensors (qwen35) or MoE tensors (qwen35moe) L.w_gate = fnd("ffn_gate.weight"); L.w_up = fnd("ffn_up.weight"); L.w_down = fnd("ffn_down.weight"); - if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) { - char b[128]; - std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il); - set_last_error(b); - gguf_free(gctx); - return false; + + // MoE FFN tensors (qwen35moe only; null for dense) + L.ffn_gate_inp = fnd("ffn_gate_inp.weight"); + L.ffn_up_exps = fnd("ffn_up_exps.weight"); + L.ffn_gate_exps = fnd("ffn_gate_exps.weight"); + L.ffn_down_exps = fnd("ffn_down_exps.weight"); + L.ffn_up_shexp = fnd("ffn_up_shexp.weight"); + L.ffn_gate_shexp = fnd("ffn_gate_shexp.weight"); + L.ffn_down_shexp = fnd("ffn_down_shexp.weight"); + L.ffn_gate_inp_shexp = fnd("ffn_gate_inp_shexp.weight"); + + if (!is_moe) { + // Dense model: gate/up/down required + if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) { + char b[128]; + std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + } else { + // MoE model: attn_norm + post_norm + expert tensors required + if (!L.attn_norm || !L.attn_post_norm) { + char b[128]; + std::snprintf(b, sizeof(b), "layer %d: missing attn_norm/post_norm", il); + set_last_error(b); + gguf_free(gctx); + return false; + } + if (!L.ffn_gate_inp || !L.ffn_up_exps || !L.ffn_gate_exps || !L.ffn_down_exps || + !L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp || !L.ffn_gate_inp_shexp) { + char b[256]; + std::snprintf(b, sizeof(b), "layer %d: missing MoE tensors", il); + set_last_error(b); + gguf_free(gctx); + return false; + } } // Full-attention tensors (only on layers where (il+1)%fai == 0, diff --git a/dflash/src/internal.h b/dflash/src/internal.h index 536cdcc6..2ec12ba7 100644 --- a/dflash/src/internal.h +++ b/dflash/src/internal.h @@ -49,10 +49,22 @@ struct TargetLayer { ggml_tensor * attn_norm = nullptr; // [hidden] ggml_tensor * attn_post_norm = nullptr; // [hidden] (post-block norm before FFN) ggml_tensor * ffn_norm = nullptr; // [hidden] + + // Dense FFN (non-null for qwen35 dense target) ggml_tensor * w_gate = nullptr; // [hidden, intermediate] ggml_tensor * w_up = nullptr; // [hidden, intermediate] ggml_tensor * w_down = nullptr; // [intermediate, hidden] + // MoE FFN (non-null for qwen35moe target) + ggml_tensor * ffn_gate_inp = nullptr; // [n_embd, n_expert] — router + ggml_tensor * ffn_up_exps = nullptr; // [expert_ff, n_embd, n_expert] + ggml_tensor * ffn_gate_exps = nullptr; // [expert_ff, n_embd, n_expert] + ggml_tensor * ffn_down_exps = nullptr; // [n_embd, expert_ff, n_expert] + ggml_tensor * ffn_up_shexp = nullptr; // [shared_ff, n_embd] — shared expert + ggml_tensor * ffn_gate_shexp = nullptr; // [shared_ff, n_embd] + ggml_tensor * ffn_down_shexp = nullptr; // [n_embd, shared_ff] + ggml_tensor * ffn_gate_inp_shexp = nullptr; // [n_embd] — shared expert gate + // Full-attention block (non-null for layers where (il+1) % 4 == 0) ggml_tensor * wq = nullptr; // [hidden, q_dim] ggml_tensor * wk = nullptr; // [hidden, kv_dim] @@ -127,6 +139,13 @@ struct TargetWeights { int ssm_d_state = 128; int ssm_dt_rank = 48; int ssm_n_group = 16; + + // MoE-specific (zero for dense models) + 
int n_expert = 0; + int n_expert_used = 0; + int expert_ff_dim = 0; + int shared_ff_dim = 0; + float expert_weights_scale = 1.0f; }; // Load a Q4_K_M target model from a GGUF file on disk. @@ -139,6 +158,24 @@ void free_target_weights(TargetWeights & w); // ─── Draft weights (z-lab DFlash, bf16) ─────────────────────────── +struct DraftHparams { + int n_layer = DFLASH27B_DRAFT_LAYERS; + int hidden = DFLASH27B_TARGET_HIDDEN; + int n_head = DFLASH27B_TARGET_N_HEADS; + int n_kv_head = DFLASH27B_TARGET_N_KV_HEADS; + int head_dim = DFLASH27B_TARGET_HEAD_DIM; + int intermediate = DFLASH27B_TARGET_INTERMEDIATE; + int n_target_layers = DFLASH27B_DRAFT_N_TARGET_LAYERS; + int block_size = DFLASH27B_DRAFT_BLOCK_SIZE; + int mask_token_id = DFLASH27B_DRAFT_MASK_TOKEN_ID; + float rope_theta = DFLASH27B_ROPE_THETA; + float rms_eps = DFLASH27B_RMS_EPS; + float rope_factor = 1.0f; + float rope_beta_fast = 0.0f; + float rope_beta_slow = 0.0f; + int rope_orig_ctx = 0; +}; + struct DraftLayer { ggml_tensor * attn_norm; ggml_tensor * ffn_norm; @@ -158,10 +195,11 @@ struct DraftWeights { ggml_backend_t backend = nullptr; ggml_backend_buffer_t buf = nullptr; - ggml_tensor * fc = nullptr; // [5*hidden, hidden] - ggml_tensor * hidden_norm = nullptr; // [hidden] - std::vector layers; // size = 5 - ggml_tensor * out_norm = nullptr; // [hidden] + DraftHparams hparams; + ggml_tensor * fc = nullptr; + ggml_tensor * hidden_norm = nullptr; + std::vector layers; + ggml_tensor * out_norm = nullptr; }; bool load_draft_safetensors(const std::string & path, @@ -310,6 +348,16 @@ QwenGraphOutputs build_qwen35_graph( TargetCache & cache, const QwenGraphInputs & in); +// MoE FFN forward pass (qwen35moe). Computes expert routing, per-expert +// SwiGLU, shared expert with sigmoid gating, and returns the combined output. +// Shape: [n_embd, n_tokens] f32. +ggml_tensor * build_moe_ffn( + ggml_context * ctx, + ggml_cgraph * gf, + ggml_tensor * cur, + const TargetLayer & L, + const TargetWeights & w); + // Build a single-layer forward graph. Mirrors build_qwen35_graph but processes // only one layer, taking `inp` as the input activation and returning the output. // Used by layer-segmented prefill to iterate layers as the outer loop. 
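For orientation, the routing that the new `build_moe_ffn` declaration above promises (softmax over 256 experts, keep the top 8, renormalize, then add a sigmoid-gated shared expert) can be summarized by a small scalar sketch. This is illustration only, not part of the patch; `route_token` is a hypothetical helper, and the ggml graph in qwen35_target_graph.cpp below is the real implementation.

```cpp
#include <algorithm>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>

// Per-token router weights, mirroring build_moe_ffn steps 2-5:
// softmax over all experts, top-k selection, renormalize the kept weights.
std::vector<std::pair<int, float>> route_token(const std::vector<float> & router_logits,
                                               int n_expert_used) {
    std::vector<float> probs(router_logits.size());
    const float mx = *std::max_element(router_logits.begin(), router_logits.end());
    float sum = 0.0f;
    for (size_t e = 0; e < probs.size(); e++) {
        probs[e] = std::exp(router_logits[e] - mx);
        sum += probs[e];
    }
    for (float & p : probs) p /= sum;                                     // softmax

    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; }); // top-k

    float kept = 0.0f;
    for (int k = 0; k < n_expert_used; k++) kept += probs[idx[k]];
    std::vector<std::pair<int, float>> out;
    for (int k = 0; k < n_expert_used; k++)
        out.emplace_back(idx[k], probs[idx[k]] / kept);                   // renormalized weight
    return out;
    // Layer output = sum_k w_k * down_k(silu(gate_k(x)) * up_k(x))
    //              + sigmoid(g_sh(x)) * down_sh(silu(gate_sh(x)) * up_sh(x))
}
```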
diff --git a/dflash/src/qwen35_target_graph.cpp b/dflash/src/qwen35_target_graph.cpp index e09221b5..27d38d95 100644 --- a/dflash/src/qwen35_target_graph.cpp +++ b/dflash/src/qwen35_target_graph.cpp @@ -81,6 +81,13 @@ bool create_target_cache(const TargetWeights & w, max_verify_tokens = DFLASH27B_DRAFT_BLOCK_SIZE; } + const int head_dim = w.n_embd_head_k; + const int n_head_kv = w.n_head_kv; + const int num_v_heads = w.ssm_dt_rank; + const int head_v_dim = w.ssm_d_inner / num_v_heads; + const int conv_kern = w.ssm_d_conv; + const int conv_channels = w.ssm_d_inner + 2 * w.ssm_n_group * w.ssm_d_state; + const int n_full_attn = w.n_layer / w.full_attention_interval; // 16 const int n_delta = w.n_layer - n_full_attn; // 48 @@ -138,9 +145,9 @@ bool create_target_cache(const TargetWeights & w, if (is_attn) { // [head_dim, max_ctx_alloc, n_head_kv] ggml_tensor * K = ggml_new_tensor_3d(out.ctx, kv_k_type, - q35::HEAD_DIM, max_ctx_alloc, q35::N_HEAD_KV); + head_dim, max_ctx_alloc, n_head_kv); ggml_tensor * V = ggml_new_tensor_3d(out.ctx, kv_v_type, - q35::HEAD_DIM, max_ctx_alloc, q35::N_HEAD_KV); + head_dim, max_ctx_alloc, n_head_kv); char name[64]; std::snprintf(name, sizeof(name), "cache_k_%d", il); ggml_set_name(K, name); @@ -317,6 +324,7 @@ static ggml_tensor * build_swiglu_ffn(ggml_context * ctx, ggml_tensor * cur, static ggml_tensor * build_full_attn_block( ggml_context * ctx, ggml_cgraph * gf, + const TargetWeights & w, const TargetLayer & L, ggml_tensor * cur, ggml_tensor * positions, @@ -329,35 +337,42 @@ static ggml_tensor * build_full_attn_block( ggml_type kv_k_type, int fa_window = 0 ) { + const int head_dim = w.n_embd_head_k; + const int n_head = w.n_head; + const int n_head_kv = w.n_head_kv; + const int q_dim = n_head * head_dim; + constexpr float eps = 1e-6f; + constexpr float rope_theta = 10000000.0f; + // ── Q projection (packed Q || gate), shape [2*q_dim, n_tokens] ggml_tensor * QG = ggml_mul_mat(ctx, L.wq, cur); // Reshape to [head_dim*2, n_head, n_tokens] so we can view the Q and gate halves - QG = ggml_reshape_3d(ctx, QG, q35::HEAD_DIM * 2, q35::N_HEAD, n_tokens); + QG = ggml_reshape_3d(ctx, QG, head_dim * 2, n_head, n_tokens); // Q half: view at offset 0, stride head_dim*2 // Layout: [head_dim, n_head, n_tokens] ggml_tensor * Q = ggml_view_3d(ctx, QG, - q35::HEAD_DIM, q35::N_HEAD, n_tokens, - ggml_element_size(QG) * q35::HEAD_DIM * 2, // nb1: stride over n_head - ggml_element_size(QG) * q35::HEAD_DIM * 2 * q35::N_HEAD, // nb2: stride over n_tokens + head_dim, n_head, n_tokens, + ggml_element_size(QG) * head_dim * 2, // nb1: stride over n_head + ggml_element_size(QG) * head_dim * 2 * n_head, // nb2: stride over n_tokens /*offset*/ 0); - Q = rms_norm_mul(ctx, Q, L.q_norm, q35::EPS); + Q = rms_norm_mul(ctx, Q, L.q_norm, eps); // Gate half: view at offset head_dim ggml_tensor * gate = ggml_view_3d(ctx, QG, - q35::HEAD_DIM, q35::N_HEAD, n_tokens, - ggml_element_size(QG) * q35::HEAD_DIM * 2, - ggml_element_size(QG) * q35::HEAD_DIM * 2 * q35::N_HEAD, - ggml_element_size(QG) * q35::HEAD_DIM); - gate = ggml_cont_2d(ctx, gate, q35::HEAD_DIM * q35::N_HEAD, n_tokens); // [q_dim, n_tokens] + head_dim, n_head, n_tokens, + ggml_element_size(QG) * head_dim * 2, + ggml_element_size(QG) * head_dim * 2 * n_head, + ggml_element_size(QG) * head_dim); + gate = ggml_cont_2d(ctx, gate, head_dim * n_head, n_tokens); // [q_dim, n_tokens] // ── K and V projections ggml_tensor * Kcur = ggml_mul_mat(ctx, L.wk, cur); // [kv_dim, n_tokens] ggml_tensor * Vcur = ggml_mul_mat(ctx, L.wv, cur); // [kv_dim, 
n_tokens] - Kcur = ggml_reshape_3d(ctx, Kcur, q35::HEAD_DIM, q35::N_HEAD_KV, n_tokens); - Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, q35::EPS); - Vcur = ggml_reshape_3d(ctx, Vcur, q35::HEAD_DIM, q35::N_HEAD_KV, n_tokens); + Kcur = ggml_reshape_3d(ctx, Kcur, head_dim, n_head_kv, n_tokens); + Kcur = rms_norm_mul(ctx, Kcur, L.k_norm, eps); + Vcur = ggml_reshape_3d(ctx, Vcur, head_dim, n_head_kv, n_tokens); // ── M-RoPE (multi-axis rotary). n_rot = HEAD_DIM/4 * 4 ? Actually // ggml_rope_multi takes n_dims = the number of dims to rotate; for @@ -368,11 +383,11 @@ static ggml_tensor * build_full_attn_block( Q = ggml_rope_multi(ctx, Q, positions, /*freq_factors=*/nullptr, n_rot, sections, GGML_ROPE_TYPE_MROPE, - /*n_ctx_orig=*/0, q35::ROPE_THETA, 1.0f, + /*n_ctx_orig=*/0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); Kcur = ggml_rope_multi(ctx, Kcur, positions, nullptr, n_rot, sections, GGML_ROPE_TYPE_MROPE, - 0, q35::ROPE_THETA, 1.0f, + 0, rope_theta, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); // ── Write K/V into the persistent cache at slot [kv_start..kv_start+n_tokens) @@ -387,11 +402,11 @@ static ggml_tensor * build_full_attn_block( ggml_tensor * Vcur_T = ggml_permute(ctx, Vcur, 0, 2, 1, 3); // [head_dim, n_tokens, n_head_kv] ggml_tensor * k_slot = ggml_view_3d(ctx, cache_k, - q35::HEAD_DIM, n_tokens, q35::N_HEAD_KV, + head_dim, n_tokens, n_head_kv, cache_k->nb[1], cache_k->nb[2], /*offset*/ cache_k->nb[1] * kv_start); ggml_tensor * v_slot = ggml_view_3d(ctx, cache_v, - q35::HEAD_DIM, n_tokens, q35::N_HEAD_KV, + head_dim, n_tokens, n_head_kv, cache_v->nb[1], cache_v->nb[2], cache_v->nb[1] * kv_start); @@ -432,7 +447,7 @@ static ggml_tensor * build_full_attn_block( // to all keys is trivially causal). For n_tokens>1 the caller must provide // a mask shaped [kv_len, n_tokens] with 0 for attendable positions and // -inf for positions beyond the causal boundary. 
- const float kq_scale = 1.0f / std::sqrt((float)q35::HEAD_DIM); + const float kq_scale = 1.0f / std::sqrt((float)head_dim); ggml_tensor * attn = ggml_flash_attn_ext(ctx, Qfa, Kfa, Vfa, attn_mask, kq_scale, 0.0f, 0.0f); // attn: [head_dim, n_head, n_tokens] (permuted) @@ -443,7 +458,7 @@ static ggml_tensor * build_full_attn_block( attn = ggml_turbo_wht(ctx, attn, 1); } - attn = ggml_reshape_2d(ctx, attn, q35::Q_DIM, n_tokens); + attn = ggml_reshape_2d(ctx, attn, q_dim, n_tokens); // ── Apply the sigmoid gate from the packed Q ggml_tensor * gate_sig = ggml_sigmoid(ctx, gate); @@ -467,6 +482,7 @@ static ggml_tensor * build_full_attn_block( static ggml_tensor * build_delta_net_block( ggml_context * ctx, ggml_cgraph * gf, + const TargetWeights & w, const TargetLayer & L, ggml_tensor * cur, // [hidden, n_tokens] ggml_tensor * conv_state, // [kernel-1, conv_channels] persistent @@ -475,17 +491,21 @@ static ggml_tensor * build_delta_net_block( DeltaNetCapture * cap, // optional: populated on capture_delta_intermediate ggml_tensor * parent_ids // optional [n_tokens] i32; tree mode when non-null ) { - const int d_inner = q35::SSM_D_INNER; - const int head_k_dim = q35::HEAD_K_DIM; // 128 - const int num_k_heads = q35::SSM_N_GROUP; // 16 - const int num_v_heads = q35::SSM_DT_RANK; // 48 - const int head_v_dim = q35::HEAD_V_DIM; // 128 + const int d_inner = w.ssm_d_inner; + const int head_k_dim = w.ssm_d_state; + const int num_k_heads = w.ssm_n_group; + const int num_v_heads = w.ssm_dt_rank; + const int head_v_dim = d_inner / num_v_heads; + const int conv_kern = w.ssm_d_conv; + const int conv_channels = d_inner + 2 * num_k_heads * head_k_dim; + const int hidden = w.n_embd; + constexpr float eps = 1e-6f; const int n_seqs = 1; const int n_seq_tokens = n_tokens; - // ── qkv_mixed = wqkv @ cur [10240, n_tokens] + // ── qkv_mixed = wqkv @ cur [conv_channels, n_tokens] ggml_tensor * qkv_mixed = ggml_mul_mat(ctx, L.wqkv, cur); - qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, q35::CONV_CHANNELS, n_seq_tokens, n_seqs); + qkv_mixed = ggml_reshape_3d(ctx, qkv_mixed, conv_channels, n_seq_tokens, n_seqs); // ── z = wqkv_gate @ cur [inner, n_tokens] ggml_tensor * z = ggml_mul_mat(ctx, L.wqkv_gate, cur); @@ -509,7 +529,7 @@ static ggml_tensor * build_delta_net_block( // ── Fetch conv state [kernel-1, conv_channels] and prepend to qkv_mixed // along the token axis to form the convolution input. ggml_tensor * conv_states_r = ggml_reshape_3d(ctx, conv_state, - q35::SSM_CONV_KERN - 1, q35::CONV_CHANNELS, n_seqs); + conv_kern - 1, conv_channels, n_seqs); // qkv_mixed currently is [conv_channels, n_tokens, n_seqs]; we need // [n_tokens, conv_channels, n_seqs] to concat on dim 0. 
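The hunks above replace the hard-coded q35:: DeltaNet constants with values derived from the loaded hparams. A quick arithmetic check (standalone, not part of the patch) that the derived dimensions come out right for both configurations the loader accepts:

```cpp
#include <cassert>

// head_v_dim    = ssm_d_inner / ssm_dt_rank
// conv_channels = ssm_d_inner + 2 * ssm_n_group * ssm_d_state
int main() {
    // qwen35 (dense): inner=6144, dt_rank=48, n_group=16, d_state=128
    assert(6144 / 48 == 128);               // head_v_dim, same as the old q35::HEAD_V_DIM
    assert(6144 + 2 * 16 * 128 == 10240);   // conv_channels, same as the old q35::CONV_CHANNELS

    // qwen35moe: inner=4096, dt_rank=32, n_group=16, d_state=128
    assert(4096 / 32 == 128);               // head_v_dim stays 128
    assert(4096 + 2 * 16 * 128 == 8192);    // conv_channels shrinks to 8192
    return 0;
}
```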
@@ -529,9 +549,9 @@ static ggml_tensor * build_delta_net_block( // ── Save the last (kernel-1) steps back to conv_state ggml_tensor * last_conv = ggml_view_3d(ctx, conv_input, - q35::SSM_CONV_KERN - 1, q35::CONV_CHANNELS, n_seqs, + conv_kern - 1, conv_channels, n_seqs, conv_input->nb[1], conv_input->nb[2], - (conv_input->ne[0] - (q35::SSM_CONV_KERN - 1)) * ggml_element_size(conv_input)); + (conv_input->ne[0] - (conv_kern - 1)) * ggml_element_size(conv_input)); ggml_build_forward_expand(gf, ggml_cpy(ctx, last_conv, conv_state)); // ── 1D conv + silu @@ -550,7 +570,7 @@ static ggml_tensor * build_delta_net_block( const int64_t v_offset = 2 * num_k_heads * head_k_dim; const size_t elt = ggml_element_size(conv_out); - const size_t row_size = q35::CONV_CHANNELS * elt; + const size_t row_size = conv_channels * elt; ggml_tensor * q_c = ggml_view_4d(ctx, conv_out, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, @@ -572,8 +592,8 @@ static ggml_tensor * build_delta_net_block( v_offset * elt); // L2 norm on Q and K - q_c = ggml_l2_norm(ctx, q_c, q35::EPS); - k_c = ggml_l2_norm(ctx, k_c, q35::EPS); + q_c = ggml_l2_norm(ctx, q_c, eps); + k_c = ggml_l2_norm(ctx, k_c, eps); // Repeat Q and K from num_k_heads to num_v_heads so they match V's layout // (only needed if not using the fused op's broadcast support). @@ -697,7 +717,7 @@ static ggml_tensor * build_delta_net_block( // ── Gated output norm: rms_norm(output) * silu(z_4d) ggml_tensor * z_4d = ggml_reshape_4d(ctx, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs); - ggml_tensor * output_n = ggml_rms_norm(ctx, output, q35::EPS); + ggml_tensor * output_n = ggml_rms_norm(ctx, output, eps); output_n = ggml_mul(ctx, output_n, L.ssm_norm); ggml_tensor * z_silu = ggml_silu(ctx, z_4d); output_n = ggml_mul(ctx, output_n, z_silu); @@ -708,7 +728,7 @@ static ggml_tensor * build_delta_net_block( // Output projection ggml_tensor * out = ggml_mul_mat(ctx, L.ssm_out, flat); - out = ggml_reshape_2d(ctx, out, q35::N_HEAD * 0 + DFLASH27B_TARGET_HIDDEN, n_seq_tokens * n_seqs); + out = ggml_reshape_2d(ctx, out, hidden, n_seq_tokens * n_seqs); return out; } @@ -748,7 +768,7 @@ static ggml_tensor * build_single_layer( for (int il = 0; il < layer_idx; il++) { if (((il + 1) % w.full_attention_interval) == 0) fa_idx++; } - cur = build_full_attn_block(ctx, gf, L, cur, positions, w.rope_sections, + cur = build_full_attn_block(ctx, gf, w, L, cur, positions, w.rope_sections, cache.attn_k[fa_idx], cache.attn_v[fa_idx], attn_mask, kv_start, n_tokens, cache.kv_k_type, fa_window); @@ -757,7 +777,7 @@ static ggml_tensor * build_single_layer( for (int il = 0; il < layer_idx; il++) { if (((il + 1) % w.full_attention_interval) != 0) dn_idx++; } - cur = build_delta_net_block(ctx, gf, L, cur, + cur = build_delta_net_block(ctx, gf, w, L, cur, cache.conv_state[dn_idx], cache.ssm_state[dn_idx], n_tokens, nullptr, nullptr); } @@ -834,13 +854,18 @@ QwenGraphOutputs build_qwen35_graph( og_early.delta_captures.resize(n_delta); } - // DFlash target layer IDs for feature capture: {1, 16, 31, 46, 61} + // DFlash target layer IDs for feature capture. // HF hidden_states[lid+1] convention — capture AFTER layer 'lid' runs. - static const int CAPTURE_LAYERS[DFLASH27B_DRAFT_N_TARGET_LAYERS] = - { 1, 16, 31, 46, 61 }; + // Evenly spaced across the layers, always 5 capture points. 
+ // Formula: il = 1 + k * (n_layer - 2) / (n_capture - 1) for k=0..4 + int CAPTURE_LAYERS[DFLASH27B_DRAFT_N_TARGET_LAYERS]; + const int capture_step = (w.n_layer - 2) / (DFLASH27B_DRAFT_N_TARGET_LAYERS - 1); + for (int k = 0; k < DFLASH27B_DRAFT_N_TARGET_LAYERS; k++) { + CAPTURE_LAYERS[k] = 1 + k * capture_step; + } const int hidden = w.n_embd; - const float eps = q35::EPS; + constexpr float eps = 1e-6f; for (int il = 0; il < w.n_layer; il++) { const TargetLayer & L = w.layers[il]; @@ -852,7 +877,7 @@ QwenGraphOutputs build_qwen35_graph( ggml_tensor * cur = rms_norm_mul(ctx, inpL, L.attn_norm, eps); if (is_attn) { - cur = build_full_attn_block(ctx, gf, L, cur, in.positions, w.rope_sections, + cur = build_full_attn_block(ctx, gf, w, L, cur, in.positions, w.rope_sections, cache.attn_k[fa_idx], cache.attn_v[fa_idx], in.attn_mask, in.kv_start, n_tokens, cache.kv_k_type, in.fa_window); @@ -869,7 +894,7 @@ QwenGraphOutputs build_qwen35_graph( cap_ptr->ssm_intermediate_states = cache.ssm_intermediate[dn_idx]; cap_ptr->conv_input = cache.conv_input_cache[dn_idx]; } - cur = build_delta_net_block(ctx, gf, L, cur, + cur = build_delta_net_block(ctx, gf, w, L, cur, cache.conv_state[dn_idx], cache.ssm_state[dn_idx], n_tokens, cap_ptr, in.parent_ids); dn_idx++; @@ -882,8 +907,10 @@ QwenGraphOutputs build_qwen35_graph( ggml_tensor * ffn_residual = cur; ggml_tensor * post = rms_norm_mul(ctx, cur, L.attn_post_norm, eps); - // SwiGLU FFN - ggml_tensor * ffn = build_swiglu_ffn(ctx, post, L); + // FFN: MoE or dense SwiGLU depending on model + ggml_tensor * ffn = (w.n_expert > 0) + ? build_moe_ffn(ctx, gf, post, L, w) + : build_swiglu_ffn(ctx, post, L); cur = ggml_add(ctx, ffn, ffn_residual); // ── DFlash layer feature capture ── @@ -936,7 +963,7 @@ QwenGraphOutputs build_qwen35_graph( } // 2. Final norm - ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, q35::EPS); + ggml_tensor * out = rms_norm_mul(ctx, inpL, w.out_norm, eps); // 3. LM head ggml_tensor * logits = ggml_mul_mat(ctx, w.output, out); @@ -949,6 +976,97 @@ QwenGraphOutputs build_qwen35_graph( return og; } +ggml_tensor * build_moe_ffn( + ggml_context * ctx, + ggml_cgraph * gf, + ggml_tensor * cur, + const TargetLayer & L, + const TargetWeights & w) { + + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + const int64_t n_expert = w.n_expert; + const int64_t n_expert_used = w.n_expert_used; + + // 1. Router: gate_inp @ cur → [n_expert, n_tokens] + ggml_tensor * logits = ggml_mul_mat(ctx, L.ffn_gate_inp, cur); + + // 2. Softmax gating + ggml_tensor * probs = ggml_soft_max(ctx, logits); + + // 3. Top-k expert selection: [n_expert_used, n_tokens] + ggml_tensor * selected = ggml_argsort_top_k(ctx, probs, (int)n_expert_used); + + // 4. Extract weights for selected experts + probs = ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens); + ggml_tensor * weights = ggml_get_rows(ctx, probs, selected); + + // 5. Normalize weights + weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); + ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); + weights_sum = ggml_clamp(ctx, weights_sum, 6.103515625e-5f, INFINITY); + weights = ggml_div(ctx, weights, weights_sum); + weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); + ggml_build_forward_expand(gf, weights); + + // 6. Reshape input for batched expert matmul + cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); + + // 7. 
Per-expert projections via ggml_mul_mat_id + // gate + ggml_tensor * gate = ggml_mul_mat_id(ctx, L.ffn_gate_exps, cur, selected); + // up + ggml_tensor * up = ggml_mul_mat_id(ctx, L.ffn_up_exps, cur, selected); + + // 8. SwiGLU: silu(gate) * up + ggml_tensor * gu = ggml_swiglu_split(ctx, gate, up); + + // 9. Down projection + ggml_tensor * experts = ggml_mul_mat_id(ctx, L.ffn_down_exps, gu, selected); + + // 10. Apply weights + experts = ggml_mul(ctx, experts, weights); + ggml_build_forward_expand(gf, experts); + + // 11. Sum across selected experts + ggml_tensor * moe_out = nullptr; + for (int64_t i = 0; i < n_expert_used; i++) { + ggml_tensor * view = ggml_view_2d(ctx, experts, n_embd, n_tokens, + experts->nb[2], i * experts->nb[1]); + ggml_build_forward_expand(gf, view); + if (i == 0) { + moe_out = view; + } else { + moe_out = ggml_add(ctx, moe_out, view); + ggml_build_forward_expand(gf, moe_out); + } + } + if (n_expert_used == 1) { + moe_out = ggml_cont(ctx, moe_out); + } + + // 12. Shared expert path + ggml_tensor * sh_gate = ggml_mul_mat(ctx, L.ffn_gate_shexp, cur); + ggml_tensor * sh_up = ggml_mul_mat(ctx, L.ffn_up_shexp, cur); + sh_gate = ggml_silu(ctx, sh_gate); + sh_gate = ggml_reshape_2d(ctx, sh_gate, w.expert_ff_dim, n_tokens); + sh_up = ggml_reshape_2d(ctx, sh_up, w.expert_ff_dim, n_tokens); + ggml_tensor * sh_gu = ggml_mul(ctx, sh_gate, sh_up); + ggml_tensor * sh_down = ggml_mul_mat(ctx, L.ffn_down_shexp, sh_gu); + + // Shared expert gating (sigmoid) + ggml_tensor * shared_gate = ggml_mul_mat(ctx, L.ffn_gate_inp_shexp, cur); + shared_gate = ggml_sigmoid(ctx, shared_gate); + shared_gate = ggml_reshape_2d(ctx, shared_gate, 1, n_tokens); + sh_down = ggml_reshape_2d(ctx, sh_down, n_embd, n_tokens); + sh_down = ggml_mul(ctx, sh_down, shared_gate); + + // 13. Combine routed + shared + moe_out = ggml_add(ctx, moe_out, sh_down); + + return moe_out; +} + ggml_tensor * build_qwen35_layer( ggml_context * ctx, ggml_cgraph * gf, diff --git a/dflash/src/qwen3_dflash_graph.cpp b/dflash/src/qwen3_dflash_graph.cpp index e454ae3a..9686ada6 100644 --- a/dflash/src/qwen3_dflash_graph.cpp +++ b/dflash/src/qwen3_dflash_graph.cpp @@ -40,15 +40,20 @@ DraftGraphOutputs build_draft_graph( const DraftWeights & w, const DraftGraphInputs & in) { - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; + const int q_len = w.hparams.block_size; const int ctx_len = in.ctx_len; const int total_k = ctx_len + q_len; - const int n_head = DFLASH27B_TARGET_N_HEADS; // 32 - const int n_kv = DFLASH27B_TARGET_N_KV_HEADS; // 8 - const int head_dim = DFLASH27B_TARGET_HEAD_DIM; // 128 - const float eps = DFLASH27B_RMS_EPS; - const float rope_base = DFLASH27B_ROPE_THETA; - (void)ctx_len; // used only via input tensor shapes + const int n_head = w.hparams.n_head; + const int n_kv = w.hparams.n_kv_head; + const int head_dim = w.hparams.head_dim; + const float eps = w.hparams.rms_eps; + const float rope_base = w.hparams.rope_theta; + (void)ctx_len; + + const float freq_scale = 1.0f / w.hparams.rope_factor; + const float ext_factor = (w.hparams.rope_factor > 1.0f) ? 1.0f : 0.0f; + const float mscale = (w.hparams.rope_factor > 1.0f) ? + 1.0f / (0.1f * std::log(w.hparams.rope_factor) + 1.0f) : 1.0f; // ── 1. Feature fusion: target_feat = rms_norm(fc @ target_hidden_cat, hidden_norm) // fc: [5*hidden, hidden] (ggml: ne[0]=5*hidden, ne[1]=hidden) @@ -62,7 +67,7 @@ DraftGraphOutputs build_draft_graph( // ── 2. 
Decoder layers ggml_tensor * h = in.noise_embed; // [hidden, q_len, 1] - for (int il = 0; il < DFLASH27B_DRAFT_LAYERS; il++) { + for (int il = 0; il < w.hparams.n_layer; il++) { const DraftLayer & L = w.layers[il]; // ── 2a. Attention pre-norm @@ -97,14 +102,16 @@ DraftGraphOutputs build_draft_graph( // ── 2d. RoPE (NEOX, theta=10M) // Q: positions_q [q_len] values [ctx_len..ctx_len+q_len-1] // K: positions_k [total_k] values [0..total_k-1] - Q = ggml_rope_ext(ctx, Q, in.positions_q, /*freq_factors=*/nullptr, - head_dim, GGML_ROPE_TYPE_NEOX, /*n_ctx_orig=*/0, - rope_base, /*freq_scale=*/1.0f, - /*ext_factor=*/0.0f, /*attn_factor=*/1.0f, - /*beta_fast=*/0.0f, /*beta_slow=*/0.0f); + Q = ggml_rope_ext(ctx, Q, in.positions_q, nullptr, + head_dim, GGML_ROPE_TYPE_NEOX, w.hparams.rope_orig_ctx, + rope_base, freq_scale, + ext_factor, mscale, + w.hparams.rope_beta_fast, w.hparams.rope_beta_slow); K = ggml_rope_ext(ctx, K, in.positions_k, nullptr, - head_dim, GGML_ROPE_TYPE_NEOX, 0, - rope_base, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f); + head_dim, GGML_ROPE_TYPE_NEOX, w.hparams.rope_orig_ctx, + rope_base, freq_scale, + ext_factor, mscale, + w.hparams.rope_beta_fast, w.hparams.rope_beta_slow); // ── 2e. Permute into the layout flash_attn_ext wants // q: [n_embd_k=head_dim, n_batch=q_len, n_head, ne3] diff --git a/dflash/src/safetensors_draft.cpp b/dflash/src/safetensors_draft.cpp index 17fbc3f5..862801c3 100644 --- a/dflash/src/safetensors_draft.cpp +++ b/dflash/src/safetensors_draft.cpp @@ -51,6 +51,8 @@ #include #endif +#include +#include #include #include @@ -307,11 +309,109 @@ static void bf16_to_f32_array(const uint16_t * src, float * dst, size_t n) { } } +static bool parse_config_json(const std::string & model_dir, DraftHparams & hp) { + std::string cfg_path = model_dir + "/config.json"; + std::ifstream ifs(cfg_path); + if (!ifs.is_open()) return false; + std::stringstream ss; + ss << ifs.rdbuf(); + std::string json = ss.str(); + + auto find_number = [&](const std::string & key, double def) -> double { + auto pos = json.find("\"" + key + "\""); + if (pos == std::string::npos) return def; + auto colon = json.find(':', pos + key.size() + 2); + if (colon == std::string::npos) return def; + size_t start = colon + 1; + while (start < json.size() && (json[start] == ' ' || json[start] == '\t' || json[start] == '\n' || json[start] == '\r')) start++; + if (start >= json.size()) return def; + char * end = nullptr; + double v = std::strtod(json.c_str() + start, &end); + if (end == json.c_str() + start) return def; + return v; + }; + + auto find_int = [&](const std::string & key, int def) -> int { + return (int)find_number(key, (double)def); + }; + + hp.n_layer = find_int("num_hidden_layers", hp.n_layer); + hp.hidden = find_int("hidden_size", hp.hidden); + hp.n_head = find_int("num_attention_heads", hp.n_head); + hp.n_kv_head = find_int("num_key_value_heads", hp.n_kv_head); + hp.head_dim = find_int("head_dim", -1); + if (hp.head_dim <= 0) hp.head_dim = hp.hidden / hp.n_head; + hp.intermediate = find_int("intermediate_size", hp.intermediate); + hp.block_size = find_int("block_size", hp.block_size); + hp.mask_token_id = find_int("mask_token_id", hp.mask_token_id); + hp.rope_theta = (float)find_number("rope_theta", hp.rope_theta); + hp.rms_eps = (float)find_number("rms_norm_eps", hp.rms_eps); + + { + auto pos = json.find("\"dflash_config\""); + if (pos != std::string::npos) { + hp.mask_token_id = find_int("mask_token_id", hp.mask_token_id); + auto ids_pos = json.find("\"target_layer_ids\"", pos); + if (ids_pos != 
std::string::npos) { + auto bracket = json.find('[', ids_pos); + if (bracket != std::string::npos) { + int count = 0; + auto end_bracket = json.find(']', bracket + 1); + if (end_bracket != std::string::npos) { + std::string arr = json.substr(bracket + 1, end_bracket - bracket - 1); + const char * p = arr.c_str(); + while (*p) { + while (*p && (*p == ' ' || *p == ',' || *p == '\t' || *p == '\n' || *p == '\r')) p++; + if (!*p) break; + char * ep = nullptr; + std::strtol(p, &ep, 10); + if (ep == p) break; + count++; + p = ep; + } + } + if (count > 0) hp.n_target_layers = count; + } + } + } + } + + { + auto pos = json.find("\"rope_scaling\""); + if (pos != std::string::npos) { + hp.rope_factor = (float)find_number("factor", hp.rope_factor); + hp.rope_beta_fast = (float)find_number("beta_fast", hp.rope_beta_fast); + hp.rope_beta_slow = (float)find_number("beta_slow", hp.rope_beta_slow); + hp.rope_orig_ctx = find_int("original_max_position_embeddings", hp.rope_orig_ctx); + } + } + + return true; +} + } // namespace bool load_draft_safetensors(const std::string & path, ggml_backend_t backend, DraftWeights & out) { + // ── 0. Try to load config.json from the model directory ────── + { + std::string model_dir; + auto slash = path.rfind('/'); +#if defined(_WIN32) + auto bslash = path.rfind('\\'); + if (bslash != std::string::npos && (slash == std::string::npos || bslash > slash)) + slash = bslash; +#endif + if (slash != std::string::npos) + model_dir = path.substr(0, slash); + else + model_dir = "."; + if (!parse_config_json(model_dir, out.hparams)) { + out.hparams = DraftHparams{}; + } + } + // ── 1. Open + mmap ──────────────────────────────────────────── Mmap mm; std::string err; @@ -334,9 +434,9 @@ bool load_draft_safetensors(const std::string & path, const uint8_t * blob = (const uint8_t *)mm.addr + 8 + header_len; const size_t blob_len = mm.len - 8 - header_len; - // ── 3. Allocate ggml context big enough for 5 layers × 11 + 3 top ─ - const int n_layers = DFLASH27B_DRAFT_LAYERS; - const int n_tensors = 3 + 11 * n_layers; // with some headroom below + // ── 3. Allocate ggml context ────────────────────────────────── + const int n_layers = out.hparams.n_layer; + const int n_tensors = 3 + 11 * n_layers; ggml_init_params ip{}; ip.mem_size = (size_t)(n_tensors + 16) * ggml_tensor_overhead(); ip.mem_buffer = nullptr; @@ -346,12 +446,12 @@ bool load_draft_safetensors(const std::string & path, out.backend = backend; out.layers.assign(n_layers, DraftLayer{}); - const int64_t HIDDEN = DFLASH27B_TARGET_HIDDEN; // 5120 - const int64_t Q_DIM = DFLASH27B_TARGET_N_HEADS * DFLASH27B_TARGET_HEAD_DIM; // 4096 - const int64_t KV_DIM = DFLASH27B_TARGET_N_KV_HEADS * DFLASH27B_TARGET_HEAD_DIM; // 1024 - const int64_t INTER = DFLASH27B_TARGET_INTERMEDIATE; // 17408 - const int64_t HD = DFLASH27B_TARGET_HEAD_DIM; // 128 - const int64_t FC_IN = DFLASH27B_DRAFT_N_TARGET_LAYERS * HIDDEN; // 25600 + const int64_t HIDDEN = out.hparams.hidden; + const int64_t Q_DIM = (int64_t)out.hparams.n_head * out.hparams.head_dim; + const int64_t KV_DIM = (int64_t)out.hparams.n_kv_head * out.hparams.head_dim; + const int64_t INTER = out.hparams.intermediate; + const int64_t HD = out.hparams.head_dim; + const int64_t FC_IN = (int64_t)out.hparams.n_target_layers * HIDDEN; // ── 4. 
Create named tensors in the context ─────────────────── // diff --git a/dflash/test/smoke_draft_graph.cpp b/dflash/test/smoke_draft_graph.cpp index 16672216..a1d90c5c 100644 --- a/dflash/test/smoke_draft_graph.cpp +++ b/dflash/test/smoke_draft_graph.cpp @@ -52,13 +52,7 @@ int main(int argc, char ** argv) { } const char * path = argv[1]; const int ctx_len = (argc >= 3) ? std::atoi(argv[2]) : 64; - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; // 16 - const int hidden = DFLASH27B_TARGET_HIDDEN; // 5120 - const int fc_in = DFLASH27B_DRAFT_N_TARGET_LAYERS * hidden; // 25600 - std::printf("ctx_len=%d q_len=%d hidden=%d fc_in=%d\n", ctx_len, q_len, hidden, fc_in); - - // ── 1. Backend + weights ggml_backend_t backend = ggml_backend_cuda_init(0); if (!backend) { std::fprintf(stderr, "ggml_backend_cuda_init failed\n"); return 1; } @@ -69,6 +63,12 @@ int main(int argc, char ** argv) { } std::printf("draft loaded\n"); + const int hidden_actual = w.hparams.hidden; + const int q_len_actual = w.hparams.block_size; + const int fc_in_actual = w.hparams.n_target_layers * hidden_actual; + + std::printf("ctx_len=%d q_len=%d hidden=%d fc_in=%d\n", ctx_len, q_len_actual, hidden_actual, fc_in_actual); + // ── 2. Graph context (separate from weights context) const size_t mem_size = 256 * 1024 * 1024; // 256 MB — plenty for nodes ggml_init_params ip{}; @@ -81,10 +81,10 @@ int main(int argc, char ** argv) { // ── 3. Input placeholder tensors // Activations flow as F32 through the graph (CUDA rms_norm requires F32). // Weights stay bf16 — ggml_mul_mat auto-casts. - ggml_tensor * noise_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, hidden, q_len, 1); - ggml_tensor * target_hid = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, fc_in, ctx_len, 1); - ggml_tensor * pos_q = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, q_len); - ggml_tensor * pos_k = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, ctx_len + q_len); + ggml_tensor * noise_embed = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, hidden_actual, q_len_actual, 1); + ggml_tensor * target_hid = ggml_new_tensor_3d(gctx, GGML_TYPE_F32, fc_in_actual, ctx_len, 1); + ggml_tensor * pos_q = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, q_len_actual); + ggml_tensor * pos_k = ggml_new_tensor_1d(gctx, GGML_TYPE_I32, ctx_len + q_len_actual); ggml_set_name(noise_embed, "noise_embed"); ggml_set_name(target_hid, "target_hidden_cat"); ggml_set_name(pos_q, "positions_q"); @@ -122,23 +122,23 @@ int main(int argc, char ** argv) { std::uniform_real_distribution u(-0.02f, 0.02f); { - std::vector data((size_t)hidden * q_len); + std::vector data((size_t)hidden_actual * q_len_actual); for (auto & v : data) v = u(rng); ggml_backend_tensor_set(noise_embed, data.data(), 0, sizeof(float) * data.size()); } { - std::vector data((size_t)fc_in * ctx_len); + std::vector data((size_t)fc_in_actual * ctx_len); for (auto & v : data) v = u(rng); ggml_backend_tensor_set(target_hid, data.data(), 0, sizeof(float) * data.size()); } { - std::vector pq(q_len); - for (int i = 0; i < q_len; i++) pq[i] = ctx_len + i; + std::vector pq(q_len_actual); + for (int i = 0; i < q_len_actual; i++) pq[i] = ctx_len + i; ggml_backend_tensor_set(pos_q, pq.data(), 0, sizeof(int32_t) * pq.size()); } { - std::vector pk(ctx_len + q_len); - for (int i = 0; i < ctx_len + q_len; i++) pk[i] = i; + std::vector pk(ctx_len + q_len_actual); + for (int i = 0; i < ctx_len + q_len_actual; i++) pk[i] = i; ggml_backend_tensor_set(pos_k, pk.data(), 0, sizeof(int32_t) * pk.size()); } @@ -152,8 +152,8 @@ int main(int argc, char ** argv) { // ── 8. 
Read output, check shape + no NaN + print summary stats const size_t n_out_elems = ggml_nelements(go.hidden_states); - if (n_out_elems != (size_t)hidden * q_len) { - std::fprintf(stderr, "out elems mismatch: %zu vs %d\n", n_out_elems, hidden * q_len); + if (n_out_elems != (size_t)hidden_actual * q_len_actual) { + std::fprintf(stderr, "out elems mismatch: %zu vs %d\n", n_out_elems, hidden_actual * q_len_actual); return 1; } std::vector out(n_out_elems); diff --git a/dflash/test/smoke_load_moe_draft.cpp b/dflash/test/smoke_load_moe_draft.cpp new file mode 100644 index 00000000..7125171d --- /dev/null +++ b/dflash/test/smoke_load_moe_draft.cpp @@ -0,0 +1,97 @@ +// Smoke test: load 35B-A3B DFlash draft model from safetensors. +// Validates that DraftHparams are read correctly from config.json, +// tensor shapes match, and all 8 layers are loaded. +// +// Usage: smoke_load_moe_draft + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + DraftWeights w; + if (!load_draft_safetensors(argv[1], backend, w)) { + std::fprintf(stderr, "load_draft_safetensors: %s\n", dflash27b_last_error()); + return 1; + } + + bool ok = true; + auto & hp = w.hparams; + + std::printf("[hparams] n_layer=%d hidden=%d n_head=%d n_kv_head=%d head_dim=%d " + "intermediate=%d n_target_layers=%d block_size=%d\n", + hp.n_layer, hp.hidden, hp.n_head, hp.n_kv_head, hp.head_dim, + hp.intermediate, hp.n_target_layers, hp.block_size); + std::printf("[rope] theta=%.0f factor=%.1f beta_fast=%.1f beta_slow=%.1f orig_ctx=%d\n", + hp.rope_theta, hp.rope_factor, hp.rope_beta_fast, hp.rope_beta_slow, hp.rope_orig_ctx); + + if (hp.n_layer != 8) { std::fprintf(stderr, "FAIL: n_layer=%d expected 8\n", hp.n_layer); ok = false; } + if (hp.hidden != 2048) { std::fprintf(stderr, "FAIL: hidden=%d expected 2048\n", hp.hidden); ok = false; } + if (hp.n_head != 32) { std::fprintf(stderr, "FAIL: n_head=%d expected 32\n", hp.n_head); ok = false; } + if (hp.n_kv_head != 4) { std::fprintf(stderr, "FAIL: n_kv_head=%d expected 4\n", hp.n_kv_head); ok = false; } + if (hp.head_dim != 128) { std::fprintf(stderr, "FAIL: head_dim=%d expected 128\n", hp.head_dim); ok = false; } + if (hp.intermediate != 6144){ std::fprintf(stderr, "FAIL: intermediate=%d expected 6144\n", hp.intermediate); ok = false; } + if (hp.n_target_layers != 5){ std::fprintf(stderr, "FAIL: n_target_layers=%d expected 5\n", hp.n_target_layers); ok = false; } + if (hp.block_size != 16) { std::fprintf(stderr, "FAIL: block_size=%d expected 16\n", hp.block_size); ok = false; } + if (hp.rope_factor != 64.0f){ std::fprintf(stderr, "FAIL: rope_factor=%.1f expected 64.0\n", hp.rope_factor); ok = false; } + if (hp.rope_beta_fast != 32.0f){ std::fprintf(stderr, "FAIL: rope_beta_fast=%.1f expected 32.0\n", hp.rope_beta_fast); ok = false; } + if (hp.rope_beta_slow != 1.0f) { std::fprintf(stderr, "FAIL: rope_beta_slow=%.1f expected 1.0\n", hp.rope_beta_slow); ok = false; } + if (hp.rope_orig_ctx != 4096) { std::fprintf(stderr, "FAIL: rope_orig_ctx=%d expected 4096\n", hp.rope_orig_ctx); ok = false; } + + if ((int)w.layers.size() != hp.n_layer) { + std::fprintf(stderr, "FAIL: layers.size=%zu expected %d\n", w.layers.size(), hp.n_layer); + 
ok = false; + } + + std::printf("[fc] ne=[%lld,%lld]\n", (long long)w.fc->ne[0], (long long)w.fc->ne[1]); + const int64_t fc_in = hp.n_target_layers * hp.hidden; + if (w.fc->ne[1] != hp.hidden || w.fc->ne[0] != fc_in) { + std::fprintf(stderr, "FAIL: fc shape [%lld,%lld] expected [%lld,%lld]\n", + (long long)w.fc->ne[0], (long long)w.fc->ne[1], (long long)fc_in, (long long)hp.hidden); + ok = false; + } + + if (!w.layers.empty()) { + auto & L0 = w.layers[0]; + const int64_t q_dim = hp.n_head * hp.head_dim; + const int64_t kv_dim = hp.n_kv_head * hp.head_dim; + std::printf("[layer0] wq=[%lld,%lld] wk=[%lld,%lld] w_gate=[%lld,%lld]\n", + (long long)L0.wq->ne[0], (long long)L0.wq->ne[1], + (long long)L0.wk->ne[0], (long long)L0.wk->ne[1], + (long long)L0.w_gate->ne[0], (long long)L0.w_gate->ne[1]); + if (L0.wq->ne[1] != q_dim || L0.wq->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: wq shape mismatch\n"); ok = false; + } + if (L0.wk->ne[1] != kv_dim || L0.wk->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: wk shape mismatch\n"); ok = false; + } + if (L0.w_gate->ne[1] != hp.intermediate || L0.w_gate->ne[0] != hp.hidden) { + std::fprintf(stderr, "FAIL: w_gate shape mismatch\n"); ok = false; + } + } + + free_draft_weights(w); + ggml_backend_free(backend); + + if (ok) { + std::printf("OK\n"); + return 0; + } + return 1; +} diff --git a/dflash/test/smoke_load_moe_target.cpp b/dflash/test/smoke_load_moe_target.cpp new file mode 100644 index 00000000..5a381cf4 --- /dev/null +++ b/dflash/test/smoke_load_moe_target.cpp @@ -0,0 +1,194 @@ +// Smoke test for the GGUF target loader with Qwen3.6-35B-A3B MoE model. +// Validates that the loader accepts the "qwen35moe" architecture, reads MoE +// hyperparameters, and wires expert tensors correctly. +// +// Usage: smoke_load_moe_target + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "FAIL: load_target_gguf: %s\n", dflash27b_last_error()); + return 1; + } + std::printf("%s\n", dflash27b_last_error()); + + // Validate MoE-specific hyperparameters + bool ok = true; + + if (w.n_layer != 40) { + std::fprintf(stderr, "FAIL: n_layer=%d expected 40\n", w.n_layer); + ok = false; + } + if (w.n_embd != 2048) { + std::fprintf(stderr, "FAIL: n_embd=%d expected 2048\n", w.n_embd); + ok = false; + } + if (w.n_head != 16) { + std::fprintf(stderr, "FAIL: n_head=%d expected 16\n", w.n_head); + ok = false; + } + if (w.n_head_kv != 2) { + std::fprintf(stderr, "FAIL: n_head_kv=%d expected 2\n", w.n_head_kv); + ok = false; + } + if (w.n_embd_head_k != 256 || w.n_embd_head_v != 256) { + std::fprintf(stderr, "FAIL: head_dim k=%d v=%d expected 256/256\n", + w.n_embd_head_k, w.n_embd_head_v); + ok = false; + } + if (w.full_attention_interval != 4) { + std::fprintf(stderr, "FAIL: fai=%d expected 4\n", w.full_attention_interval); + ok = false; + } + if (w.ssm_d_inner != 4096) { + std::fprintf(stderr, "FAIL: ssm_d_inner=%d expected 4096\n", w.ssm_d_inner); + ok = false; + } + if (w.ssm_dt_rank != 32) { + std::fprintf(stderr, "FAIL: ssm_dt_rank=%d expected 32\n", w.ssm_dt_rank); + ok = false; + 
} + if (w.ssm_d_state != 128) { + std::fprintf(stderr, "FAIL: ssm_d_state=%d expected 128\n", w.ssm_d_state); + ok = false; + } + if (w.ssm_n_group != 16) { + std::fprintf(stderr, "FAIL: ssm_n_group=%d expected 16\n", w.ssm_n_group); + ok = false; + } + + // MoE-specific: expert fields + if (w.n_expert != 256) { + std::fprintf(stderr, "FAIL: n_expert=%d expected 256\n", w.n_expert); + ok = false; + } + if (w.n_expert_used != 8) { + std::fprintf(stderr, "FAIL: n_expert_used=%d expected 8\n", w.n_expert_used); + ok = false; + } + if (w.expert_ff_dim != 512) { + std::fprintf(stderr, "FAIL: expert_ff_dim=%d expected 512\n", w.expert_ff_dim); + ok = false; + } + if (w.shared_ff_dim != 512) { + std::fprintf(stderr, "FAIL: shared_ff_dim=%d expected 512\n", w.shared_ff_dim); + ok = false; + } + + // Count layer types + int n_attn = 0, n_delta = 0; + int n_expert_layers = 0; + for (int il = 0; il < w.n_layer; il++) { + const auto & L = w.layers[il]; + bool attn = L.wq && L.wk && L.wv && L.wo; + bool ssm = L.wqkv && L.wqkv_gate && L.ssm_conv1d; + if (attn) n_attn++; + if (ssm) n_delta++; + if (L.ffn_gate_inp) n_expert_layers++; + } + + std::printf("hparams: n_layer=%d n_embd=%d n_head=%d n_head_kv=%d head_dim=%d/%d fai=%d\n", + w.n_layer, w.n_embd, w.n_head, w.n_head_kv, w.n_embd_head_k, w.n_embd_head_v, + w.full_attention_interval); + std::printf("ssm: conv=%d inner=%d state=%d dt_rank=%d n_group=%d\n", + w.ssm_d_conv, w.ssm_d_inner, w.ssm_d_state, w.ssm_dt_rank, w.ssm_n_group); + std::printf("moe: n_expert=%d n_expert_used=%d expert_ff=%d shared_ff=%d\n", + w.n_expert, w.n_expert_used, w.expert_ff_dim, w.shared_ff_dim); + std::printf("layer counts: full_attn=%d delta_net=%d expert_layers=%d\n", + n_attn, n_delta, n_expert_layers); + + if (n_attn != 10) { + std::fprintf(stderr, "FAIL: expected 10 full-attn layers, got %d\n", n_attn); + ok = false; + } + if (n_delta != 30) { + std::fprintf(stderr, "FAIL: expected 30 delta-net layers, got %d\n", n_delta); + ok = false; + } + if (n_expert_layers != 40) { + std::fprintf(stderr, "FAIL: expected 40 layers with ffn_gate_inp, got %d\n", + n_expert_layers); + ok = false; + } + + // Verify expert tensors exist on layer 0 + { + const auto & L = w.layers[0]; + if (!L.ffn_gate_inp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_inp\n"); + ok = false; + } + if (!L.ffn_up_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_up_exps\n"); + ok = false; + } + if (!L.ffn_gate_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_exps\n"); + ok = false; + } + if (!L.ffn_down_exps) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_down_exps\n"); + ok = false; + } + if (!L.ffn_up_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_up_shexp\n"); + ok = false; + } + if (!L.ffn_gate_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_shexp\n"); + ok = false; + } + if (!L.ffn_down_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_down_shexp\n"); + ok = false; + } + if (!L.ffn_gate_inp_shexp) { + std::fprintf(stderr, "FAIL: layer 0 missing ffn_gate_inp_shexp\n"); + ok = false; + } + + if (L.ffn_gate_inp) { + std::printf("layer 0 ffn_gate_inp: [%lld, %lld] type=%s\n", + (long long)L.ffn_gate_inp->ne[0], (long long)L.ffn_gate_inp->ne[1], + ggml_type_name(L.ffn_gate_inp->type)); + } + if (L.ffn_up_exps) { + std::printf("layer 0 ffn_up_exps: [%lld, %lld, %lld] type=%s\n", + (long long)L.ffn_up_exps->ne[0], (long long)L.ffn_up_exps->ne[1], + (long long)L.ffn_up_exps->ne[2], ggml_type_name(L.ffn_up_exps->type)); + } + } + + 
free_target_weights(w); + ggml_backend_free(backend); + + if (ok) { + std::printf("OK\n"); + return 0; + } else { + return 1; + } +} diff --git a/dflash/test/smoke_moe_ffn.cpp b/dflash/test/smoke_moe_ffn.cpp new file mode 100644 index 00000000..fa50d53f --- /dev/null +++ b/dflash/test/smoke_moe_ffn.cpp @@ -0,0 +1,128 @@ +// Smoke test: single MoE FFN layer forward pass. +// Loads the 35B-A3B MoE GGUF, picks layer 0, feeds a random vector through +// just the MoE FFN (post-attention-norm → expert routing → shared expert → output), +// and prints the output shape + a spot value. +// +// Usage: smoke_moe_ffn + +#include "dflash27b.h" +#include "internal.h" + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cuda.h" + +#include +#include +#include +#include +#include +#include + +using namespace dflash27b; + +int main(int argc, char ** argv) { + if (argc < 2) { + std::fprintf(stderr, "usage: %s \n", argv[0]); + return 2; + } + + ggml_backend_t backend = ggml_backend_cuda_init(0); + if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; } + + TargetWeights w; + if (!load_target_gguf(argv[1], backend, w)) { + std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error()); + return 1; + } + + if (w.n_expert == 0) { + std::fprintf(stderr, "FAIL: expected MoE model, got dense (n_expert=0)\n"); + return 1; + } + + // Build a simple graph: MoE FFN on a single token + const int n_tokens = 1; + const int n_embd = w.n_embd; + + ggml_context * ctx = nullptr; + { + struct ggml_init_params params = {}; + params.mem_size = 256 * 1024 * 1024; + params.no_alloc = true; + ctx = ggml_init(params); + } + + ggml_cgraph * gf = ggml_new_graph(ctx); + + // Input: random-ish values (just use 1.0 everywhere) + ggml_tensor * inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_name(inp, "inp"); + + // Apply post-attention norm (RMS norm) + const float eps = 1e-6f; + ggml_tensor * normed = ggml_rms_norm(ctx, inp, eps); + normed = ggml_mul(ctx, normed, w.layers[0].attn_post_norm); + + // MoE FFN + ggml_tensor * moe_out = build_moe_ffn(ctx, gf, normed, w.layers[0], w); + + // Residual add + ggml_tensor * out = ggml_add(ctx, moe_out, inp); + ggml_set_name(out, "out"); + ggml_set_output(out); + + ggml_build_forward_expand(gf, out); + + // Allocate + ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + std::fprintf(stderr, "FAIL: graph alloc failed\n"); + return 1; + } + + // Set input data + std::vector inp_data(n_embd * n_tokens, 1.0f); + ggml_backend_tensor_set(inp, inp_data.data(), 0, sizeof(float) * inp_data.size()); + + // Compute + ggml_backend_graph_compute(backend, gf); + + // Read output + std::vector out_data(n_embd * n_tokens); + ggml_backend_tensor_get(out, out_data.data(), 0, sizeof(float) * out_data.size()); + + std::printf("MoE FFN output shape: [%lld, %lld]\n", + (long long)out->ne[0], (long long)out->ne[1]); + std::printf("First 8 values: "); + for (int i = 0; i < 8 && i < (int)out_data.size(); i++) { + std::printf("%.6f ", out_data[i]); + } + std::printf("\n"); + + // Sanity: output should not be all zeros or NaN/Inf + bool ok = true; + bool all_zero = true; + bool has_nan = false; + bool has_inf = false; + for (auto v : out_data) { + if (v != 0.0f) all_zero = false; + if (std::isnan(v)) has_nan = true; + if (std::isinf(v)) has_inf = true; + } + if (all_zero) { std::fprintf(stderr, "FAIL: output is all zeros\n"); ok = false; } + if (has_nan) { 
+    if (has_nan) { std::fprintf(stderr, "FAIL: output contains NaN\n"); ok = false; }
+    if (has_inf) { std::fprintf(stderr, "FAIL: output contains Inf\n"); ok = false; }
+
+    ggml_gallocr_free(alloc);
+    ggml_free(ctx);
+    free_target_weights(w);
+    ggml_backend_free(backend);
+
+    if (ok) {
+        std::printf("OK\n");
+        return 0;
+    }
+    return 1;
+}
diff --git a/dflash/test/smoke_moe_target_forward.cpp b/dflash/test/smoke_moe_target_forward.cpp
new file mode 100644
index 00000000..e1ce5986
--- /dev/null
+++ b/dflash/test/smoke_moe_target_forward.cpp
@@ -0,0 +1,162 @@
+// Smoke test: full MoE target forward pass.
+// Loads Qwen3.6-35B-A3B, creates cache, runs 1-token decode through all 40 layers,
+// validates logits are sane.
+//
+// Usage: smoke_moe_target_forward <model.gguf>
+
+#include "dflash27b.h"
+#include "internal.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cuda.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace dflash27b;
+
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
+        return 2;
+    }
+
+    ggml_backend_t backend = ggml_backend_cuda_init(0);
+    if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
+
+    TargetWeights w;
+    if (!load_target_gguf(argv[1], backend, w)) {
+        std::fprintf(stderr, "load_target_gguf: %s\n", dflash27b_last_error());
+        return 1;
+    }
+    std::printf("[target] %s\n", dflash27b_last_error());
+
+    if (w.n_expert == 0) {
+        std::fprintf(stderr, "FAIL: expected MoE model\n");
+        return 1;
+    }
+
+    // Create cache
+    TargetCache cache;
+    const int max_ctx = 512;
+    if (!create_target_cache(w, max_ctx, 0, backend, cache)) {
+        std::fprintf(stderr, "FAIL: create_target_cache: %s\n", dflash27b_last_error());
+        return 1;
+    }
+    std::printf("[cache] attn_k=%zu attn_v=%zu ssm=%zu conv=%zu\n",
+                cache.attn_k.size(), cache.attn_v.size(),
+                cache.ssm_state.size(), cache.conv_state.size());
+
+    // Validate cache dimensions
+    bool ok = true;
+    // 10 full-attn layers (indices 3,7,11,15,19,23,27,31,35,39)
+    if (cache.attn_k.size() != 10) {
+        std::fprintf(stderr, "FAIL: expected 10 attn_k, got %zu\n", cache.attn_k.size());
+        ok = false;
+    }
+    // 30 delta-net layers
+    if (cache.ssm_state.size() != 30) {
+        std::fprintf(stderr, "FAIL: expected 30 ssm_state, got %zu\n", cache.ssm_state.size());
+        ok = false;
+    }
+
+    // Embed a single token
+    const int n_tokens = 1;
+    const int token_id = 1; // arbitrary
+    std::vector<float> embed_data(w.n_embd * n_tokens);
+    if (!w.embedder.embed(&token_id, n_tokens, embed_data.data())) {
+        std::fprintf(stderr, "FAIL: embedder.embed failed\n");
+        return 1;
+    }
+
+    // Build graph
+    ggml_context * ctx = nullptr;
+    {
+        struct ggml_init_params params = {};
+        params.mem_size = 512 * 1024 * 1024;
+        params.no_alloc = true;
+        ctx = ggml_init(params);
+    }
+
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, false);
+
+    ggml_tensor * inp_embed = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w.n_embd, n_tokens, 1);
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4 * n_tokens);
+    ggml_set_name(inp_embed, "inp_embed");
+    ggml_set_name(positions, "positions");
+    ggml_set_input(inp_embed);
+    ggml_set_input(positions);
+
+    QwenGraphInputs in;
+    in.inp_embed = inp_embed;
+    in.positions = positions;
+    in.n_tokens = n_tokens;
+    in.kv_start = 0;
+    in.attn_mask = nullptr; // single token, no mask needed
+    in.capture_layers = false;
+    in.capture_delta_intermediate = false;
+
+    QwenGraphOutputs og = build_qwen35_graph(ctx, gf, w, cache, in);
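+    // Note (illustrative): with a single token the causal structure is implicit, which is
+    // why no attn_mask is passed above. A multi-token run through this graph would also
+    // have to supply an F16 mask sized [align_up(n_kv, stride_pad), align_up(n_tokens, KQ_MASK_PAD)],
+    // the way the tree-verify builders in test_dflash.cpp do.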
+    ggml_set_output(og.logits);
+
+    ggml_build_forward_expand(gf, og.logits);
+    std::printf("[graph] nodes=%d\n", ggml_graph_n_nodes(gf));
+
+    // Allocate + set input
+    ggml_gallocr_t alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+        std::fprintf(stderr, "FAIL: graph alloc failed\n");
+        return 1;
+    }
+    ggml_backend_tensor_set(inp_embed, embed_data.data(), 0, sizeof(float) * embed_data.size());
+    int32_t pos4[4] = { 0, 0, 0, 0 };
+    ggml_backend_tensor_set(positions, pos4, 0, sizeof(int32_t) * 4);
+
+    // Compute
+    if (ggml_backend_graph_compute(backend, gf) != GGML_STATUS_SUCCESS) {
+        std::fprintf(stderr, "FAIL: graph compute failed\n");
+        return 1;
+    }
+
+    // Read logits
+    std::printf("[logits_tensor] ne=[%lld,%lld,%lld]\n",
+                (long long)og.logits->ne[0], (long long)og.logits->ne[1], (long long)og.logits->ne[2]);
+    const int64_t vocab = og.logits->ne[0];
+    const int64_t logits_count = vocab * og.logits->ne[1];
+    std::vector<float> logits(logits_count);
+    ggml_backend_tensor_get(og.logits, logits.data(), 0, sizeof(float) * logits.size());
+
+    int n_nan = 0, n_inf = 0;
+    float min_l = INFINITY, max_l = -INFINITY;
+    for (int64_t i = 0; i < logits_count; i++) {
+        auto v = logits[i];
+        if (std::isnan(v)) n_nan++;
+        if (std::isinf(v)) n_inf++;
+        if (v < min_l) min_l = v;
+        if (v > max_l) max_l = v;
+    }
+    std::printf("[logits] vocab=%lld nan=%d inf=%d min=%.2f max=%.2f\n",
+                (long long)vocab, n_nan, n_inf, min_l, max_l);
+
+    if (n_nan > 0) { std::fprintf(stderr, "FAIL: logits contain NaN\n"); ok = false; }
+    if (n_inf > 0) { std::fprintf(stderr, "FAIL: logits contain Inf\n"); ok = false; }
+    if (max_l - min_l < 1.0f) { std::fprintf(stderr, "FAIL: logits range too small, possible constant output\n"); ok = false; }
+
+    ggml_gallocr_free(alloc);
+    ggml_free(ctx);
+    free_target_cache(cache);
+    free_target_weights(w);
+    ggml_backend_free(backend);
+
+    if (ok) {
+        std::printf("OK\n");
+        return 0;
+    }
+    return 1;
+}
diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 14b020f8..5a6a904e 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -615,7 +615,7 @@ static bool build_target_step(
     sg.ctx = ggml_init(ip);
     if (!sg.ctx) return false;
 
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
     sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1);
     ggml_set_name(sg.inp_embed, "inp_embed");
     ggml_set_input(sg.inp_embed);
@@ -684,7 +684,7 @@ static bool build_target_step_tree(
     sg.ctx = ggml_init(ip);
     if (!sg.ctx) return false;
 
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
     sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1);
     ggml_set_name(sg.inp_embed, "inp_embed");
     ggml_set_input(sg.inp_embed);
@@ -731,6 +731,91 @@ static bool build_target_step_tree(
     return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
 }
 
+// Reusable tree-verify graph: fixed n_tokens and kv_start so CUDA graphs can
+// replay across steps. Writes K/V to scratch slots [ws..ws+max_n-1] and
+// the attention mask covers [0..ws+max_n). After compute the caller copies
+// K/V from scratch slots to the real committed positions.
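+//
+// Worked example (hypothetical sizes): with max_ctx = 512 and max_n = ddtree_budget + 1 = 9,
+// ws = 512 - 9 = 503. DFS slot i writes its K/V at position 503 + i during verify; after
+// acceptance, slot accepted[d] is copied down to position committed + d, so the next step
+// sees a contiguous committed prefix while the graph itself never changes shape.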
+static bool build_target_step_tree_reusable( + StepGraph & sg, + const TargetWeights & w, + TargetCache & cache, + ggml_backend_t backend, + int max_ctx, + int max_n) +{ + step_graph_free(sg); + + ggml_init_params ip{}; + ip.mem_size = 512 * 1024 * 1024; + ip.mem_buffer = nullptr; + ip.no_alloc = true; + sg.ctx = ggml_init(ip); + if (!sg.ctx) return false; + + const int hidden = w.n_embd; + const int write_start = max_ctx - max_n; + + sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, max_n, 1); + ggml_set_name(sg.inp_embed, "inp_embed"); + ggml_set_input(sg.inp_embed); + + sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, 4 * max_n); + ggml_set_name(sg.positions, "positions"); + ggml_set_input(sg.positions); + + const int kv_pad = align_up(max_ctx, g_kq_stride_pad); + const int q_pad = align_up(max_n, KQ_MASK_PAD); + sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, kv_pad, q_pad); + ggml_set_name(sg.attn_mask, "attn_mask"); + ggml_set_input(sg.attn_mask); + + sg.parent_ids = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, max_n); + ggml_set_name(sg.parent_ids, "parent_ids"); + ggml_set_input(sg.parent_ids); + + sg.gf = ggml_new_graph_custom(sg.ctx, 16384, false); + + QwenGraphInputs gi{}; + gi.inp_embed = sg.inp_embed; + gi.positions = sg.positions; + gi.attn_mask = sg.attn_mask; + gi.n_tokens = max_n; + gi.kv_start = write_start; + gi.capture_layers = true; + gi.capture_delta_intermediate = true; + gi.parent_ids = sg.parent_ids; + + QwenGraphOutputs go = build_qwen35_graph(sg.ctx, sg.gf, w, cache, gi); + if (!go.logits) return false; + sg.logits = go.logits; + sg.delta_captures = std::move(go.delta_captures); + ggml_set_output(sg.logits); + ggml_build_forward_expand(sg.gf, sg.logits); + + if (!sg.alloc) { + sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + if (!ggml_gallocr_reserve(sg.alloc, sg.gf)) return false; + return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); +} + +static void copy_kv_slots_batch( + const std::vector & cache_tensors, + int src_start, int dst_start, int n_slots, int n_heads) +{ + for (auto * ct : cache_tensors) { + const size_t slot_bytes = ct->nb[1]; + const size_t head_stride = ct->nb[2]; + const char * base = (const char *) ct->data; + for (int s = 0; s < n_slots; s++) { + cudaMemcpy2D( + (void *)(base + (size_t)(dst_start + s) * slot_bytes), head_stride, + (const void *)(base + (size_t)(src_start + s) * slot_bytes), head_stride, + slot_bytes, n_heads, cudaMemcpyDeviceToDevice); + } + } +} + static bool build_draft_step( StepGraph & sg, const DraftWeights & dw, @@ -746,9 +831,9 @@ static bool build_draft_step( sg.ctx = ggml_init(ip); if (!sg.ctx) return false; - const int hidden = DFLASH27B_TARGET_HIDDEN; - const int q_len = DFLASH27B_DRAFT_BLOCK_SIZE; - const int fc_in = DFLASH27B_DRAFT_N_TARGET_LAYERS * hidden; + const int hidden = tw.n_embd; + const int q_len = dw.hparams.block_size; + const int fc_in = dw.hparams.n_target_layers * hidden; sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, q_len, 1); ggml_set_name(sg.inp_embed, "inp_embed"); @@ -916,10 +1001,10 @@ int main(int argc, char ** argv) { // Profile mode intentionally keeps the intermediate cache tiny (no capture) // so we can go up to n_tokens=128 without OOM. const int max_verify_tokens = profile_scaling - ? DFLASH27B_DRAFT_BLOCK_SIZE + ? dw.hparams.block_size : (ddtree_mode - ? std::max(DFLASH27B_DRAFT_BLOCK_SIZE, ddtree_budget + 1) - : DFLASH27B_DRAFT_BLOCK_SIZE); + ? 
std::max(dw.hparams.block_size, ddtree_budget + 1)
+          : dw.hparams.block_size);
 
     TargetCache cache;
     if (!create_target_cache(w, max_ctx, max_verify_tokens, backend, cache, /*prefill_only=*/true)) {
@@ -929,7 +1014,7 @@ int main(int argc, char ** argv) {
 
     // ── Profile mode: microbench target forward at varying N ───────────
     if (profile_scaling) {
-        const int hidden_p = DFLASH27B_TARGET_HIDDEN;
+        const int hidden_p = w.n_embd;
         StepGraph psg;
         const int n_values[] = { 1, 4, 8, 12, 16, 20, 24, 32, 48, 64, 96, 128 };
         std::printf("[profile] target forward ms at varying N (kv_start=0, no capture)\n");
@@ -1499,6 +1584,22 @@ int main(int argc, char ** argv) {
     std::vector<int32_t> pos_q_buf(q_len), pos_k_buf(max_ctx + q_len);
     std::vector<int32_t> pos4_buf(4 * q_len);
 
+    // Reusable tree-verify graph for DDTree CUDA graph replay.
+    // Built once with max_n = budget+1 tokens and fixed kv_start.
+    StepGraph sg_tree;
+    const int ddtree_max_n = ddtree_budget + 1;
+    const int tree_write_start = max_ctx - ddtree_max_n;
+    const int tree_kv_pad = align_up(max_ctx, g_kq_stride_pad);
+    const int tree_q_pad = align_up(ddtree_max_n, KQ_MASK_PAD);
+    const int n_head_kv_tree = w.n_head_kv;
+    std::vector<uint16_t> tree_mask_buf((size_t)tree_kv_pad * tree_q_pad, F16_NEG_INF);
+    if (ddtree_mode) {
+        if (!build_target_step_tree_reusable(sg_tree, w, cache, backend,
+                                             max_ctx, ddtree_max_n)) {
+            std::fprintf(stderr, "reusable tree verify build failed\n"); return 1;
+        }
+    }
+
     auto t_gen0 = std::chrono::steady_clock::now();
 
     // Per-phase timing accumulators (microseconds)
@@ -1682,64 +1783,80 @@ int main(int argc, char ** argv) {
             const int N = 1 + tree.n_nodes; // flat size including root
 
-            if (!build_target_step_tree(sg, w, cache, backend,
-                                        /*kv_start=*/committed, /*n_tokens=*/N,
-                                        g_fa_window)) {
-                std::fprintf(stderr, "ddtree verify build failed\n"); return 1;
-            }
+            // Reuse pre-built graph — no rebuild needed
             T_verify_build = sync_us();
             tt_verify_build += std::chrono::duration<double, std::micro>(T_verify_build - T_snap).count();
 
             // Embeddings: [last_tok, tree.token_ids[0..n_nodes-1]]
-            std::vector<int32_t> flat_tokens(N);
+            // Pad to ddtree_max_n with last_tok (harmless — unused by the mask)
+            std::vector<int32_t> flat_tokens(ddtree_max_n, last_tok);
             flat_tokens[0] = last_tok;
             for (int i = 0; i < tree.n_nodes; i++) flat_tokens[1 + i] = tree.token_ids[i];
-            std::vector<float> tree_embed(hidden * N);
-            if (!w.embedder.embed(flat_tokens.data(), N, tree_embed.data())) return 1;
-            ggml_backend_tensor_set(sg.inp_embed, tree_embed.data(), 0,
-                                    sizeof(float) * hidden * N);
-
-            // M-RoPE axis-major positions: committed + depth_of_node.
-            // Slot 0 = root = depth 0 → position `committed`.
-            std::vector<int32_t> pos4(4 * N);
-            for (int i = 0; i < N; i++) {
-                int p = committed + (i == 0 ? 0 : tree.depths[i - 1]);
-                pos4[0 * N + i] = p;
-                pos4[1 * N + i] = p;
-                pos4[2 * N + i] = p;
-                pos4[3 * N + i] = 0;
+            std::vector<float> tree_embed(hidden * ddtree_max_n);
+            if (!w.embedder.embed(flat_tokens.data(), ddtree_max_n, tree_embed.data())) return 1;
+            ggml_backend_tensor_set(sg_tree.inp_embed, tree_embed.data(), 0,
+                                    sizeof(float) * hidden * ddtree_max_n);
+
+            // M-RoPE positions: committed + depth_of_node for real nodes;
+            // pad remaining slots with committed (harmless, masked out).
+            std::vector<int32_t> pos4(4 * ddtree_max_n);
+            for (int i = 0; i < ddtree_max_n; i++) {
+                int p = (i < N)
+                    ? committed + (i == 0 ?
0 : tree.depths[i - 1]) + : committed; + pos4[0 * ddtree_max_n + i] = p; + pos4[1 * ddtree_max_n + i] = p; + pos4[2 * ddtree_max_n + i] = p; + pos4[3 * ddtree_max_n + i] = 0; } - ggml_backend_tensor_set(sg.positions, pos4.data(), 0, sizeof(int32_t) * 4 * N); + ggml_backend_tensor_set(sg_tree.positions, pos4.data(), 0, + sizeof(int32_t) * 4 * ddtree_max_n); - // Ancestor-only attention mask (f16). + // Ancestor-only mask: fill the full fixed-size buffer. + // build_tree_mask writes [q_pad, kv_pad] but with variable kv_pad. + // We build into a temp buffer then copy, OR build directly into + // tree_mask_buf which is pre-sized for max dimensions. const int tree_win_start = (g_fa_window > 0 && committed > g_fa_window) ? (committed - g_fa_window) : 0; - build_tree_mask(tree, /*past_length=*/committed, mask_buf, tree_win_start); - ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0, - sizeof(uint16_t) * mask_buf.size()); - - // parent_ids for tree-mode DeltaNet kernel. - // Slot 0 (root): -1 (reload initial state — matches kernel's skip - // at t==0). Slots 1..N-1: the tree's parent index in the flat array. - std::vector parent_ids(N); + { + // Fill with -inf + tree_mask_buf.assign((size_t)tree_kv_pad * tree_q_pad, F16_NEG_INF); + // Past KV (windowed committed positions): always visible + for (int q = 0; q < N; q++) { + for (int k = tree_win_start; k < committed; k++) { + tree_mask_buf[(size_t)q * tree_kv_pad + k] = F16_ZERO; + } + // Tree region at scratch slots: ancestors only + for (int j = 0; j < N; j++) { + if (tree.visibility[(size_t)q * N + j]) { + tree_mask_buf[(size_t)q * tree_kv_pad + (tree_write_start + j)] = F16_ZERO; + } + } + } + } + ggml_backend_tensor_set(sg_tree.attn_mask, tree_mask_buf.data(), 0, + sizeof(uint16_t) * tree_mask_buf.size()); + + // parent_ids: pad to ddtree_max_n with -1 + std::vector parent_ids(ddtree_max_n, -1); parent_ids[0] = -1; for (int i = 1; i < N; i++) parent_ids[i] = (int32_t)tree.parents[i]; - ggml_backend_tensor_set(sg.parent_ids, parent_ids.data(), 0, - sizeof(int32_t) * N); + ggml_backend_tensor_set(sg_tree.parent_ids, parent_ids.data(), 0, + sizeof(int32_t) * ddtree_max_n); T_verify_set = sync_us(); tt_verify_set += std::chrono::duration(T_verify_set - T_verify_build).count(); - st = ggml_backend_graph_compute(backend, sg.gf); + st = ggml_backend_graph_compute(backend, sg_tree.gf); if (st != GGML_STATUS_SUCCESS) { std::fprintf(stderr, "ddtree verify compute %d\n", (int)st); return 1; } T_verify_compute = sync_us(); tt_verify_compute += std::chrono::duration(T_verify_compute - T_verify_set).count(); - // Read the N verify logits, compute posterior argmax per slot. 
- ggml_backend_tensor_get(sg.logits, verify_logits_buf.data(), 0, + // Read the N verify logits (only first N of ddtree_max_n slots) + ggml_backend_tensor_get(sg_tree.logits, verify_logits_buf.data(), 0, sizeof(float) * vocab * N); std::vector posterior(N); for (int i = 0; i < N; i++) { @@ -1825,10 +1942,10 @@ int main(int argc, char ** argv) { } { - const int n_delta = (int)sg.delta_captures.size(); + const int n_delta = (int)sg_tree.delta_captures.size(); cudaStream_t stream = nullptr; for (int il = 0; il < n_delta; il++) { - const DeltaNetCapture & cap = sg.delta_captures[il]; + const DeltaNetCapture & cap = sg_tree.delta_captures[il]; if (!cap.ssm_intermediate_states || !cap.conv_input) { std::fprintf(stderr, "ddtree rollback: missing capture layer %d\n", il); return 1; @@ -1905,24 +2022,20 @@ int main(int argc, char ** argv) { } } - // target_feat compaction: written in DFS order during verify - // (column kv_start+i = dfs slot i's features). Same logic as - // KV cache: when accepted[d] != d, copy the accepted DFS slot's - // features to the spine slot at d so next iter's draft reads - // the right history. Position→slot uses `% target_feat_cap` - // to account for the ring buffer. + // target_feat compaction: written at scratch slots + // (tree_write_start + dfs_idx) during verify. Copy accepted + // DFS slots' features to the committed spine positions. if (cache.target_feat) { const size_t elt = ggml_element_size(cache.target_feat); - const int fc_in = (int)cache.target_feat->ne[0]; // 5*hidden + const int fc_in = (int)cache.target_feat->ne[0]; const size_t col_stride = cache.target_feat->nb[1]; const int tcap = cache.target_feat_cap; - for (int d = 1; d < commit_n; d++) { + for (int d = 0; d < commit_n; d++) { const int src_dfs = accepted[d]; - if (src_dfs == d) continue; - const int src_slot = (committed + src_dfs) % tcap; - const int dst_slot = (committed + d) % tcap; - const size_t src_off = (size_t)src_slot * col_stride; - const size_t dst_off = (size_t)dst_slot * col_stride; + const int src_feat_slot = (tree_write_start + src_dfs) % tcap; + const int dst_feat_slot = (committed + d) % tcap; + const size_t src_off = (size_t)src_feat_slot * col_stride; + const size_t dst_off = (size_t)dst_feat_slot * col_stride; cudaMemcpyAsync((char *)cache.target_feat->data + dst_off, (const char *)cache.target_feat->data + src_off, (size_t)fc_in * elt, @@ -1930,34 +2043,22 @@ int main(int argc, char ** argv) { } } - // Full-attention KV compaction: the verify wrote K/V at slots - // [committed..committed+N-1] in DFS tree order (slot 0 = root). - // For the next iter's verify to see the correct committed - // prefix, slots [committed..committed+commit_n-1] must hold - // the K/V of the accepted path's committed tokens. For each - // committed position d in 0..commit_n-1, the source K/V is at - // DFS slot accepted[d]. d==0 is always the root (DFS slot 0), - // trivially aligned. For d>=1, copy if accepted[d] != d. + // Full-attention KV compaction: the verify wrote K/V at scratch + // slots [tree_write_start..tree_write_start+N-1]. Copy the + // accepted DFS slots to committed positions. const int n_full_attn = (int)cache.attn_k.size(); for (int d = 0; d < commit_n; d++) { const int src_dfs = accepted[d]; - const int dst_slot = d; - if (src_dfs == dst_slot) continue; // already aligned + const int src_slot = tree_write_start + src_dfs; + const int dst_slot = committed + d; for (int l = 0; l < n_full_attn; l++) { - // Each slot: head_dim * n_kv floats in f16 per tensor. 
ggml_tensor * ck = cache.attn_k[l]; ggml_tensor * cv = cache.attn_v[l]; - const size_t slot_bytes = ck->nb[1]; // stride between slots - const size_t src_off = (size_t)(committed + src_dfs) * slot_bytes; - const size_t dst_off = (size_t)(committed + dst_slot) * slot_bytes; - // Per-head-kv layout: shape [head_dim, max_ctx, n_head_kv]. - // nb[2] is distance between heads; we copy one slot's - // slice per head. For simplicity, do a 2D copy across - // the head dimension. + const size_t slot_bytes = ck->nb[1]; const int n_kv = (int)ck->ne[2]; for (int h = 0; h < n_kv; h++) { - const size_t head_src = src_off + (size_t)h * ck->nb[2]; - const size_t head_dst = dst_off + (size_t)h * ck->nb[2]; + const size_t head_src = (size_t)src_slot * slot_bytes + (size_t)h * ck->nb[2]; + const size_t head_dst = (size_t)dst_slot * slot_bytes + (size_t)h * ck->nb[2]; cudaMemcpyAsync((char *)ck->data + head_dst, (const char *)ck->data + head_src, slot_bytes, cudaMemcpyDeviceToDevice, stream); diff --git a/dflash/test/test_generate.cpp b/dflash/test/test_generate.cpp index 68be0fe8..dfbfdcd8 100644 --- a/dflash/test/test_generate.cpp +++ b/dflash/test/test_generate.cpp @@ -18,6 +18,8 @@ #include "ggml-backend.h" #include "ggml-cuda.h" +#include + #include #include #include @@ -49,21 +51,19 @@ struct StepGraph { ggml_gallocr_t alloc = nullptr; ggml_tensor * inp_embed = nullptr; ggml_tensor * positions = nullptr; + ggml_tensor * attn_mask = nullptr; ggml_tensor * logits = nullptr; + ggml_tensor * argmax_out = nullptr; }; -// Build a fresh single-token forward graph. We rebuild per step so that -// `kv_start` updates drive the correct KV cache slot. The graph is cheap to -// rebuild — all the weights + KV cache stay persistent. -static bool build_step_graph( +static bool build_reusable_graph( StepGraph & sg, const TargetWeights & w, TargetCache & cache, ggml_backend_t backend, - int kv_start + int max_ctx ) { - if (sg.alloc) { ggml_gallocr_free(sg.alloc); sg.alloc = nullptr; } - if (sg.ctx) { ggml_free(sg.ctx); sg.ctx = nullptr; } + if (sg.ctx) { ggml_free(sg.ctx); sg.ctx = nullptr; } ggml_init_params ip{}; ip.mem_size = 256 * 1024 * 1024; @@ -73,30 +73,40 @@ static bool build_step_graph( if (!sg.ctx) return false; const int n_tokens = 1; - const int hidden = DFLASH27B_TARGET_HIDDEN; + const int hidden = w.n_embd; + sg.inp_embed = ggml_new_tensor_3d(sg.ctx, GGML_TYPE_F32, hidden, n_tokens, 1); sg.positions = ggml_new_tensor_1d(sg.ctx, GGML_TYPE_I32, 4 * n_tokens); + sg.attn_mask = ggml_new_tensor_2d(sg.ctx, GGML_TYPE_F16, max_ctx, n_tokens); ggml_set_input(sg.inp_embed); ggml_set_input(sg.positions); + ggml_set_input(sg.attn_mask); sg.gf = ggml_new_graph_custom(sg.ctx, 8192, false); QwenGraphInputs gi{}; gi.inp_embed = sg.inp_embed; gi.positions = sg.positions; - gi.attn_mask = nullptr; // n_tokens==1, no mask needed + gi.attn_mask = sg.attn_mask; gi.n_tokens = n_tokens; - gi.kv_start = kv_start; + gi.kv_start = max_ctx - 1; gi.capture_layers = false; QwenGraphOutputs go = build_qwen35_graph(sg.ctx, sg.gf, w, cache, gi); if (!go.logits) return false; - ggml_set_output(go.logits); - ggml_build_forward_expand(sg.gf, go.logits); + + sg.argmax_out = ggml_argmax(sg.ctx, go.logits); + ggml_set_output(sg.argmax_out); + ggml_build_forward_expand(sg.gf, sg.argmax_out); sg.logits = go.logits; - sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); - return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); + if (!sg.alloc) { + sg.alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); + } + 
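+    // Note: reusing one gallocr (reserve, then alloc) keeps the graph's tensor addresses
+    // stable from step to step; combined with the fixed kv_start/write-slot scheme this is
+    // what allows the backend to capture the step as a CUDA graph on the first run and
+    // replay it afterwards (see the "CUDA graph captures on first step" note in the
+    // generation loop below).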
+    if (!ggml_gallocr_reserve(sg.alloc, sg.gf)) return false;
+    if (!ggml_gallocr_alloc_graph(sg.alloc, sg.gf)) return false;
+
+    return true;
 }
 
 static std::vector<int32_t> read_int32_file(const std::string & path) {
@@ -116,6 +126,22 @@ static bool write_int32_file(const std::string & path, const std::vector<int32_t>
+
+static void copy_kv_slot(
+    const std::vector<ggml_tensor *> & cache_tensors,
+    int src_slot, int dst_slot, int max_ctx, int n_heads
+) {
+    for (auto * ct : cache_tensors) {
+        const size_t pos_bytes = ct->nb[1];
+        const size_t head_stride = ct->nb[2];
+        const char * base = (const char *) ct->data;
+        cudaMemcpy2D(
+            (void *)(base + (size_t)dst_slot * pos_bytes), head_stride,
+            (const void *)(base + (size_t)src_slot * pos_bytes), head_stride,
+            pos_bytes, n_heads, cudaMemcpyDeviceToDevice
+        );
+    }
+}
+
 int main(int argc, char ** argv) {
     if (argc < 5) {
         std::fprintf(stderr,
@@ -144,7 +170,6 @@ int main(int argc, char ** argv) {
 #endif
     };
 
-    // ── Load model and cache
     ggml_backend_t backend = ggml_backend_cuda_init(0);
     if (!backend) { std::fprintf(stderr, "cuda init failed\n"); return 1; }
 
@@ -173,22 +198,25 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    const int n_full_attn = w.n_layer / w.full_attention_interval;
+    const int n_head_kv = w.n_head_kv;
+
+    StepGraph sg;
+    if (!build_reusable_graph(sg, w, cache, backend, max_ctx)) {
+        std::fprintf(stderr, "build_reusable_graph failed\n");
+        return 1;
+    }
+
+    std::vector<ggml_fp16_t> mask_buf(max_ctx);
     std::vector<int32_t> all_tokens = prompt;
     all_tokens.reserve(prompt.size() + n_gen);
-
-    const int hidden = DFLASH27B_TARGET_HIDDEN;
+    const int hidden = w.n_embd;
    std::vector<float> embed_buf(hidden);
+    int32_t argmax_result = 0;
 
-    StepGraph sg;
-
-    // ── Helper: run one step given current token + absolute position
-    auto run_step = [&](int32_t tok, int pos) -> int32_t {
-        if (!build_step_graph(sg, w, cache, backend, pos)) {
-            std::fprintf(stderr, "build_step_graph failed at pos=%d\n", pos);
-            std::exit(1);
-        }
+    const int write_slot = max_ctx - 1;
 
-        // CPU embed
+    auto run_step_reuse = [&](int32_t tok, int pos) -> int32_t {
         int32_t ids[1] = { tok };
         if (!w.embedder.embed(ids, 1, embed_buf.data())) {
            std::fprintf(stderr, "embed failed tok=%d\n", tok);
@@ -197,53 +225,78 @@ int main(int argc, char ** argv) {
         ggml_backend_tensor_set(sg.inp_embed, embed_buf.data(), 0,
                                 sizeof(float) * embed_buf.size());
 
-        // M-RoPE positions: 4 copies of pos
         int32_t p4[4] = { pos, pos, pos, pos };
         ggml_backend_tensor_set(sg.positions, p4, 0, sizeof(int32_t) * 4);
 
+        for (int i = 0; i < max_ctx; i++) {
+            bool attend = (i < pos) || (i == write_slot);
+            mask_buf[i] = ggml_fp32_to_fp16(attend ? 0.0f : -INFINITY);
+        }
+        ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0,
+                                sizeof(ggml_fp16_t) * max_ctx);
+
         auto st = ggml_backend_graph_compute(backend, sg.gf);
         if (st != GGML_STATUS_SUCCESS) {
            std::fprintf(stderr, "compute failed at pos=%d (%d)\n", pos, (int)st);
            std::exit(1);
         }
 
-        // argmax on logits
-        const int vocab = DFLASH27B_TARGET_VOCAB;
-        std::vector<float> logits(vocab);
-        ggml_backend_tensor_get(sg.logits, logits.data(), 0, sizeof(float) * vocab);
-        int best = 0;
-        float bv = logits[0];
-        for (int i = 1; i < vocab; i++) {
-            if (logits[i] > bv) { bv = logits[i]; best = i; }
-        }
-        return best;
+        ggml_backend_tensor_get(sg.argmax_out, &argmax_result, 0, sizeof(int32_t));
+
+        copy_kv_slot(cache.attn_k, write_slot, pos, max_ctx, n_head_kv);
+        copy_kv_slot(cache.attn_v, write_slot, pos, max_ctx, n_head_kv);
+
+        return argmax_result;
     };
 
     // ── Prefill: feed prompt tokens one at a time (decode-only mode).
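+    // With the reusable graph, prefill is still one full forward per prompt token; each
+    // step also copies the freshly written K/V from write_slot back to the token's real
+    // position (copy_kv_slot above), so the committed cache layout matches plain decode.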
- // We throw away the logits for all prompt tokens except the last one. int next = -1; for (int i = 0; i < (int)prompt.size(); i++) { - next = run_step(prompt[i], i); + next = run_step_reuse(prompt[i], i); } std::printf("[prefill] last-token argmax=%d\n", next); - // ── Generation loop + // ── Generation loop (CUDA graph captures on first step) auto t_start = std::chrono::steady_clock::now(); + double total_compute = 0; int gen_start_pos = (int)prompt.size(); for (int g = 0; g < n_gen; g++) { int32_t tok = next; all_tokens.push_back(tok); stream_emit(tok); - next = run_step(tok, gen_start_pos + g); + + auto t0 = std::chrono::steady_clock::now(); + + if (!w.embedder.embed(&tok, 1, embed_buf.data())) return 1; + ggml_backend_tensor_set(sg.inp_embed, embed_buf.data(), 0, sizeof(float) * embed_buf.size()); + int32_t p4[4] = { gen_start_pos + g, gen_start_pos + g, gen_start_pos + g, gen_start_pos + g }; + ggml_backend_tensor_set(sg.positions, p4, 0, sizeof(int32_t) * 4); + for (int i = 0; i < max_ctx; i++) { + bool attend = (i < gen_start_pos + g) || (i == write_slot); + mask_buf[i] = ggml_fp32_to_fp16(attend ? 0.0f : -INFINITY); + } + ggml_backend_tensor_set(sg.attn_mask, mask_buf.data(), 0, sizeof(ggml_fp16_t) * max_ctx); + + ggml_backend_graph_compute(backend, sg.gf); + + ggml_backend_tensor_get(sg.argmax_out, &argmax_result, 0, sizeof(int32_t)); + next = argmax_result; + + copy_kv_slot(cache.attn_k, write_slot, gen_start_pos + g, max_ctx, n_head_kv); + copy_kv_slot(cache.attn_v, write_slot, gen_start_pos + g, max_ctx, n_head_kv); + + auto t1 = std::chrono::steady_clock::now(); + total_compute += std::chrono::duration(t1 - t0).count(); } auto t_end = std::chrono::steady_clock::now(); double secs = std::chrono::duration(t_end - t_start).count(); double tps = n_gen / std::max(1e-9, secs); - // Also push the final next token so downstream sees it all_tokens.push_back(next); std::printf("[gen] %d new tokens in %.3f s -> %.2f tok/s\n", n_gen, secs, tps); + std::printf("[gen] compute=%.3f s (%.1f%% of total)\n", + total_compute, 100.0 * total_compute / std::max(1e-12, secs)); std::printf("[gen] tokens: "); for (int i = 0; i < n_gen; i++) std::printf("%d ", all_tokens[prompt.size() + i]); std::printf("\n");