Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ endif()
# Hardcoded for CUDA. No libllama, no BLAS, no Metal, no Vulkan.

set(GGML_CUDA ON CACHE BOOL "" FORCE)
set(GGML_CUDA_GRAPHS ON CACHE BOOL "" FORCE)
set(GGML_BACKEND_DL OFF CACHE BOOL "" FORCE)
set(GGML_METAL OFF CACHE BOOL "" FORCE)
set(GGML_VULKAN OFF CACHE BOOL "" FORCE)
Expand Down Expand Up @@ -137,6 +138,26 @@ if(DFLASH27B_TESTS)
target_include_directories(test_generate PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_generate PRIVATE dflash27b ggml ggml-cuda)
endif()
# Optional smoke-test executables. Each one is built only when its source
# file is present, includes the library headers, and links the core library
# plus the ggml CUDA backend. A single foreach replaces four copy-pasted
# stanzas so the list of smoke tests can be extended in one place.
foreach(SMOKE_TEST
    smoke_load_moe_draft
    smoke_moe_target_forward
    smoke_moe_ffn
    smoke_load_moe_target)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/${SMOKE_TEST}.cpp")
        add_executable(${SMOKE_TEST} test/${SMOKE_TEST}.cpp)
        target_include_directories(${SMOKE_TEST} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
        target_link_libraries(${SMOKE_TEST} PRIVATE dflash27b ggml ggml-cuda)
    endif()
endforeach()
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_dflash.cpp")
add_executable(test_dflash test/test_dflash.cpp)
target_include_directories(test_dflash PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
Expand Down
2 changes: 1 addition & 1 deletion dflash/deps/llama.cpp
148 changes: 109 additions & 39 deletions dflash/src/gguf_target_loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,56 +202,89 @@ bool load_target_gguf(const std::string & path,
return false;
}

// Validate arch + the dimensions we hardcode everywhere.
// Validate arch — accept both qwen35 (dense) and qwen35moe (MoE)
std::string arch;
{
int64_t arch_id = gguf_find_key(gctx, "general.architecture");
if (arch_id < 0) {
set_last_error("missing general.architecture");
gguf_free(gctx);
return false;
}
const char * arch = gguf_get_val_str(gctx, arch_id);
if (std::string(arch) != "qwen35") {
set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35)");
arch = gguf_get_val_str(gctx, arch_id);
if (arch != "qwen35" && arch != "qwen35moe") {
set_last_error(std::string("unexpected arch: ") + arch + " (expected qwen35 or qwen35moe)");
gguf_free(gctx);
return false;
}
}

std::string err;
const uint32_t n_embd = get_u32_or(gctx, "qwen35.embedding_length", 0);
const uint32_t n_ff = get_u32_or(gctx, "qwen35.feed_forward_length", 0);
const uint32_t n_layer= get_u32_or(gctx, "qwen35.block_count", 0);
const uint32_t n_head = get_u32_or(gctx, "qwen35.attention.head_count",0);
const uint32_t n_headkv=get_u32_or(gctx, "qwen35.attention.head_count_kv",0);
const uint32_t kl = get_u32_or(gctx, "qwen35.attention.key_length", 0);
const uint32_t vl = get_u32_or(gctx, "qwen35.attention.value_length", 0);
const uint32_t fai = get_u32_or(gctx, "qwen35.full_attention_interval",0);
const uint32_t ssm_conv = get_u32_or(gctx, "qwen35.ssm.conv_kernel", 0);
const uint32_t ssm_inner = get_u32_or(gctx, "qwen35.ssm.inner_size", 0);
const uint32_t ssm_state = get_u32_or(gctx, "qwen35.ssm.state_size", 0);
const uint32_t ssm_dt = get_u32_or(gctx, "qwen35.ssm.time_step_rank",0);
const uint32_t ssm_grp = get_u32_or(gctx, "qwen35.ssm.group_count", 0);

if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 ||
kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 ||
ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 ||
ssm_dt != 48 || ssm_grp != 16) {
char buf[512];
std::snprintf(buf, sizeof(buf),
"unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u "
"kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}",
n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai,
ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp);
set_last_error(buf);
gguf_free(gctx);
return false;
const uint32_t n_embd = get_u32_or(gctx, (arch + ".embedding_length").c_str(), 0);
const uint32_t n_ff = get_u32_or(gctx, (arch + ".feed_forward_length").c_str(), 0);
const uint32_t n_layer= get_u32_or(gctx, (arch + ".block_count").c_str(), 0);
const uint32_t n_head = get_u32_or(gctx, (arch + ".attention.head_count").c_str(),0);
const uint32_t n_headkv=get_u32_or(gctx, (arch + ".attention.head_count_kv").c_str(),0);
const uint32_t kl = get_u32_or(gctx, (arch + ".attention.key_length").c_str(), 0);
const uint32_t vl = get_u32_or(gctx, (arch + ".attention.value_length").c_str(), 0);
const uint32_t fai = get_u32_or(gctx, (arch + ".full_attention_interval").c_str(),0);
const uint32_t ssm_conv = get_u32_or(gctx, (arch + ".ssm.conv_kernel").c_str(), 0);
const uint32_t ssm_inner = get_u32_or(gctx, (arch + ".ssm.inner_size").c_str(), 0);
const uint32_t ssm_state = get_u32_or(gctx, (arch + ".ssm.state_size").c_str(), 0);
const uint32_t ssm_dt = get_u32_or(gctx, (arch + ".ssm.time_step_rank").c_str(),0);
const uint32_t ssm_grp = get_u32_or(gctx, (arch + ".ssm.group_count").c_str(), 0);

// MoE fields (zero for dense qwen35)
const uint32_t n_expert = get_u32_or(gctx, (arch + ".expert_count").c_str(), 0);
const uint32_t n_expert_used = get_u32_or(gctx, (arch + ".expert_used_count").c_str(), 0);
const uint32_t expert_ff = get_u32_or(gctx, (arch + ".expert_feed_forward_length").c_str(), 0);
const uint32_t shared_ff = get_u32_or(gctx, (arch + ".expert_shared_feed_forward_length").c_str(), 0);

const bool is_moe = (arch == "qwen35moe");

if (is_moe) {
// Validate qwen35moe hparams: 40 layers, 2048 hidden, 16 heads, 2 kv heads,
// 256 experts, 8 active, 512 expert ff, 512 shared ff
if (n_embd != 2048 || n_layer != 40 || n_head != 16 || n_headkv != 2 ||
kl != 256 || vl != 256 || fai != 4 ||
ssm_conv != 4 || ssm_inner != 4096 || ssm_state != 128 ||
ssm_dt != 32 || ssm_grp != 16 ||
n_expert != 256 || n_expert_used != 8 ||
expert_ff != 512 || shared_ff != 512) {
char buf[512];
std::snprintf(buf, sizeof(buf),
"unexpected qwen35moe hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u "
"kl=%u vl=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u} "
"n_expert=%u n_expert_used=%u expert_ff=%u shared_ff=%u",
n_embd, n_layer, n_head, n_headkv, kl, vl, fai,
ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp,
n_expert, n_expert_used, expert_ff, shared_ff);
set_last_error(buf);
gguf_free(gctx);
return false;
}
} else {
// Validate qwen35 dense hparams (unchanged)
if (n_embd != 5120 || n_layer != 64 || n_head != 24 || n_headkv != 4 ||
kl != 256 || vl != 256 || n_ff != 17408 || fai != 4 ||
ssm_conv != 4 || ssm_inner != 6144 || ssm_state != 128 ||
ssm_dt != 48 || ssm_grp != 16) {
char buf[512];
std::snprintf(buf, sizeof(buf),
"unexpected hparams: n_embd=%u n_layer=%u n_head=%u n_head_kv=%u "
"kl=%u vl=%u n_ff=%u fai=%u ssm{conv=%u inner=%u state=%u dt=%u grp=%u}",
n_embd, n_layer, n_head, n_headkv, kl, vl, n_ff, fai,
ssm_conv, ssm_inner, ssm_state, ssm_dt, ssm_grp);
set_last_error(buf);
gguf_free(gctx);
return false;
}
}

// rope dimension_sections (array of 4 uint32)
int rope_sections[4] = {0, 0, 0, 0};
{
int64_t rid = gguf_find_key(gctx, "qwen35.rope.dimension_sections");
int64_t rid = gguf_find_key(gctx, (arch + ".rope.dimension_sections").c_str());
if (rid >= 0) {
size_t n = gguf_get_arr_n(gctx, rid);
if (n >= 4) {
Expand All @@ -265,7 +298,7 @@ bool load_target_gguf(const std::string & path,
out.backend = backend;
out.n_layer = (int)n_layer;
out.n_embd = (int)n_embd;
out.n_ff = (int)n_ff;
out.n_ff = is_moe ? 0 : (int)n_ff;
out.n_head = (int)n_head;
out.n_head_kv = (int)n_headkv;
out.n_embd_head_k = (int)kl;
Expand All @@ -277,6 +310,10 @@ bool load_target_gguf(const std::string & path,
out.ssm_d_state= (int)ssm_state;
out.ssm_dt_rank= (int)ssm_dt;
out.ssm_n_group= (int)ssm_grp;
out.n_expert = (int)n_expert;
out.n_expert_used = (int)n_expert_used;
out.expert_ff_dim = (int)expert_ff;
out.shared_ff_dim = (int)shared_ff;
out.layers.assign((size_t)n_layer, TargetLayer{});

// ── 2. Wire our layer pointers to tensors inside meta_ctx ─────────
Expand All @@ -303,15 +340,48 @@ bool load_target_gguf(const std::string & path,
// Always-present tensors
L.attn_norm = fnd("attn_norm.weight");
L.attn_post_norm = fnd("post_attention_norm.weight");

// Dense FFN tensors (qwen35) or MoE tensors (qwen35moe)
L.w_gate = fnd("ffn_gate.weight");
L.w_up = fnd("ffn_up.weight");
L.w_down = fnd("ffn_down.weight");
if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) {
char b[128];
std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il);
set_last_error(b);
gguf_free(gctx);
return false;

// MoE FFN tensors (qwen35moe only; null for dense)
L.ffn_gate_inp = fnd("ffn_gate_inp.weight");
L.ffn_up_exps = fnd("ffn_up_exps.weight");
L.ffn_gate_exps = fnd("ffn_gate_exps.weight");
L.ffn_down_exps = fnd("ffn_down_exps.weight");
L.ffn_up_shexp = fnd("ffn_up_shexp.weight");
L.ffn_gate_shexp = fnd("ffn_gate_shexp.weight");
L.ffn_down_shexp = fnd("ffn_down_shexp.weight");
L.ffn_gate_inp_shexp = fnd("ffn_gate_inp_shexp.weight");

if (!is_moe) {
// Dense model: gate/up/down required
if (!L.attn_norm || !L.attn_post_norm || !L.w_gate || !L.w_up || !L.w_down) {
char b[128];
std::snprintf(b, sizeof(b), "layer %d: missing shared tensor", il);
set_last_error(b);
gguf_free(gctx);
return false;
}
} else {
// MoE model: attn_norm + post_norm + expert tensors required
if (!L.attn_norm || !L.attn_post_norm) {
char b[128];
std::snprintf(b, sizeof(b), "layer %d: missing attn_norm/post_norm", il);
set_last_error(b);
gguf_free(gctx);
return false;
}
if (!L.ffn_gate_inp || !L.ffn_up_exps || !L.ffn_gate_exps || !L.ffn_down_exps ||
!L.ffn_up_shexp || !L.ffn_gate_shexp || !L.ffn_down_shexp || !L.ffn_gate_inp_shexp) {
char b[256];
std::snprintf(b, sizeof(b), "layer %d: missing MoE tensors", il);
set_last_error(b);
gguf_free(gctx);
return false;
}
}

// Full-attention tensors (only on layers where (il+1)%fai == 0,
Expand Down
56 changes: 52 additions & 4 deletions dflash/src/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,22 @@ struct TargetLayer {
ggml_tensor * attn_norm = nullptr; // [hidden]
ggml_tensor * attn_post_norm = nullptr; // [hidden] (post-block norm before FFN)
ggml_tensor * ffn_norm = nullptr; // [hidden]

// Dense FFN (non-null for qwen35 dense target)
ggml_tensor * w_gate = nullptr; // [hidden, intermediate]
ggml_tensor * w_up = nullptr; // [hidden, intermediate]
ggml_tensor * w_down = nullptr; // [intermediate, hidden]

// MoE FFN (non-null for qwen35moe target)
ggml_tensor * ffn_gate_inp = nullptr; // [n_embd, n_expert] — router
ggml_tensor * ffn_up_exps = nullptr; // [expert_ff, n_embd, n_expert]
ggml_tensor * ffn_gate_exps = nullptr; // [expert_ff, n_embd, n_expert]
ggml_tensor * ffn_down_exps = nullptr; // [n_embd, expert_ff, n_expert]
ggml_tensor * ffn_up_shexp = nullptr; // [shared_ff, n_embd] — shared expert
ggml_tensor * ffn_gate_shexp = nullptr; // [shared_ff, n_embd]
ggml_tensor * ffn_down_shexp = nullptr; // [n_embd, shared_ff]
ggml_tensor * ffn_gate_inp_shexp = nullptr; // [n_embd] — shared expert gate

// Full-attention block (non-null for layers where (il+1) % 4 == 0)
ggml_tensor * wq = nullptr; // [hidden, q_dim]
ggml_tensor * wk = nullptr; // [hidden, kv_dim]
Expand Down Expand Up @@ -127,6 +139,13 @@ struct TargetWeights {
int ssm_d_state = 128;
int ssm_dt_rank = 48;
int ssm_n_group = 16;

// MoE-specific (zero for dense models)
int n_expert = 0;
int n_expert_used = 0;
int expert_ff_dim = 0;
int shared_ff_dim = 0;
float expert_weights_scale = 1.0f;
};

// Load a Q4_K_M target model from a GGUF file on disk.
Expand All @@ -139,6 +158,24 @@ void free_target_weights(TargetWeights & w);

// ─── Draft weights (z-lab DFlash, bf16) ───────────────────────────

// Hyperparameters of the draft model. Defaults mirror the DFLASH27B_*
// compile-time constants; the loader may overwrite them from file metadata.
struct DraftHparams {
    int n_layer = DFLASH27B_DRAFT_LAYERS;                  // number of draft transformer layers
    int hidden = DFLASH27B_TARGET_HIDDEN;                  // hidden (embedding) width, shared with the target
    int n_head = DFLASH27B_TARGET_N_HEADS;                 // attention query heads
    int n_kv_head = DFLASH27B_TARGET_N_KV_HEADS;           // attention key/value heads (GQA)
    int head_dim = DFLASH27B_TARGET_HEAD_DIM;              // per-head dimension
    int intermediate = DFLASH27B_TARGET_INTERMEDIATE;      // FFN intermediate width
    int n_target_layers = DFLASH27B_DRAFT_N_TARGET_LAYERS; // target layers whose hidden states feed the draft
    int block_size = DFLASH27B_DRAFT_BLOCK_SIZE;           // tokens drafted per speculative block
    int mask_token_id = DFLASH27B_DRAFT_MASK_TOKEN_ID;     // token id used for masked draft positions
    float rope_theta = DFLASH27B_ROPE_THETA;               // RoPE base frequency
    float rms_eps = DFLASH27B_RMS_EPS;                     // RMSNorm epsilon
    // RoPE scaling (YaRN-style) parameters — presumably left at these
    // defaults when the model uses unscaled RoPE; TODO confirm in loader.
    float rope_factor = 1.0f;
    float rope_beta_fast = 0.0f;
    float rope_beta_slow = 0.0f;
    int rope_orig_ctx = 0;                                 // original training context length (0 = unset)
};

struct DraftLayer {
ggml_tensor * attn_norm;
ggml_tensor * ffn_norm;
Expand All @@ -158,10 +195,11 @@ struct DraftWeights {
ggml_backend_t backend = nullptr;
ggml_backend_buffer_t buf = nullptr;

ggml_tensor * fc = nullptr; // [5*hidden, hidden]
ggml_tensor * hidden_norm = nullptr; // [hidden]
std::vector<DraftLayer> layers; // size = 5
ggml_tensor * out_norm = nullptr; // [hidden]
DraftHparams hparams;
ggml_tensor * fc = nullptr;
ggml_tensor * hidden_norm = nullptr;
std::vector<DraftLayer> layers;
ggml_tensor * out_norm = nullptr;
};

bool load_draft_safetensors(const std::string & path,
Expand Down Expand Up @@ -310,6 +348,16 @@ QwenGraphOutputs build_qwen35_graph(
TargetCache & cache,
const QwenGraphInputs & in);

// MoE FFN forward pass (qwen35moe). Computes expert routing, per-expert
// SwiGLU, shared expert with sigmoid gating, and returns the combined output.
// Shape: [n_embd, n_tokens] f32.
//
// ctx — graph-building ggml context the new nodes are allocated in
// gf  — compute graph; presumably nodes are expanded into it — TODO confirm
// cur — input activations, [n_embd, n_tokens] f32
// L   — layer weights; MoE tensor members (ffn_*_exps / *_shexp) must be set
// w   — model-wide hparams (n_expert, n_expert_used, expert_weights_scale, …)
ggml_tensor * build_moe_ffn(
    ggml_context * ctx,
    ggml_cgraph * gf,
    ggml_tensor * cur,
    const TargetLayer & L,
    const TargetWeights & w);

// Build a single-layer forward graph. Mirrors build_qwen35_graph but processes
// only one layer, taking `inp` as the input activation and returning the output.
// Used by layer-segmented prefill to iterate layers as the outer loop.
Expand Down
Loading