@@ -1542,6 +1542,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.dec_start_token_id = dec_start_token_id;
                 }

+                hparams.dec_n_layer = hparams.n_layer;
+                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                 switch (hparams.n_layer) {
                     case 6: type = LLM_TYPE_60M; break; // t5-small
                     case 8: type = LLM_TYPE_80M; break; // flan-t5-small
@@ -4414,6 +4417,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                     }

+                    // n_layer: number of encoder_layers
+                    // dec_n_layer: number of decoder_layers
+                    const int dec_n_layer = hparams.dec_n_layer;
+                    if (dec_n_layer > n_layer) {
+                        layers.resize(dec_n_layer);
+                    }
+
+                    // load encoder layers
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

@@ -4429,6 +4440,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                         layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                    }
+
+                    // load decoder layers
+                    for (int i = 0; i < dec_n_layer; ++i) {
+                        auto & layer = layers[i];

                         layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                         layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@@ -13509,7 +13525,9 @@ struct llm_build_t5_dec : public llm_graph_context {

         ggml_tensor * inp_out_ids = build_inp_out_ids();

-        for (int il = 0; il < n_layer; ++il) {
+        const int64_t dec_n_layer = hparams.dec_n_layer;
+
+        for (int il = 0; il < dec_n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
@@ -13600,7 +13618,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }

-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == dec_n_layer - 1 && inp_out_ids) {
                 cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
@@ -13621,8 +13639,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_PAR  : LLM_FFN_SEQ,
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                    model.layers[il].ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ,
                     il);
                 cb(cur, "ffn_out", il);
             }
0 commit comments