From 89eaf5036a07b394bea4c7b35f97d171bee2e87b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 21:03:24 +0200
Subject: [PATCH 01/45] server : add "tokens" output

ggml-ci
---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5ed4e8d274428..57db582d72754 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -687,8 +687,6 @@ struct server_task_result_cmpl_partial : server_task_result {
            json second_ret = json{
                {"choices", json::array({json{{"finish_reason", nullptr},
                                              {"index", 0},
-                                             {"delta", json {
-                                                 {"content", content}}}
                                              }})},
                {"created", t},
                {"id", oaicompat_cmpl_id},
@@ -704,6 +702,7 @@ struct server_task_result_cmpl_partial : server_task_result {
                {"delta", json {
                    {"content", content},
+                   {"tokens", tokens}
                }},
            }});
        }
@@ -1017,6 +1016,7 @@ struct server_slot {
        n_prompt_tokens  = 0;
        last_nl_pos      = 0;
        generated_text   = "";
+       generated_tokens = {};
        has_new_line     = false;
        truncated        = false;
        stop             = STOP_TYPE_NONE;
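With this patch each streamed delta carries the raw token ids next to the decoded text. A minimal consumer sketch, assuming nlohmann::json (which the server already uses) and the chunk layout shown in the diff above; read_delta is a hypothetical helper, not server code:

    #include <nlohmann/json.hpp>
    #include <string>
    #include <vector>

    using json = nlohmann::json;

    // pull both the text delta and the raw token ids out of one streamed chunk
    static void read_delta(const json & chunk, std::string & text, std::vector<int32_t> & toks) {
        const json & delta = chunk.at("choices").at(0).at("delta");
        if (delta.contains("content") && delta["content"].is_string()) {
            text += delta["content"].get<std::string>();
        }
        if (delta.contains("tokens")) {
            for (const auto & t : delta["tokens"]) {
                toks.push_back(t.get<int32_t>());
            }
        }
    }
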
From 06e85401b0f265f8ffa76144d31c1c7fdca45b6b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 10:56:20 +0200
Subject: [PATCH 02/45] server : output embeddings for all tokens when pooling
 = none

ggml-ci
---
 examples/server/tests/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index 277125e88b534..da95c830b036d 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -275,7 +275,7 @@ def tinyllama2() -> ServerProcess:
        return server

    @staticmethod
-    def bert_bge_small() -> ServerProcess:
+    def bert_bge_small(pooling = 'last') -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
@@ -286,6 +286,7 @@ def bert_bge_small() -> ServerProcess:
        server.n_slots = 2
        server.seed = 42
        server.server_embeddings = True
+        server.pooling = pooling
        return server

    @staticmethod

From 1b18b2d7b0d55da16b3c1b1d8abd94e5be85bf91 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 11:45:18 +0200
Subject: [PATCH 03/45] server : be explicit about the pooling type in the
 tests

ggml-ci
---
 examples/server/tests/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index da95c830b036d..277125e88b534 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -275,7 +275,7 @@ def tinyllama2() -> ServerProcess:
        return server

    @staticmethod
-    def bert_bge_small(pooling = 'last') -> ServerProcess:
+    def bert_bge_small() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
@@ -286,7 +286,6 @@ def bert_bge_small() -> ServerProcess:
        server.n_slots = 2
        server.seed = 42
        server.server_embeddings = True
-        server.pooling = pooling
        return server

    @staticmethod

From e65556f1748451847b1b84382348e150a200bb34 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 13:36:32 +0200
Subject: [PATCH 04/45] server : do not normalize embeddings when there is no
 pooling

ggml-ci
---
 examples/server/tests/unit/test_embedding.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
index e32d745829605..af6d148537452 100644
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
@@ -92,6 +92,10 @@ def test_embedding_pooling_none():
    for x in res.body[0]['embedding']:
        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON

+    # make sure embedding vector is not normalized
+    for x in res.body[0]['embedding']:
+        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON
+

def test_embedding_pooling_none_oai():
    global server
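The test above checks the new behavior end to end: with pooling "none" the rows are raw per-token hidden states and are not L2-normalized. The same invariant can be checked on the client side; a sketch, assuming emb holds one row of the returned embedding matrix:

    #include <cmath>
    #include <vector>

    // with pooling "none" the rows are raw hidden states, so their L2 norm
    // should generally differ from 1 (normalization is skipped)
    static bool looks_normalized(const std::vector<float> & emb, float eps = 1e-3f) {
        double sum = 0.0;
        for (float v : emb) {
            sum += (double) v * v;
        }
        return std::fabs(std::sqrt(sum) - 1.0) < eps;
    }
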
From f16996515861463b6a7dd1c4ce85aee52e74b96a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 14:40:03 +0200
Subject: [PATCH 05/45] llama : add OuteTTS support (wip)

---
 convert_hf_to_gguf.py            |  26 ++++++
 examples/tts/convert_pt_to_hf.py | 141 +++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py        |  65 ++++++++++++++
 gguf-py/gguf/tensor_mapping.py   |  83 ++++++++++++++++++
 4 files changed, 315 insertions(+)
 create mode 100644 examples/tts/convert_pt_to_hf.py

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9dc1673bc2c06..2a9ed6a71fcc4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -689,6 +689,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:

        return res
        # Marker: End get_vocab_base_pre

+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
@@ -2027,6 +2030,29 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:

                yield name, data

+
+@Model.register("OuteTTSVocoder")
+class OuteTTSVocoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.OUTETTS_VOC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.block_count)
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
new file mode 100644
index 0000000000000..c77aee6a8c0e2
--- /dev/null
+++ b/examples/tts/convert_pt_to_hf.py
@@ -0,0 +1,141 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTS vocoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten

import torch
import json
import os
import sys
import re

from safetensors.torch import save_file

# change path to script dir
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# default
model_path = './model.pt'

# read from CLI
if len(sys.argv) > 1:
    model_path = sys.argv[1]

# get the directory of the input model
path_dst = os.path.dirname(model_path)

print(f"Loading model from {model_path}")

model = torch.load(model_path, map_location='cpu')

#print(model)

# print all keys
for key in model.keys():
    print(key)
    if key == 'hyper_parameters':
        #print(model[key])
        # dump as json pretty
        print(json.dumps(model[key], indent=4))
    #if key != 'state_dict' and key != 'optimizer_states':
    #    print(model[key])

# Check if the loaded model is a state_dict or a model instance
if isinstance(model, torch.nn.Module):
    state_dict = model.state_dict()
else:
    state_dict = model

# Print the structure of the state_dict to understand its format
print("State dictionary keys:")
for key in state_dict.keys():
    print(key)

# Ensure the state_dict is flat and contains only torch.Tensor objects
def flatten_state_dict(state_dict, parent_key='', sep='.'):
    items = []
    items_new = []

    for k, v in state_dict.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, torch.Tensor):
            items.append((new_key, v))
        elif isinstance(v, dict):
            items.extend(flatten_state_dict(v, new_key, sep=sep).items())
        return dict(items)

    size_total_mb = 0

    for key, value in list(items):
        # keep only what we need for inference
        if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
           not key.startswith('state_dict.backbone.') and \
           not key.startswith('state_dict.head.'):
            print('Skipping key: ', key)
            continue

        new_key = key

        new_key = new_key.replace('state_dict.', '')

        # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight"
        if new_key.startswith("backbone.pos_net."):
            match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key)
            if match:
                new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}"

        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

        size_total_mb += size_mb

        #print(key, '->', new_key, ': ', value)
        #print(key, '->', new_key)

        items_new.append((new_key, value))

    print(f"Total size: {size_total_mb:8.2f} MB")

    return dict(items_new)

flattened_state_dict = flatten_state_dict(state_dict)


# Convert the model to the safetensors format
output_path = path_dst + '/model.safetensors'
save_file(flattened_state_dict, output_path)

print(f"Model has been successfully converted and saved to {output_path}")

# Calculate the total size of the .safetensors file
total_size = os.path.getsize(output_path)

# Create the weight map
weight_map = {
    "model.safetensors": ["*"]  # Assuming all weights are in one file
}

# Create metadata for the index.json file
metadata = {
    "total_size": total_size,
    "weight_map": weight_map
}

# Save the metadata to index.json
index_path = path_dst + '/index.json'
with open(index_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata has been saved to {index_path}")

config = {
    "architectures": [
        "OuteTTSVocoder"
    ],
    "num_hidden_layers": 12
}

with open(path_dst + '/config.json', 'w') as f:
    json.dump(config, f, indent=4)

print(f"Config has been saved to {path_dst + '/config.json'}")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c2c7cad14e500..37d8bce4778ae 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -261,11 +261,13 @@ class MODEL_ARCH(IntEnum):
    GRANITE     = auto()
    GRANITE_MOE = auto()
    CHAMELEON   = auto()
+    OUTETTS_VOC = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD       = auto()
    TOKEN_EMBD_NORM  = auto()
+    TOKEN_EMBD_SHIFT = auto()
    TOKEN_TYPES      = auto()
    POS_EMBD         = auto()
    OUTPUT           = auto()
@@ -370,6 +372,24 @@ class MODEL_TENSOR(IntEnum):
    ENC_OUTPUT_NORM = auto()
    CLS             = auto()  # classifier
    CLS_OUT         = auto()  # classifier output projection
+    CONV_NEXT_DW      = auto()
+    CONV_NEXT_NORM    = auto()
+    CONV_NEXT_SHIFT   = auto()
+    CONV_NEXT_PW1     = auto()
+    CONV_NEXT_PW2     = auto()
+    CONV_NEXT_GAMMA   = auto()
+    POS_NET_CONV1     = auto()
+    POS_NET_CONV2     = auto()
+    POS_NET_NORM      = auto()
+    POS_NET_NORM1     = auto()
+    POS_NET_NORM2     = auto()
+    POS_NET_ATTN_NORM = auto()
+    POS_NET_ATTN_Q    = auto()
+    POS_NET_ATTN_K    = auto()
+    POS_NET_ATTN_V    = auto()
+    POS_NET_ATTN_OUT  = auto()
+    QNTZ_CBOOK_EMBD   = auto()
+    HANN_WINDOW       = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -425,11 +445,13 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.GRANITE:     "granite",
    MODEL_ARCH.GRANITE_MOE: "granitemoe",
    MODEL_ARCH.CHAMELEON:   "chameleon",
+    MODEL_ARCH.OUTETTS_VOC: "outetts-voc",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD:       "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM:  "token_embd_norm",
+    MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift",
    MODEL_TENSOR.TOKEN_TYPES:      "token_types",
    MODEL_TENSOR.POS_EMBD:         "position_embd",
    MODEL_TENSOR.OUTPUT_NORM:      "output_norm",
@@ -534,6 +556,24 @@ class MODEL_TENSOR(IntEnum):
    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
    MODEL_TENSOR.CLS:             "cls",
    MODEL_TENSOR.CLS_OUT:         "cls.output",
+    MODEL_TENSOR.CONV_NEXT_DW:      "conv_next.{bid}.dw",
+    MODEL_TENSOR.CONV_NEXT_NORM:    "conv_next.{bid}.norm",
+    MODEL_TENSOR.CONV_NEXT_SHIFT:   "conv_next.{bid}.shift",
+    MODEL_TENSOR.CONV_NEXT_PW1:     "conv_next.{bid}.pw1",
+    MODEL_TENSOR.CONV_NEXT_PW2:     "conv_next.{bid}.pw2",
+    MODEL_TENSOR.CONV_NEXT_GAMMA:   "conv_next.{bid}.gamma",
+    MODEL_TENSOR.POS_NET_CONV1:     "pos_net.{bid}.conv1",
+    MODEL_TENSOR.POS_NET_CONV2:     "pos_net.{bid}.conv2",
+    MODEL_TENSOR.POS_NET_NORM:      "pos_net.{bid}.norm",
+    MODEL_TENSOR.POS_NET_NORM1:     "pos_net.{bid}.norm1",
+    MODEL_TENSOR.POS_NET_NORM2:     "pos_net.{bid}.norm2",
+    MODEL_TENSOR.POS_NET_ATTN_NORM: "pos_net.{bid}.attn_norm",
+    MODEL_TENSOR.POS_NET_ATTN_Q:    "pos_net.{bid}.attn_q",
+    MODEL_TENSOR.POS_NET_ATTN_K:    "pos_net.{bid}.attn_k",
+    MODEL_TENSOR.POS_NET_ATTN_V:    "pos_net.{bid}.attn_v",
+    MODEL_TENSOR.POS_NET_ATTN_OUT:  "pos_net.{bid}.attn_output",
+    MODEL_TENSOR.QNTZ_CBOOK_EMBD:   "qntz.cbook.{bid}.embd",
+    MODEL_TENSOR.HANN_WINDOW:       "hann_window",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1372,6 +1412,31 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.OUTETTS_VOC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_EMBD_SHIFT,
+        MODEL_TENSOR.CONV_NEXT_DW,
+        MODEL_TENSOR.CONV_NEXT_NORM,
+        MODEL_TENSOR.CONV_NEXT_SHIFT,
+        MODEL_TENSOR.CONV_NEXT_PW1,
+        MODEL_TENSOR.CONV_NEXT_PW2,
+        MODEL_TENSOR.CONV_NEXT_GAMMA,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.POS_NET_CONV1,
+        MODEL_TENSOR.POS_NET_CONV2,
+        MODEL_TENSOR.POS_NET_NORM,
+        MODEL_TENSOR.POS_NET_NORM1,
+        MODEL_TENSOR.POS_NET_NORM2,
+        MODEL_TENSOR.POS_NET_ATTN_NORM,
+        MODEL_TENSOR.POS_NET_ATTN_Q,
+        MODEL_TENSOR.POS_NET_ATTN_K,
+        MODEL_TENSOR.POS_NET_ATTN_V,
+        MODEL_TENSOR.POS_NET_ATTN_OUT,
+        MODEL_TENSOR.QNTZ_CBOOK_EMBD,
+        MODEL_TENSOR.HANN_WINDOW,
+    ],
    # TODO
}
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 573d0282ea599..39eeea434468d 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -28,6 +28,7 @@ class TensorNameMap:
            "transformer.token_embeddings",  # openelm
            "shared",                        # t5
            "rwkv.embeddings",               # rwkv
+            "backbone.embed",                # outetts
        ),

        # Token type embeddings
@@ -42,6 +43,11 @@ class TensorNameMap:
            "emb_ln",                # nomic-bert
            "transformer.norm",      # openelm
            "rwkv.blocks.0.pre_ln",  # rwkv
+            "backbone.norm.scale",   # outetts
+        ),
+
+        MODEL_TENSOR.TOKEN_EMBD_SHIFT: (
+            "backbone.norm.shift",  # outetts
        ),

        # Position embeddings
@@ -60,6 +66,7 @@ class TensorNameMap:
            "lm_head.linear",  # phi2
            "output_layer",    # chatglm
            "head",            # rwkv
+            "head.out",        # outetts
        ),

        # Output norm
@@ -80,6 +87,7 @@ class TensorNameMap:
            "transformer.norm",           # openelm
            "model.norm",                 # nemotron
            "rwkv.ln_out",                # rwkv
+            "backbone.final_layer_norm",  # outetts
        ),

        # Rope frequencies
@@ -90,6 +98,10 @@ class TensorNameMap:

        MODEL_TENSOR.ROPE_FACTORS_LONG:  (),
        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+        MODEL_TENSOR.HANN_WINDOW: (
+            "head.istft.window",  # outetts
+        ),
    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -681,6 +693,8 @@ class TensorNameMap:
            "encoder.block.{bid}.layer.1.DenseReluDense.wo",  # t5
        ),

+        ############################################################################
+        # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
        MODEL_TENSOR.ENC_OUTPUT_NORM: (
            "encoder.final_layer_norm",  # t5
        ),
@@ -693,6 +707,75 @@ class TensorNameMap:
        MODEL_TENSOR.CLS_OUT: (
            "classifier.out_proj",  # roberta
        ),
+        #############################################################################
+
+        MODEL_TENSOR.CONV_NEXT_DW: (
+            "backbone.convnext.{bid}.dwconv",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_NORM: (
+            "backbone.convnext.{bid}.norm.scale",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_SHIFT: (
+            "backbone.convnext.{bid}.norm.shift",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_PW1: (
+            "backbone.convnext.{bid}.pwconv1",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_PW2: (
+            "backbone.convnext.{bid}.pwconv2",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_GAMMA: (
+            "backbone.convnext.{bid}.gamma",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_CONV1: (
+            "backbone.pos_net.{bid}.conv1",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_CONV2: (
+            "backbone.pos_net.{bid}.conv2",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM: (
+            "backbone.pos_net.{bid}.norm",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM1: (
+            "backbone.pos_net.{bid}.norm1",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM2: (
+            "backbone.pos_net.{bid}.norm2",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_NORM: (
+            "backbone.pos_net.{bid}.norm",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_Q: (
+            "backbone.pos_net.{bid}.q",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_K: (
+            "backbone.pos_net.{bid}.k",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_V: (
+            "backbone.pos_net.{bid}.v",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_OUT: (
+            "backbone.pos_net.{bid}.proj_out",  # outetts
+        ),
+
+        MODEL_TENSOR.QNTZ_CBOOK_EMBD: (
+            "feature_extractor.encodec.quantizer.vq.layers.{bid}._codebook.embed",  # outetts
+        ),
    }

    # architecture-specific block mappings
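A note on the naming convention this patch establishes: the Python side keeps a "{bid}" placeholder in per-block tensor names, while the C++ loader in the later patches of this series keeps the same patterns with "%d" and formats the block id in. A hypothetical sketch of the correspondence (tensor_name is not a real llama.cpp function):

    #include <cstdio>
    #include <string>

    // mirror of the gguf-py template "pos_net.{bid}.conv1": llama.cpp stores
    // the pattern as "pos_net.%d.conv1" and formats the block id at load time
    static std::string tensor_name(const char * pattern, int bid, const char * suffix) {
        char buf[256];
        snprintf(buf, sizeof(buf), pattern, bid);
        return std::string(buf) + "." + suffix;
    }

    // tensor_name("pos_net.%d.conv1", 0, "weight") -> "pos_net.0.conv1.weight"
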
"vocoder model for audio generation (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index ec0e49f6f1806..c09c4eb0d3628 100644 --- a/common/common.h +++ b/common/common.h @@ -80,6 +80,7 @@ enum llama_example { LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, + LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_COUNT, }; @@ -159,6 +160,7 @@ struct common_params_sampling { struct common_params_speculative { std::vector devices; // devices to use for offloading + int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -172,6 +174,10 @@ struct common_params_speculative { std::string model = ""; // draft model for speculative decoding // NOLINT }; +struct common_params_vocoder { + std::string model = ""; // vocoder model for producing audio // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -214,8 +220,9 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_params_sampling sampling; + struct common_params_sampling sampling; struct common_params_speculative speculative; + struct common_params_vocoder vocoder; std::string model = ""; // model path // NOLINT std::string model_alias = ""; // model alias // NOLINT diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2a9ed6a71fcc4..4b51a2ad99423 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -221,17 +221,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -2050,7 +2050,8 @@ def set_vocab(self): self._set_vocab_none() def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) @Model.register("Qwen2MoeForCausalLM") diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 21b31392e81d0..66cfab2c3b796 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -51,6 +51,7 @@ else() add_subdirectory(speculative) 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2a9ed6a71fcc4..4b51a2ad99423 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -221,17 +221,17 @@ def set_gguf_parameters(self):
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        logger.info(f"gguf: embedding length = {n_embd}")
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-        logger.info(f"gguf: head count = {n_head}")
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -2050,7 +2050,8 @@ def set_vocab(self):
        self._set_vocab_none()

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 21b31392e81d0..66cfab2c3b796 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -51,6 +51,7 @@ else()
        add_subdirectory(speculative)
        add_subdirectory(speculative-simple)
        add_subdirectory(tokenize)
+        add_subdirectory(tts)
        add_subdirectory(gen-docs)
        if (NOT GGML_BACKEND_DL)
            # these examples use the backends directly and cannot be built with dynamic loading

diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt
new file mode 100644
index 0000000000000..c72bd814c3b31
--- /dev/null
+++ b/examples/tts/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-tts)
add_executable(${TARGET} tts.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index c77aee6a8c0e2..a652bae4361a2 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -84,6 +84,10 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
            if match:
                new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}"

+        # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
+        if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
+            new_key = "backbone.embedding.weight"
+
        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

@@ -132,6 +136,9 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
    "architectures": [
        "OuteTTSVocoder"
    ],
+    "hidden_size": 512,
+    "vocab_size": 4096,
+    "max_position_embeddings": 8192, # ?
    "num_hidden_layers": 12
}

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
new file mode 100644
index 0000000000000..768015a5271c2
--- /dev/null
+++ b/examples/tts/tts.cpp
@@ -0,0 +1,186 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <algorithm>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

//
// Terminal utils
//

#define SQR(X)    ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40

/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 */
static int rgb2xterm256(int r, int g, int b) {
    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
    int av, ir, ig, ib, il, qr, qg, qb, ql;
    av = r * .299 + g * .587 + b * .114 + .5;
    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
    qr = cube[(ir = UNCUBE(r))];
    qg = cube[(ig = UNCUBE(g))];
    qb = cube[(ib = UNCUBE(b))];
    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;
    return il + 0350;
}

static std::string set_xterm256_foreground(int r, int g, int b) {
    int x = rgb2xterm256(r, g, b);
    std::ostringstream oss;
    oss << "\033[38;5;" << x << "m";
    return oss.str();
}

const std::vector<std::string> k_colors = {
    set_xterm256_foreground(220,   5,  12),
    set_xterm256_foreground(232,  96,  28),
    set_xterm256_foreground(241, 147,  45),
    set_xterm256_foreground(246, 193,  65),
    set_xterm256_foreground(247, 240,  86),
    set_xterm256_foreground(144, 201, 135),
    set_xterm256_foreground( 78, 178, 101),
};
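// A hedged usage sketch, not part of the original patch: print text in an
// approximate 24-bit color using the helper above, then reset the terminal.
// The reset sequence "\033[0m" is plain ANSI, nothing defined in this file.
static void print_colored(const std::string & text, int r, int g, int b) {
    printf("%s%s\033[0m", set_xterm256_foreground(r, g, b).c_str(), text.c_str());
}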
static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s -m model.gguf -p \"Hello!\"\n", argv[0]);
    LOG("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    params.prompt = "";

    params.n_predict = 1024;
    params.n_batch   = 8192;
    params.n_ctx     = 8192;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
        return 1;
    }

    common_init();

    // init LLM

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model_ttc = NULL; // text-to-codes
    llama_model * model_cts = NULL; // codes-to-speech

    llama_context * ctx_ttc = NULL;
    llama_context * ctx_cts = NULL;

    common_init_result llama_init_ttc = common_init_from_params(params);
    model_ttc = llama_init_ttc.model;
    ctx_ttc   = llama_init_ttc.context;

    params.model = params.vocoder.model;

    common_init_result llama_init_cts = common_init_from_params(params);
    model_cts = llama_init_cts.model;
    ctx_cts   = llama_init_cts.context;

    const auto t_main_start = ggml_time_us();
    std::vector<llama_token> prompt_inp = {198, 88225, 155856, 151669, 152205,
        153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695,
        153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010,
        153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286,
        152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296,
        153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690,
        153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061,
        153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670,
        198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683,
        152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908,
        151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359,
        153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424,
        151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670,
        198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729,
        152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669,
        153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670,
        198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501,
        152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242,
        153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360,
        153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055,
        152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670,
        198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441,
        152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831,
        153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133,
        153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109,
        152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055,
        155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729,
        151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337,
        153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153,
        153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365,
        153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218,
        152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464,
        152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855,
        152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418,
        153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645};

    {
        const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true);
        LOG_INF("prompt: '%s'\n", inp_txt.c_str());
        LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size());
    }

    // remove all non-audio tokens (i.e. < 151672 || > 155772)
    prompt_inp.erase(std::remove_if(prompt_inp.begin(), prompt_inp.end(), [](llama_token t) { return t < 151672 || t > 155772; }), prompt_inp.end());

    {
        const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true);
        LOG_INF("prompt audio: '%s'\n", inp_txt.c_str());
        LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
    }


    llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < prompt_inp.size(); ++i) {
        common_batch_add(batch, prompt_inp[i], i, { 0 }, true); // TODO: all logits?
    }
    GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());

    if (llama_decode(ctx_ttc, batch) != 0) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

    llama_synchronize(ctx_ttc);

    LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

    const float * embd = llama_get_embeddings(ctx_ttc);

    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");

    fprintf(stderr, "\n");

    llama_free(ctx_ttc);
    llama_free_model(model_ttc);

    llama_free(ctx_cts);
    llama_free_model(model_cts);

    llama_backend_free();

    return 0;
}
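The hardcoded prompt above is a precomputed OuteTTS "codes" sequence, and the erase call keeps only the ids in the audio-token block before anything is decoded. A hedged helper capturing that step plus the rebase that a later patch in this series adds (the bounds 151672..155772 are taken verbatim from the filter above):

    #include <vector>
    #include "llama.h"

    // keep only audio tokens and shift them into the vocoder codebook range,
    // mirroring the erase above and the "token -= 151672" loop added later
    static std::vector<llama_token> extract_audio_codes(const std::vector<llama_token> & inp) {
        std::vector<llama_token> codes;
        for (llama_token t : inp) {
            if (t >= 151672 && t <= 155772) {
                codes.push_back(t - 151672);
            }
        }
        return codes;
    }
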
MODEL_TENSOR.HANN_WINDOW: "hann_window", } @@ -1416,6 +1416,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_EMBD_SHIFT, + MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, MODEL_TENSOR.CONV_NEXT_SHIFT, @@ -1434,7 +1435,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K, MODEL_TENSOR.POS_NET_ATTN_V, MODEL_TENSOR.POS_NET_ATTN_OUT, - MODEL_TENSOR.QNTZ_CBOOK_EMBD, MODEL_TENSOR.HANN_WINDOW, ], # TODO diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 39eeea434468d..4355ccf111456 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -28,7 +28,7 @@ class TensorNameMap: "transformer.token_embeddings", # openelm "shared", # t5 "rwkv.embeddings", # rwkv - "backbone.embed", # outetts + "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" # outetts ), # Token type embeddings @@ -102,6 +102,10 @@ class TensorNameMap: MODEL_TENSOR.HANN_WINDOW: ( "head.istft.window", # outetts ), + + MODEL_TENSOR.CONV1D: ( + "backbone.embed", # roberta + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { @@ -772,10 +776,6 @@ class TensorNameMap: MODEL_TENSOR.POS_NET_ATTN_OUT: ( "backbone.pos_net.{bid}.proj_out", # outetts ), - - MODEL_TENSOR.QNTZ_CBOOK_EMBD: ( - "feature_extractor.encodec.quantizer.vq.layers.{bid}._codebook.embed", # outetts - ), } # architecture-specific block mappings diff --git a/src/llama.cpp b/src/llama.cpp index b7b04a41d99e6..eefedab8bb8ce 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -197,6 +197,7 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, + LLM_ARCH_OUTETTS_VOC, LLM_ARCH_UNKNOWN, }; @@ -253,6 +254,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, + { LLM_ARCH_OUTETTS_VOC, "outetts-voc" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -503,6 +505,7 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_EMBD_SHIFT, LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, @@ -609,6 +612,24 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, + LLM_TENSOR_CONV1D, + LLM_TENSOR_CONV_NEXT_DW, + LLM_TENSOR_CONV_NEXT_NORM, + LLM_TENSOR_CONV_NEXT_SHIFT, + LLM_TENSOR_CONV_NEXT_PW1, + LLM_TENSOR_CONV_NEXT_PW2, + LLM_TENSOR_CONV_NEXT_GAMMA, + LLM_TENSOR_POS_NET_CONV1, + LLM_TENSOR_POS_NET_CONV2, + LLM_TENSOR_POS_NET_NORM, + LLM_TENSOR_POS_NET_NORM1, + LLM_TENSOR_POS_NET_NORM2, + LLM_TENSOR_POS_NET_ATTN_NORM, + LLM_TENSOR_POS_NET_ATTN_Q, + LLM_TENSOR_POS_NET_ATTN_K, + LLM_TENSOR_POS_NET_ATTN_V, + LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_HANN_WINDOW, }; static const std::map> LLM_TENSOR_NAMES = { @@ -1593,6 +1614,34 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, + { + LLM_ARCH_OUTETTS_VOC, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_EMBD_SHIFT, "token_embd_shift" }, + { LLM_TENSOR_CONV1D, "conv1d" }, + { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, + { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, + { LLM_TENSOR_CONV_NEXT_SHIFT, "conv_next.shift" }, + { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.pw1" }, + { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.pw2" }, + { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.gamma" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, 
+ { LLM_TENSOR_POS_NET_CONV1, "pos_net.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "pos_net.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "pos_net.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "pos_net.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "pos_net.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "pos_net.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "pos_net.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.attn_output" }, + { LLM_TENSOR_HANN_WINDOW, "hann_window" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2489,7 +2538,7 @@ struct llama_hparams { bool use_par_res; bool swin_norm; - uint32_t n_vocab; + uint32_t n_vocab = 0; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; uint32_t n_layer; @@ -3005,6 +3054,9 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; + // quantizer + struct ggml_tensor * qntz_cbook_embd = nullptr; + std::vector layers; // gguf metadata @@ -5519,7 +5571,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -5545,8 +5597,8 @@ static void llm_load_hparams( std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -6320,7 +6372,7 @@ static void llm_load_vocab( ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); - if (tokenizer_model == "no_vocab") { + if (tokenizer_model == "no_vocab" || tokenizer_model == "none") { vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens @@ -9336,9 +9388,9 @@ static bool llm_load_tensors( } break; case LLM_ARCH_CHAMELEON: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed @@ -9367,6 +9419,10 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_OUTETTS_VOC: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + } break; default: throw std::runtime_error("unknown architecture"); } @@ -20383,6 +20439,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: + case LLM_ARCH_OUTETTS_VOC: return 
From aac7e0495379217f69c59465a17d5fbb4a6699dd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 18:23:10 +0200
Subject: [PATCH 07/45] extract features

---
 examples/tts/convert_pt_to_hf.py |   1 +
 examples/tts/tts.cpp             |  15 ++++-
 src/llama.cpp                    | 104 +++++++++++++++++++++++--------
 3 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index a652bae4361a2..389d2de50efe9 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -138,6 +138,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
    ],
    "hidden_size": 512,
    "vocab_size": 4096,
+    "n_head": 1,
    "max_position_embeddings": 8192, # ?
    "num_hidden_layers": 12
}

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 768015a5271c2..d3fee7373ea3b 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -88,6 +88,7 @@ int main(int argc, char ** argv) {
    ctx_ttc = llama_init_ttc.context;

    params.model = params.vocoder.model;
+    params.embedding = true;

    common_init_result llama_init_cts = common_init_from_params(params);
    model_cts = llama_init_cts.model;
    ctx_cts   = llama_init_cts.context;
@@ -146,6 +147,9 @@ int main(int argc, char ** argv) {
        LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
    }

+    for (auto & token : prompt_inp) {
+        token -= 151672;
+    }

    llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);
@@ -155,22 +159,27 @@ int main(int argc, char ** argv) {
    }
    GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());

-    if (llama_decode(ctx_ttc, batch) != 0) {
+    if (llama_decode(ctx_cts, batch) != 0) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

-    llama_synchronize(ctx_ttc);
+    llama_synchronize(ctx_cts);

    LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

-    const float * embd = llama_get_embeddings(ctx_ttc);
+    const float * embd = llama_get_embeddings(ctx_cts);

    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");
+    double sum = 0.0;
+    for (int i = 0; i < 261*512; ++i) {
+        sum += embd[i];
+    }
+    LOG("sum: %f\n", sum);

    fprintf(stderr, "\n");
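With params.embedding = true and no pooling, the context returns one embedding row per input token; the 261*512 checksum above matches the 261 audio codes in the hardcoded prompt and the vocoder's hidden_size of 512 from its config. A sketch of walking that buffer row by row (assumption: the row count equals the number of decoded tokens):

    #include "llama.h"

    // per-token embeddings are laid out row-major: n_tokens rows of n_embd floats
    static double embd_checksum(llama_context * ctx, int n_tokens, int n_embd) {
        const float * embd = llama_get_embeddings(ctx);
        double sum = 0.0;
        for (int i = 0; i < n_tokens; ++i) {
            for (int j = 0; j < n_embd; ++j) {
                sum += embd[i*n_embd + j];
            }
        }
        return sum;
    }
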
diff --git a/src/llama.cpp b/src/llama.cpp
index eefedab8bb8ce..3262b0218eb1b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3614,7 +3614,9 @@ static bool llama_kv_cache_init(

    const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);

    cache.has_shift = false;
@@ -3655,10 +3657,12 @@ static bool llama_kv_cache_init(
    cache.k_l.reserve(n_layer);
    cache.v_l.reserve(n_layer);

-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
        ggml_backend_buffer_type_t buft;
        if (offload) {
            auto * dev = model.dev_layer.at(i).dev;
@@ -5032,7 +5036,8 @@ struct llama_model_loader {

    void done_getting_tensors() const {
        if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+            // TODO: TEMPORARY DISABLED
+            //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
        }
    }
@@ -9422,6 +9427,10 @@ static bool llm_load_tensors(
        case LLM_ARCH_OUTETTS_VOC:
            {
                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
+                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
            } break;
        default:
            throw std::runtime_error("unknown architecture");
@@ -16991,6 +17000,30 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_outetts_voc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = inpL;
+
+        //cur = llm_build_norm(ctx0, cur, hparams,
+        //        model.output_norm, NULL,
+        //        LLM_NORM_RMS, cb, -1);
+        //cb(cur, "result_norm", -1);
+
+        //// lm_head
+        //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        //cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -17266,13 +17299,18 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_chameleon();
            } break;
+        case LLM_ARCH_OUTETTS_VOC:
+            {
+                result = llm.build_outetts_voc();
+            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-        result = llm.append_pooling(result);
+        // TODO: TEMPORARY DISABLED
+        //result = llm.append_pooling(result);
    }

    llm.free();
@@ -17357,30 +17395,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
    }

    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");

-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;

-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
            }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
        }
    }
@@ -18029,9 +18072,14 @@ static int llama_decode_internal(

        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

+        struct ggml_tensor * res  = nullptr;
+        struct ggml_tensor * embd = nullptr;
+
+// TODO: TEMPORARY DISABLED
+if (model.arch != LLM_ARCH_OUTETTS_VOC) {
        // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
-        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+        res  = ggml_graph_node(gf, -1);
+        embd = ggml_graph_node(gf, -2);

        if (lctx.n_outputs == 0) {
            // no output
@@ -18051,6 +18099,10 @@ static int llama_decode_internal(
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
        }
+} else {
+    res  = nullptr;
+    embd = ggml_graph_node(gf, -1);
+}
        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
From 6ef14091c07b12d5c3dd3d02adb0e4000d433307 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 19:18:04 +0200
Subject: [PATCH 08/45] first conv

---
 examples/tts/tts.cpp |  8 +++++++-
 ggml/src/ggml.c      |  2 +-
 src/llama.cpp        | 40 ++++++++++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index d3fee7373ea3b..17dc0eff2cd52 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -170,13 +170,19 @@ int main(int argc, char ** argv) {

    const float * embd = llama_get_embeddings(ctx_cts);

+    int n = 768*261;
+
    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");
+    for (int i = n - 10; i < n; ++i) {
+        LOG("%8.3f ", embd[i]);
+    }
+    LOG("\n");
    double sum = 0.0;
-    for (int i = 0; i < 261*512; ++i) {
+    for (int i = 0; i < n; ++i) {
        sum += embd[i];
    }
    LOG("sum: %f\n", sum);

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 0efd2b2ebf780..3665488b30c40 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3874,7 +3874,7 @@ struct ggml_tensor * ggml_im2col(
        int d1,
        bool is_2D,
        enum ggml_type dst_type) {
-    if(is_2D) {
+    if (is_2D) {
        GGML_ASSERT(a->ne[2] == b->ne[2]);
    } else {
        GGML_ASSERT(a->ne[1] == b->ne[1]);
llama_hparams & hparams, ggml_tensor * w ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); } break; + case GGML_OP_IM2COL: + { + int n_embd = hparams.n_embd; + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); + op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); + } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); } @@ -9428,6 +9435,9 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -9671,7 +9681,7 @@ static struct ggml_tensor * llm_build_inp_embd( inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); inpL = lctx.inp_embd; ggml_set_input(lctx.inp_embd); } @@ -17009,7 +17019,13 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - cur = inpL; + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); + printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); + cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, // model.output_norm, NULL, @@ -17309,7 +17325,7 @@ static struct ggml_cgraph * llama_build_graph( // add on pooling layer if (lctx.cparams.embeddings) { - // TODO: TEMPORARY DISABLED + // TODO: TEMPORARY DISABLED [OUTETTS] //result = llm.append_pooling(result); } @@ -17798,7 +17814,13 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { } const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + + // TODO: TEMPORARY !!! 
@@ -17309,7 +17325,7 @@ static struct ggml_cgraph * llama_build_graph(

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-        // TODO: TEMPORARY DISABLED
+        // TODO: TEMPORARY DISABLED [OUTETTS]
        //result = llm.append_pooling(result);
    }

    llm.free();
@@ -17798,7 +17814,13 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
+
+    // TODO: TEMPORARY !!! [OUTETTS]
+#if 0
    const size_t new_size = (logits_size + embd_size) * sizeof(float);
+#else
+    const size_t new_size = 1024*1024*32;
+#endif

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
@@ -18095,7 +18117,7 @@ static int llama_decode_internal(
        struct ggml_tensor * res  = nullptr;
        struct ggml_tensor * embd = nullptr;

-// TODO: TEMPORARY DISABLED
+// TODO: TEMPORARY DISABLED [OUTETTS]
if (model.arch != LLM_ARCH_OUTETTS_VOC) {
        // the output is always the last tensor in the graph
        res  = ggml_graph_node(gf, -1);
@@ -18170,7 +18192,9 @@ static int llama_decode_internal(
                        if (n_outputs_new) {
                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                            GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
-                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                            // TODO: TEMPORARY [OUTETTS]
+                            //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*768*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_MEAN:
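The 32 MB constant above is an explicit placeholder for the output buffer. The exact requirement on this path would be one full row per token of the vocoder's hidden size, as the extraction code above implies; a sketch of the size the hack stands in for (assumption: n_embd_voc = 768, as hardcoded in the tensor_get_async call):

    #include <cstddef>
    #include <cstdint>

    // per-token float embeddings for the vocoder output path
    static size_t vocoder_embd_size(int64_t n_tokens, int64_t n_embd_voc /* 768 here */) {
        return (size_t) n_tokens * (size_t) n_embd_voc * sizeof(float);
    }
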
From 5296c96ca8ea4677c3585467e88f78b677f39989 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 20:33:29 +0200
Subject: [PATCH 09/45] group norm

---
 src/llama.cpp | 44 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8e62ea324fe25..bfc8d6601ec3e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1629,16 +1629,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_CONV_NEXT_GAMMA,   "conv_next.gamma" },
            { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
            { LLM_TENSOR_OUTPUT,            "output" },
-            { LLM_TENSOR_POS_NET_CONV1,     "pos_net.conv1" },
-            { LLM_TENSOR_POS_NET_CONV2,     "pos_net.conv2" },
+            { LLM_TENSOR_POS_NET_CONV1,     "pos_net.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2,     "pos_net.%d.conv2" },
            { LLM_TENSOR_POS_NET_NORM,      "pos_net.norm" },
-            { LLM_TENSOR_POS_NET_NORM1,     "pos_net.norm1" },
-            { LLM_TENSOR_POS_NET_NORM2,     "pos_net.norm2" },
-            { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.attn_norm" },
-            { LLM_TENSOR_POS_NET_ATTN_Q,    "pos_net.attn_q" },
-            { LLM_TENSOR_POS_NET_ATTN_K,    "pos_net.attn_k" },
-            { LLM_TENSOR_POS_NET_ATTN_V,    "pos_net.attn_v" },
-            { LLM_TENSOR_POS_NET_ATTN_OUT,  "pos_net.attn_output" },
+            { LLM_TENSOR_POS_NET_NORM1,     "pos_net.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2,     "pos_net.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q,    "pos_net.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K,    "pos_net.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V,    "pos_net.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT,  "pos_net.%d.attn_output" },
            { LLM_TENSOR_HANN_WINDOW,       "hann_window" },
        },
    },
@@ -3054,9 +3054,13 @@ struct llama_model {
    // outetts vocoder
    struct ggml_tensor * conv_1d   = nullptr;
    struct ggml_tensor * conv_1d_b = nullptr;

+    struct ggml_tensor * posnet_0_norm1   = nullptr;
+    struct ggml_tensor * posnet_0_norm1_b = nullptr;
+
    std::vector<llama_layer> layers;

    // gguf metadata
@@ -7357,6 +7361,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_CONV1D,                 {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};

// checks if the weight tensor can be used with the specified buffer type and device
@@ -9443,6 +9448,9 @@ static bool llm_load_tensors(
                model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
                model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {768}, 0);

+                model.posnet_0_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0);
+                model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   0), {768}, 0);
+
                // output
                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -9661,6 +9669,7 @@ enum llm_ffn_gate_type {
enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
+    LLM_NORM_GROUP,
};

static struct ggml_tensor * llm_build_inp_embd(
@@ -9802,8 +9811,15 @@ static struct ggml_tensor * llm_build_norm(
        const llm_build_cb & cb,
        int il) {
    switch (type) {
-        case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
-        case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM:       cur = ggml_norm     (ctx, cur, hparams.f_norm_eps);     break;
+        case LLM_NORM_RMS:   cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP:
+            {
+                // TODO: these reshapes should be removed, fix ggml_group_norm
+                cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+                cur = ggml_group_norm(ctx, cur, 32, 1e-6); // TODO: add groups, eps params
+                cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
+            } break;
    }

    if (mw || mb) {
@@ -17025,6 +17041,12 @@ struct llm_build_context {
        cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1);
        cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0]));

+        cur = llm_build_norm(ctx0, cur, hparams,
+                ggml_reshape_2d(ctx0, model.posnet_0_norm1,   1, model.posnet_0_norm1->ne[0]),
+                ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]),
+                LLM_NORM_GROUP, cb, 0);
+
        printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);

        //cur = llm_build_norm(ctx0, cur, hparams,
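ggml_group_norm normalizes over the channel dimension of a 3d/4d tensor, which is why the LLM_NORM_GROUP case above lifts a [T, C] activation to [T, 1, C] first. A standalone sketch of the same dance, mirroring the case added above (the TODO there suggests the reshapes should eventually move into ggml_group_norm itself):

    #include "ggml.h"

    // apply ggml_group_norm to a [T, C] activation by lifting it to 3d,
    // using the same 32 groups and 1e-6 eps as the LLM_NORM_GROUP case
    static struct ggml_tensor * group_norm_2d(struct ggml_context * ctx, struct ggml_tensor * cur) {
        cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
        cur = ggml_group_norm(ctx, cur, 32, 1e-6f);
        return ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    }
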
GGML_OP_IM2COL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9446,6 +9458,15 @@ static bool llm_load_tensors( model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {768}, 0); + model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, 768, 768}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {768}, 0); + + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {768}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {768}, 0); + + model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17047,6 +17068,11 @@ struct llm_build_context { ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From 13dd8941a4cf0a300045935670808119d02a1e97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 20:50:13 +0200 Subject: [PATCH 11/45] resnet --- src/llama.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index bfc8d6601ec3e..47bfbf9b3dd02 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3070,6 +3070,18 @@ struct llama_model { struct ggml_tensor * posnet_0_conv2 = nullptr; struct ggml_tensor * posnet_0_conv2_b = nullptr; + struct ggml_tensor * posnet_1_norm1 = nullptr; + struct ggml_tensor * posnet_1_norm1_b = nullptr; + + struct ggml_tensor * posnet_1_conv1 = nullptr; + struct ggml_tensor * posnet_1_conv1_b = nullptr; + + struct ggml_tensor * posnet_1_norm2 = nullptr; + struct ggml_tensor * posnet_1_norm2_b = nullptr; + + struct ggml_tensor * posnet_1_conv2 = nullptr; + struct ggml_tensor * posnet_1_conv2_b = nullptr; + std::vector layers; // gguf metadata @@ -9467,6 +9479,18 @@ static bool llm_load_tensors( model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {768}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {768}, 0); + + model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, 768, 768}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {768}, 0); + + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {768}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {768}, 0); + + model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); + 
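// [editor's sketch, not part of the patch series] The PosNet residual blocks
// wired up in these patches all share one pattern: GroupNorm -> SiLU -> conv1d,
// done twice, plus a skip connection. A self-contained helper capturing one
// norm/SiLU/conv pass (the helper name and this factoring are hypothetical;
// activations are assumed to be [T, 768] as in the graph code above):
static struct ggml_tensor * posnet_norm_silu_conv(
        struct ggml_context * ctx,
        struct ggml_tensor * cur,                             // activations [T, C]
        struct ggml_tensor * nw,  struct ggml_tensor * nb,    // GroupNorm scale/shift [C]
        struct ggml_tensor * cw,  struct ggml_tensor * cwb) { // conv kernel [3, C, C] + bias [C]
    // GroupNorm via the reshape trick introduced above: ggml_group_norm groups
    // channels along ne[2], so go [T, C] -> [T, 1, C] -> norm -> [T, C]
    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    cur = ggml_group_norm(ctx, cur, 32, 1e-6f);
    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    cur = ggml_mul(ctx, cur, ggml_reshape_2d(ctx, nw, 1, nw->ne[0]));
    cur = ggml_add(ctx, cur, ggml_reshape_2d(ctx, nb, 1, nb->ne[0]));
    // SiLU, written as x * sigmoid(x) exactly as the patch does
    cur = ggml_mul(ctx, ggml_sigmoid(ctx, cur), cur);
    // same-length convolution over time ("ph" = half padding)
    cur = ggml_conv_1d_ph(ctx, cw, cur, 1, 1);
    return ggml_add(ctx, cur, ggml_reshape_2d(ctx, cwb, 1, cwb->ne[0]));
}
// one residual block is then two passes plus the skip:
//   cur = posnet_norm_silu_conv(ctx0, inp, norm1, norm1_b, conv1, conv1_b);
//   cur = posnet_norm_silu_conv(ctx0, cur, norm2, norm2_b, conv2, conv2_b);
//   cur = ggml_add(ctx0, cur, inp);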
model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17060,18 +17084,63 @@ struct llm_build_context { printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); + cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); - cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), - LLM_NORM_GROUP, cb, 0); + // resnet block 0 + { + struct ggml_tensor * cur_rnet = cur; + + cur_rnet = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur_rnet, cur); + } + + // resnet block 1 + { + struct ggml_tensor * cur_rnet = cur; + + cur_rnet = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + + cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur_rnet, cur); + } printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From 3046fde4205d159ad4967f20564a01b808464aa7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 
21:59:45 +0200 Subject: [PATCH 12/45] attn --- convert_hf_to_gguf.py | 6 +- src/llama.cpp | 169 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 152 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4b51a2ad99423..ebeb3840cd587 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -296,7 +296,9 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - data = data_torch.squeeze().numpy() + # TODO: why do we squeeze here? + #data = data_torch.squeeze().numpy() + data = data_torch.numpy() # if data ends up empty, it means data_torch was a scalar tensor -> restore if len(data.shape) == 0: @@ -2044,6 +2046,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping {name!r}") return [] + print(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + return [(self.map_tensor_name(name), data_torch)] def set_vocab(self): diff --git a/src/llama.cpp b/src/llama.cpp index 47bfbf9b3dd02..cae15fde5dbf4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3055,9 +3055,11 @@ struct llama_model { struct ggml_tensor * cls_out_b = nullptr; // outetts vocoder + // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; + // resnet 0 struct ggml_tensor * posnet_0_norm1 = nullptr; struct ggml_tensor * posnet_0_norm1_b = nullptr; @@ -3070,6 +3072,7 @@ struct llama_model { struct ggml_tensor * posnet_0_conv2 = nullptr; struct ggml_tensor * posnet_0_conv2_b = nullptr; + // resnet 1 struct ggml_tensor * posnet_1_norm1 = nullptr; struct ggml_tensor * posnet_1_norm1_b = nullptr; @@ -3082,6 +3085,48 @@ struct llama_model { struct ggml_tensor * posnet_1_conv2 = nullptr; struct ggml_tensor * posnet_1_conv2_b = nullptr; + // attn 2 + struct ggml_tensor * posnet_2_attn_norm = nullptr; + struct ggml_tensor * posnet_2_attn_norm_b = nullptr; + + struct ggml_tensor * posnet_2_attn_q = nullptr; + struct ggml_tensor * posnet_2_attn_q_b = nullptr; + + struct ggml_tensor * posnet_2_attn_k = nullptr; + struct ggml_tensor * posnet_2_attn_k_b = nullptr; + + struct ggml_tensor * posnet_2_attn_v = nullptr; + struct ggml_tensor * posnet_2_attn_v_b = nullptr; + + struct ggml_tensor * posnet_2_attn_o = nullptr; + struct ggml_tensor * posnet_2_attn_o_b = nullptr; + + // resnet 3 + struct ggml_tensor * posnet_3_norm1 = nullptr; + struct ggml_tensor * posnet_3_norm1_b = nullptr; + + struct ggml_tensor * posnet_3_conv1 = nullptr; + struct ggml_tensor * posnet_3_conv1_b = nullptr; + + struct ggml_tensor * posnet_3_norm2 = nullptr; + struct ggml_tensor * posnet_3_norm2_b = nullptr; + + struct ggml_tensor * posnet_3_conv2 = nullptr; + struct ggml_tensor * posnet_3_conv2_b = nullptr; + + // resnet 4 + struct ggml_tensor * posnet_4_norm1 = nullptr; + struct ggml_tensor * posnet_4_norm1_b = nullptr; + + struct ggml_tensor * posnet_4_conv1 = nullptr; + struct ggml_tensor * posnet_4_conv1_b = nullptr; + + struct ggml_tensor * posnet_4_norm2 = nullptr; + struct ggml_tensor * posnet_4_norm2_b = nullptr; + + struct ggml_tensor * posnet_4_conv2 = nullptr; + struct ggml_tensor * posnet_4_conv2_b = nullptr; + std::vector layers; // gguf metadata @@ -7386,6 +7431,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + 
{LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9491,6 +9541,45 @@ static bool llm_load_tensors( model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {768}, 0); + model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {768}, 0); + + model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {768}, 0); + + model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {768}, 0); + + model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {768}, 0); + + model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {768}, 0); + + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {768}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {768}, 0); + + model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, 768, 768}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {768}, 0); + + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {768}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {768}, 0); + + model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, 768, 768}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {768}, 0); + + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {768}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {768}, 0); + + model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, 768, 768}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {768}, 0); + + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {768}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {768}, 0); + + model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17088,58 +17177,94 @@ struct llm_build_context { cur = ggml_conv_1d_ph(ctx0, 
model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + inpL = cur; + // resnet block 0 { - struct ggml_tensor * cur_rnet = cur; - - cur_rnet = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); - cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); - cur = ggml_add(ctx0, cur_rnet, cur); + cur = ggml_add(ctx0, cur, inpL); } + inpL = cur; + // resnet block 1 { - struct ggml_tensor * cur_rnet = cur; - - cur_rnet = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); - cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + inpL = cur; + + // attention block + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_2_attn_norm, 1, model.posnet_2_attn_norm->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_2_attn_norm_b, 1, model.posnet_2_attn_norm_b->ne[0]), + LLM_NORM_GROUP, cb, 
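// [editor's note, not part of the patch] In the attention block that follows,
// the q/k/v and output projections are all 1x1 convolutions over the [T, 768]
// activations, i.e. per-timestep linear maps. q and k are transposed so that
// the 768 channels run along ne[0]; ggml_mul_mat(k, q) then yields the [T, T]
// score matrix, which is normalized with the usual 1/sqrt(n_embd) scaling
// (here 1/sqrt(768)) via ggml_soft_max_ext before weighting v.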
0); + + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, ggml_reshape_2d(ctx0, model.posnet_2_attn_q_b, 1, model.posnet_2_attn_q_b->ne[0])); + k = ggml_add(ctx0, k, ggml_reshape_2d(ctx0, model.posnet_2_attn_k_b, 1, model.posnet_2_attn_k_b->ne[0])); + v = ggml_add(ctx0, v, ggml_reshape_2d(ctx0, model.posnet_2_attn_v_b, 1, model.posnet_2_attn_v_b->ne[0])); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - cur = ggml_add(ctx0, cur_rnet, cur); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(768)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_2_attn_o_b, 1, model.posnet_2_attn_o_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); } printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From 435cfd788be71f16d94efb9454daf990f858a04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 22:05:44 +0200 Subject: [PATCH 13/45] pos net --- src/llama.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index cae15fde5dbf4..31ce4b31ab738 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1631,7 +1631,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, { LLM_TENSOR_POS_NET_CONV2, "pos_net.%d.conv2" }, - { LLM_TENSOR_POS_NET_NORM, "pos_net.norm" }, + { LLM_TENSOR_POS_NET_NORM, "pos_net.%d.norm" }, { LLM_TENSOR_POS_NET_NORM1, "pos_net.%d.norm1" }, { LLM_TENSOR_POS_NET_NORM2, "pos_net.%d.norm2" }, { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" }, @@ -3127,6 +3127,10 @@ struct llama_model { struct ggml_tensor * posnet_4_conv2 = nullptr; struct ggml_tensor * posnet_4_conv2_b = nullptr; + // resnet 5 + struct ggml_tensor * posnet_5_norm = nullptr; + struct ggml_tensor * posnet_5_norm_b = nullptr; + std::vector layers; // gguf metadata @@ -7427,6 +7431,7 @@ static const std::map llm_tensor_info_mapping = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, + {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, @@ -9580,6 +9585,9 @@ static bool llm_load_tensors( model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + // output model.output_norm = 
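// [editor's note, not part of the patch] the final PosNet GroupNorm a few
// lines up (posnet_5_norm) is loaded through the LLM_TENSOR_POS_NET_ATTN_NORM
// name template at index 5, i.e. from "pos_net.5.attn_norm", even though it
// is a plain normalization block rather than an attention norm.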
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17267,6 +17275,69 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); } + inpL = cur; + + // resnet block 3 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_3_norm1, 1, model.posnet_3_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_3_norm1_b, 1, model.posnet_3_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv1_b, 1, model.posnet_3_conv1_b->ne[0])); + + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_3_norm2, 1, model.posnet_3_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_3_norm2_b, 1, model.posnet_3_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv2_b, 1, model.posnet_3_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + inpL = cur; + + // resnet block 4 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_4_norm1, 1, model.posnet_4_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_4_norm1_b, 1, model.posnet_4_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv1_b, 1, model.posnet_4_conv1_b->ne[0])); + + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_4_norm2, 1, model.posnet_4_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_4_norm2_b, 1, model.posnet_4_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv2_b, 1, model.posnet_4_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + // normalize block 5 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_5_norm, 1, model.posnet_5_norm->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_5_norm_b, 1, model.posnet_5_norm_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + } + + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From b3ba05e5bc62de2154198a8f335b58d54457e259 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 22:37:26 +0200 Subject: [PATCH 14/45] layer norm --- convert_hf_to_gguf.py | 2 +- examples/tts/convert_pt_to_hf.py | 10 ++++++++++ gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 6 +----- src/llama.cpp | 9 +++++++-- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ebeb3840cd587..a86490831fdbd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2046,7 +2046,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping {name!r}") return [] - print(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") return [(self.map_tensor_name(name), data_torch)] diff 
--git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 389d2de50efe9..4a0d4bcc8cde7 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -88,6 +88,16 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": new_key = "backbone.embedding.weight" + # these are the only rows used + # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100 + if new_key == "backbone.norm.scale.weight": + new_key = "backbone.norm.weight" + value = value[0] + + if new_key == "backbone.norm.shift.weight": + new_key = "backbone.norm.bias" + value = value[0] + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 14e68cffa522c..81e434d117004 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -267,7 +267,6 @@ class MODEL_ARCH(IntEnum): class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() - TOKEN_EMBD_SHIFT = auto() TOKEN_TYPES = auto() POS_EMBD = auto() OUTPUT = auto() @@ -451,7 +450,6 @@ class MODEL_TENSOR(IntEnum): TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift", MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -1415,7 +1413,6 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.OUTETTS_VOC: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, - MODEL_TENSOR.TOKEN_EMBD_SHIFT, MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 4355ccf111456..872205e77b21a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -43,11 +43,7 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv - "backbone.norm.scale", # outetts - ), - - MODEL_TENSOR.TOKEN_EMBD_SHIFT: ( - "backbone.norm.shift", # outetts + "backbone.norm", # outetts ), # Position embeddings diff --git a/src/llama.cpp b/src/llama.cpp index 31ce4b31ab738..6c38d9315ea79 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -505,7 +505,6 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, - LLM_TENSOR_TOKEN_EMBD_SHIFT, LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, @@ -1619,7 +1618,6 @@ static const std::map> LLM_TENSOR_N { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_EMBD_SHIFT, "token_embd_shift" }, { LLM_TENSOR_CONV1D, "conv1d" }, { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, @@ -9519,6 +9517,9 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); @@ -17337,6 +17338,10 @@ struct llm_build_context { LLM_NORM_GROUP, cb, 0); } + 
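// [editor's note, not part of the patch] ggml_norm normalizes along ne[0],
// but at this point in the graph the activations are [T, 768], with time in
// ne[0] and channels in ne[1]. The token-embedding LayerNorm must run over
// the channel dimension, hence the ggml_transpose below (wrapped in
// ggml_cont to make the transposed view contiguous) before tok_norm and
// tok_norm_b are applied.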
cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From fe6dd5aa61924006ce81a84ba47441fbd570c804 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:06:48 +0200 Subject: [PATCH 15/45] convnext --- examples/tts/convert_pt_to_hf.py | 12 +- ggml/include/ggml.h | 41 ++++-- ggml/src/ggml.c | 224 ++++++++++++++++++------------- gguf-py/gguf/constants.py | 3 - gguf-py/gguf/tensor_mapping.py | 6 +- src/llama.cpp | 92 +++++++++++-- 6 files changed, 253 insertions(+), 125 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 4a0d4bcc8cde7..501fc4d6a6aa4 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -90,14 +90,17 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): # these are the only rows used # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100 - if new_key == "backbone.norm.scale.weight": - new_key = "backbone.norm.weight" + if new_key.endswith("norm.scale.weight"): + new_key = new_key.replace("norm.scale.weight", "norm.weight") value = value[0] - if new_key == "backbone.norm.shift.weight": - new_key = "backbone.norm.bias" + if new_key.endswith("norm.shift.weight"): + new_key = new_key.replace("norm.shift.weight", "norm.bias") value = value[0] + if new_key.endswith("gamma"): + new_key = new_key.replace("gamma", "gamma.weight") + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") @@ -149,6 +152,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "hidden_size": 512, "vocab_size": 4096, "n_head": 1, + "layer_norm_epsilon": 1e-6, "max_position_embeddings": 8192, # ? "num_hidden_layers": 12 } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b0c1ac9ce2b89..c714fc8c837bb 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1564,17 +1564,6 @@ extern "C" { int d1, // dilation dimension 1 bool is_2D); - GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1592,6 +1581,23 @@ extern "C" { int s, // stride int d); // dilation + // depthwise + // TODO: this is very likely wrong for some cases! 
- needs more testing + GGML_API struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int d0); // dilation + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1611,7 +1617,6 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 - // kernel size is a->ne[0] x a->ne[1] // stride is equal to kernel size // padding is zero @@ -1638,6 +1643,18 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // depthwise + GGML_API struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3665488b30c40..7c0159ab49c9f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3760,13 +3760,84 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d - static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D, + enum ggml_type dst_type) { + if (is_2D) { + GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + //GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + GGML_ASSERT(b->ne[1] == a->ne[1]); + GGML_ASSERT(b->ne[3] == 1); + } + + const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); + GGML_ASSERT((OW > 0) && "b too small compared to a"); + + const int64_t ne[4] = { + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], + OW, + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, + }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_im2col_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t * ne, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL_BACK; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d + +struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -3775,6 +3846,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int d0) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] + printf("a: %lld %lld %lld %lld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + printf("b: %lld %lld %lld %lld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]); + printf("im2col: %lld %lld %lld %lld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]); + struct ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] @@ -3796,137 +3871,75 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} +// ggml_conv_1d_dw -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( +struct ggml_tensor * ggml_conv_1d_dw( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, int p0, int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]); + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]); - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); + struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a); - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->src[0] = a; - result->src[1] = b; + result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1); return result; } -// ggml_conv_depthwise +// ggml_conv_1d_dw_ph -struct ggml_tensor * ggml_conv_depthwise_2d( +struct ggml_tensor * ggml_conv_1d_dw_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { - struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); - struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, - ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] - struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + int d0) { + return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); +} - new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] - struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, 
OH, OW] +// ggml_conv_transpose_1d - return result; +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; } -// ggml_conv_2d -// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] -// a: [OC,IC, KH, KW] -// b: [N, IC, IH, IW] -// result: [N, OH, OW, IC*KH*KW] -struct ggml_tensor * ggml_im2col( +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, int p0, - int p1, - int d0, - int d1, - bool is_2D, - enum ggml_type dst_type) { - if (is_2D) { - GGML_ASSERT(a->ne[2] == b->ne[2]); - } else { - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(b->ne[3] == 1); - } - - const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); - GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); - GGML_ASSERT((OW > 0) && "b too small compared to a"); + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); const int64_t ne[4] = { - is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], - OW, - is_2D ? OH : b->ne[2], - is_2D ? b->ne[3] : 1, + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL; + result->op = GGML_OP_CONV_TRANSPOSE_1D; result->src[0] = a; result->src[1] = b; return result; } -struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t * ne, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D) { - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_IM2COL_BACK; - result->src[0] = a; - result->src[1] = b; - - return result; -} +// ggml_conv_2d // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] @@ -3973,6 +3986,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph( return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); } +// ggml_conv_2d_dw + +struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 81e434d117004..ea74354a43b8b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -374,7 +374,6 @@ class MODEL_TENSOR(IntEnum): CONV1D = auto() CONV_NEXT_DW = auto() CONV_NEXT_NORM = auto() - CONV_NEXT_SHIFT = auto() CONV_NEXT_PW1 = auto() CONV_NEXT_PW2 = auto() CONV_NEXT_GAMMA = auto() @@ -557,7 +556,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CONV1D: "conv1d", MODEL_TENSOR.CONV_NEXT_DW: "conv_next.{bid}.dw", MODEL_TENSOR.CONV_NEXT_NORM: "conv_next.{bid}.norm", - MODEL_TENSOR.CONV_NEXT_SHIFT: "conv_next.{bid}.shift", MODEL_TENSOR.CONV_NEXT_PW1: "conv_next.{bid}.pw1", MODEL_TENSOR.CONV_NEXT_PW2: "conv_next.{bid}.pw2", MODEL_TENSOR.CONV_NEXT_GAMMA: "conv_next.{bid}.gamma", @@ -1416,7 +1414,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, - MODEL_TENSOR.CONV_NEXT_SHIFT, MODEL_TENSOR.CONV_NEXT_PW1, MODEL_TENSOR.CONV_NEXT_PW2, MODEL_TENSOR.CONV_NEXT_GAMMA, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 872205e77b21a..ca7bb40fef11b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -714,11 +714,7 @@ class TensorNameMap: ), MODEL_TENSOR.CONV_NEXT_NORM: ( - "backbone.convnext.{bid}.norm.scale", # outetts - ), - - MODEL_TENSOR.CONV_NEXT_SHIFT: ( - "backbone.convnext.{bid}.norm.shift", # outetts + "backbone.convnext.{bid}.norm", # outetts ), MODEL_TENSOR.CONV_NEXT_PW1: ( diff --git a/src/llama.cpp b/src/llama.cpp index 6c38d9315ea79..c94cdd383d709 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -614,7 +614,6 @@ enum llm_tensor { LLM_TENSOR_CONV1D, LLM_TENSOR_CONV_NEXT_DW, LLM_TENSOR_CONV_NEXT_NORM, - LLM_TENSOR_CONV_NEXT_SHIFT, LLM_TENSOR_CONV_NEXT_PW1, LLM_TENSOR_CONV_NEXT_PW2, LLM_TENSOR_CONV_NEXT_GAMMA, @@ -1619,12 +1618,11 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_CONV1D, 
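// [editor's note, not part of the patch] the "%d" placeholders in these name
// templates are filled in with the block index by the tn(...) helper, so the
// per-layer ConvNeXt tensors resolve to names like "conv_next.0.dw",
// "conv_next.0.norm", etc., matching what the gguf exporter writes.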
"conv1d" }, - { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, - { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, - { LLM_TENSOR_CONV_NEXT_SHIFT, "conv_next.shift" }, - { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.pw1" }, - { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.pw2" }, - { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.gamma" }, + { LLM_TENSOR_CONV_NEXT_DW, "conv_next.%d.dw" }, + { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.%d.norm" }, + { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.%d.pw1" }, + { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.%d.pw2" }, + { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.%d.gamma" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, @@ -2922,6 +2920,21 @@ struct llama_layer { struct ggml_tensor * ffn_gate_scale; struct ggml_tensor * ffn_up_scale; struct ggml_tensor * ffn_down_scale; + + // convnext + struct ggml_tensor * convnext_dw; + struct ggml_tensor * convnext_dw_b; + + struct ggml_tensor * convnext_norm; + struct ggml_tensor * convnext_norm_b; + + struct ggml_tensor * convnext_pw1; + struct ggml_tensor * convnext_pw1_b; + + struct ggml_tensor * convnext_pw2; + struct ggml_tensor * convnext_pw2_b; + + struct ggml_tensor * convnext_gamma; }; // very similar to llama_batch, @@ -6420,6 +6433,10 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OUTETTS_VOC: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + } break; default: (void)0; } @@ -7439,6 +7456,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_CONV_NEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9589,6 +9611,25 @@ static bool llm_load_tensors( model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, 768}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {768}, 0); + + layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {768}, 0); + layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {768}, 0); + + // TODO: n_ff + layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {768, 2304}, 0); + layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {2304}, 0); + + layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {2304, 768}, 0); + layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {768}, 0); + + layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {768}, 0); + } + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); 
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17338,11 +17379,46 @@ struct llm_build_context { LLM_NORM_GROUP, cb, 0); } - cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams, + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.layers[il].convnext_dw_b, 1, model.layers[il].convnext_dw_b->ne[0])); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].convnext_norm, + model.layers[il].convnext_norm_b, + LLM_NORM, cb, -1); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].convnext_pw1, model.layers[il].convnext_pw1_b, NULL, + NULL, NULL, NULL, + model.layers[il].convnext_pw2, model.layers[il].convnext_pw2_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + + cur = ggml_mul(ctx0, cur, model.layers[il].convnext_gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From 839035d1bbb818feb218c4a74a175fdda81f00fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:22:12 +0200 Subject: [PATCH 16/45] head --- examples/tts/tts.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 17dc0eff2cd52..7f5a8a616a1bf 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -170,7 +170,7 @@ int main(int argc, char ** argv) { const float * embd = llama_get_embeddings(ctx_cts); - int n = 768*261; + int n = 1282*261; LOG("result:\n"); for (int i = 0; i < 10; ++i) { diff --git a/src/llama.cpp b/src/llama.cpp index c94cdd383d709..adecc4bffa9ac 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9631,8 +9631,11 @@ static bool llm_load_tensors( } // output - model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); + + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, llama_model_loader::TENSOR_NOT_REQUIRED); } break; default: throw std::runtime_error("unknown architecture"); @@ -17419,16 +17422,22 @@ struct llm_build_context { cur = inpL; - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output_no_bias", -1); - //cur = llm_build_norm(ctx0, cur, hparams, - // model.output_norm, NULL, - // LLM_NORM_RMS, cb, -1); - //cb(cur, "result_norm", 
-1); + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); - //// lm_head - //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - //cb(cur, "result_output", -1); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); ggml_build_forward_expand(gf, cur); @@ -18588,7 +18597,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); // TODO: TEMPORARY [OUTETTS] //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*768*sizeof(float)); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: From eb1b70f42a00e0c6e83ea3ee12889d8ccf743851 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:52:07 +0200 Subject: [PATCH 17/45] hann window --- examples/tts/convert_pt_to_hf.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 1 - src/llama.cpp | 12 ++++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 501fc4d6a6aa4..050a61fdc53b0 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -101,6 +101,9 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key.endswith("gamma"): new_key = new_key.replace("gamma", "gamma.weight") + if new_key == "head.istft.window": + new_key = "head.istft.window.weight" + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ca7bb40fef11b..93b70a1477e6a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -28,7 +28,6 @@ class TensorNameMap: "transformer.token_embeddings", # openelm "shared", # t5 "rwkv.embeddings", # rwkv - "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" # outetts ), # Token type embeddings diff --git a/src/llama.cpp b/src/llama.cpp index adecc4bffa9ac..54cc0ec606350 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3070,6 +3070,8 @@ struct llama_model { struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; + struct ggml_tensor * hann_window = nullptr; + // resnet 0 struct ggml_tensor * posnet_0_norm1 = nullptr; struct ggml_tensor * posnet_0_norm1_b = nullptr; @@ -5121,8 +5123,7 @@ struct llama_model_loader { void done_getting_tensors() const { if (n_created != n_tensors) { - // TODO: TEMPORARY DISABLED [OUTETTS] - //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); } } @@ -7461,6 +7462,7 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_HANN_WINDOW, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9634,8 +9636,10 @@ static bool llm_load_tensors( model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output_norm_b = 
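// [editor's note, not part of the patch] The head shapes are consistent with
// an ISTFT spectrogram head: the Hann window created below has 1280 taps (the
// FFT size), and the 1282-dim output is presumably 2*(1280/2 + 1) = 1282
// values per frame, i.e. one magnitude and one phase per spectrogram bin.
// That reading is inferred from the tensor shapes, not stated in the patches.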
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); - model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0); + + model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; default: throw std::runtime_error("unknown architecture"); From a1f08ad3385212a446de6ba33b2ef54c7c671a3d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:59:08 +0200 Subject: [PATCH 18/45] fix n_embd + remove llama.cpp hacks --- examples/tts/convert_pt_to_hf.py | 2 +- examples/tts/tts.cpp | 3 ++- src/llama.cpp | 39 ++++++++------------------------ 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 050a61fdc53b0..d066248798b41 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -152,7 +152,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "architectures": [ "OuteTTSVocoder" ], - "hidden_size": 512, + "hidden_size": 1282, "vocab_size": 4096, "n_head": 1, "layer_norm_epsilon": 1e-6, diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 7f5a8a616a1bf..684f6b2fbd8e0 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -168,9 +168,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - int n = 1282*261; + int n = n_embd*261; LOG("result:\n"); for (int i = 0; i < 10; ++i) { diff --git a/src/llama.cpp b/src/llama.cpp index 54cc0ec606350..841e9a491b2f6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9539,12 +9539,12 @@ static bool llm_load_tensors( } break; case LLM_ARCH_OUTETTS_VOC: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0); model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); @@ -9636,8 +9636,8 @@ static bool llm_load_tensors( model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0); - model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; @@ -17432,14 +17432,12 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, 
"result_norm", -1); // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_output", -1); + cb(cur, "result_embd", -1); printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); @@ -17732,8 +17730,7 @@ static struct ggml_cgraph * llama_build_graph( // add on pooling layer if (lctx.cparams.embeddings) { - // TODO: TEMPORARY DISABLED [OUTETTS] - //result = llm.append_pooling(result); + result = llm.append_pooling(result); } llm.free(); @@ -18221,13 +18218,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { } const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - - // TODO: TEMPORARY !!! [OUTETTS] -#if 0 const size_t new_size = (logits_size + embd_size) * sizeof(float); -#else - const size_t new_size = 1024*1024*32; -#endif // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -18501,14 +18492,9 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - struct ggml_tensor * res = nullptr; - struct ggml_tensor * embd = nullptr; - -// TODO: TEMPORARY DISABLED [OUTETTS] -if (model.arch != LLM_ARCH_OUTETTS_VOC) { // the output is always the last tensor in the graph - res = ggml_graph_node(gf, -1); - embd = ggml_graph_node(gf, -2); + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -18528,10 +18514,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } -} else { - res = nullptr; - embd = ggml_graph_node(gf, -1); -} + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); @@ -18599,9 +18582,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { if (n_outputs_new) { GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - // TODO: TEMPORARY [OUTETTS] - //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float)); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: From e728cfd297f2f67693f5d4f94c8f66f405988fd2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 12:35:47 +0200 Subject: [PATCH 19/45] compute hann window --- examples/tts/convert_pt_to_hf.py | 5 +---- examples/tts/tts.cpp | 15 +++++++++++++++ gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 4 ---- include/llama.h | 3 --- src/llama.cpp | 27 +++++++++++---------------- 6 files changed, 27 insertions(+), 30 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index d066248798b41..c4a1185a83d47 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -70,7 +70,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): # keep only what we need for inference if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \ not key.startswith('state_dict.backbone.') 
and \ - not key.startswith('state_dict.head.'): + not key.startswith('state_dict.head.out'): print('Skipping key: ', key) continue @@ -101,9 +101,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key.endswith("gamma"): new_key = new_key.replace("gamma", "gamma.weight") - if new_key == "head.istft.window": - new_key = "head.istft.window.weight" - size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 684f6b2fbd8e0..f402ba8a2be97 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -57,6 +57,16 @@ static void print_usage(int, char ** argv) { LOG("\n"); } +void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + int main(int argc, char ** argv) { common_params params; @@ -171,6 +181,11 @@ int main(int argc, char ** argv) { const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); + const int w = 1280; + std::vector hann(w); + fill_hann_window(hann.size(), true, hann.data()); + + int n = n_embd*261; LOG("result:\n"); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ea74354a43b8b..f1f44c7d2c77a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -387,7 +387,6 @@ class MODEL_TENSOR(IntEnum): POS_NET_ATTN_K = auto() POS_NET_ATTN_V = auto() POS_NET_ATTN_OUT = auto() - HANN_WINDOW = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -569,7 +568,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K: "pos_net.{bid}.attn_k", MODEL_TENSOR.POS_NET_ATTN_V: "pos_net.{bid}.attn_v", MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output", - MODEL_TENSOR.HANN_WINDOW: "hann_window", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1429,7 +1427,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K, MODEL_TENSOR.POS_NET_ATTN_V, MODEL_TENSOR.POS_NET_ATTN_OUT, - MODEL_TENSOR.HANN_WINDOW, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 93b70a1477e6a..5bf1f514a04f6 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -94,10 +94,6 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FACTORS_LONG: (), MODEL_TENSOR.ROPE_FACTORS_SHORT: (), - MODEL_TENSOR.HANN_WINDOW: ( - "head.istft.window", # outetts - ), - MODEL_TENSOR.CONV1D: ( "backbone.embed", # roberta ), diff --git a/include/llama.h b/include/llama.h index efbb27d21523a..a4abf395bcd93 100644 --- a/include/llama.h +++ b/include/llama.h @@ -482,9 +482,6 @@ extern "C" { // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); - // Get a llama model tensor - LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); - // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); diff --git a/src/llama.cpp b/src/llama.cpp index 841e9a491b2f6..07a00b2f63fb7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -627,7 +627,6 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, - LLM_TENSOR_HANN_WINDOW, }; static const std::map> LLM_TENSOR_NAMES = { @@ -1635,7 +1634,6 @@ static const std::map> 
LLM_TENSOR_N { LLM_TENSOR_POS_NET_ATTN_K, "pos_net.%d.attn_k" }, { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.%d.attn_v" }, { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.%d.attn_output" }, - { LLM_TENSOR_HANN_WINDOW, "hann_window" }, }, }, { @@ -3648,6 +3646,17 @@ static int llama_get_device_count(const llama_model & model) { return (int) model.devices.size(); } +static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) { + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; +} + template static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { ggml_init_params params = { @@ -7462,7 +7471,6 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_HANN_WINDOW, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9638,8 +9646,6 @@ static bool llm_load_tensors( model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0); model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); - - model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; default: throw std::runtime_error("unknown architecture"); @@ -21021,17 +21027,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return model->n_elements; } -struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; - }); - if (it == model->tensors_by_name.end()) { - return nullptr; - } - return it->second; -} - bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: return true; From 5a1c98e8d24018cf0c6f7746ded80151a23abf56 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 14:51:17 +0200 Subject: [PATCH 20/45] fft --- examples/tts/tts.cpp | 127 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 11 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index f402ba8a2be97..1fdf756d579f3 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -9,6 +9,7 @@ #include #include #include +#include // // Terminal utils @@ -67,6 +68,45 @@ void fill_hann_window(int length, bool periodic, float * output) { } } +// very poor-man fft +void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; + *real = cos(angle); + *imag = sin(angle); +} + +void irfft(int n, float * inp_cplx, float * out_real) { + int N = n / 2 + 1; + + std::vector real_input(N); + std::vector imag_input(N); + for (int i = 0; i < N; ++i) { + real_input[i] = inp_cplx[2 * i]; + imag_input[i] = inp_cplx[2 * i + 1]; + } + + std::vector real_output(n); + std::vector imag_output(n); + + for (int k = 0; k < n; ++k) { + real_output[k] = 0.0f; + imag_output[k] = 0.0f; + for (int m = 0; m < N; ++m) { + float twiddle_real; + float twiddle_imag; + + twiddle(&twiddle_real, &twiddle_imag, k * m, n); + + real_output[k] += real_input[m] * 
twiddle_real - imag_input[m] * twiddle_imag; + imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real; + } + } + + for (int i = 0; i < n; ++i) { + out_real[i] = real_output[i] / N; + } +} + int main(int argc, char ** argv) { common_params params; @@ -181,28 +221,93 @@ int main(int argc, char ** argv) { const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - const int w = 1280; - std::vector hann(w); + const int n = prompt_inp.size(); + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + + std::vector hann(n_fft); + fill_hann_window(hann.size(), true, hann.data()); + int n_spec = n_embd*n; - int n = n_embd*261; + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); - LOG("result:\n"); - for (int i = 0; i < 10; ++i) { - LOG("%8.3f ", embd[i]); + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n + l] = embd[l*n_embd + k]; + } } - LOG("\n"); - for (int i = n - 10; i < n; ++i) { - LOG("%8.3f ", embd[i]); + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n; ++l) { + float mag = E[(k )*n + l]; + float phi = E[(k + n_embd/2)*n + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n + l) + 0] = mag*cosf(phi); + S[2*(k*n + l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + } + } + + std::vector res(n*n_fft); + + const int n_thread = std::thread::hardware_concurrency(); + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + LOG("result (%d):\n", res.size()); + for (int i = 0; i < n_fft; ++i) { + LOG("%d - %8.5f\n", i, res[5*n_fft + i]); } LOG("\n"); double sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += embd[i]; + for (int i = 0; i < n_fft; ++i) { + sum += res[5*n_fft + i]; } LOG("sum: %f\n", sum); + { + LOG("result:\n"); + for (int i = 0; i < 10; ++i) { + LOG("%8.3f ", S[i]); + } + LOG("\n"); + for (int i = n_spec - 10; i < n_spec; ++i) { + LOG("%8.3f ", S[i]); + } + LOG("\n"); + double sum = 0.0; + for (int i = 0; i < n_spec; ++i) { + sum += S[i]; + } + LOG("sum: %f\n", sum); + } + fprintf(stderr, "\n"); llama_free(ctx_ttc); From e52797162e2f9797d00ec27aca4fe39e849236bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 16:35:10 +0200 Subject: [PATCH 21/45] spectrum processing --- examples/tts/tts.cpp | 143 ++++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 42 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 1fdf756d579f3..8eaf5a2623cc1 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -58,7 +58,7 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -void fill_hann_window(int length, bool periodic, float * output) { +static void fill_hann_window(int length, bool periodic, double * output) { int offset = -1; if (periodic) { offset = 0; @@ -69,31 +69,31 @@ void fill_hann_window(int length, bool periodic, float * output) { } // very poor-man fft -void twiddle(float * real, float * imag, int k, int N) { - float angle = 2 * M_PI * k / N; +static void twiddle(double * real, double * imag, int k, int N) { + double angle 
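// note: irfft() here reconstructs each time-domain frame by direct
// evaluation of out[k] = Re( sum_{m=0..N-1} X[m] * e^(i*2*pi*k*m/n) ) / N
// with N = n/2 + 1, i.e. an O(n*N) loop over the onesided spectrum rather
// than a true FFT; this patch promotes the accumulators from float to
// double so the ~641-term sums (n_fft = 1280) stay numerically accurate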
= 2 * M_PI * k / N; *real = cos(angle); *imag = sin(angle); } -void irfft(int n, float * inp_cplx, float * out_real) { +static void irfft(int n, const double * inp_cplx, double * out_real) { int N = n / 2 + 1; - std::vector real_input(N); - std::vector imag_input(N); + std::vector real_input(N); + std::vector imag_input(N); for (int i = 0; i < N; ++i) { real_input[i] = inp_cplx[2 * i]; imag_input[i] = inp_cplx[2 * i + 1]; } - std::vector real_output(n); - std::vector imag_output(n); + std::vector real_output(n); + std::vector imag_output(n); for (int k = 0; k < n; ++k) { real_output[k] = 0.0f; imag_output[k] = 0.0f; for (int m = 0; m < N; ++m) { - float twiddle_real; - float twiddle_imag; + double twiddle_real; + double twiddle_imag; twiddle(&twiddle_real, &twiddle_imag, k * m, n); @@ -107,6 +107,38 @@ void irfft(int n, float * inp_cplx, float * out_real) { } } +static void fold( + const std::vector & data, + int64_t output_size, + int64_t win_length, + int64_t hop_length, + int64_t pad, + std::vector& output +) { + int64_t output_height = output_size; + int64_t kernel_w = win_length; + int64_t stride_w = hop_length; + + int64_t width = output_size; + + output.resize(width, 0.0f); + + int64_t col_idx = 0; + for (int64_t w_col = 0; w_col < width; ++w_col) { + int64_t start = w_col * stride_w - pad; + int64_t end = start + kernel_w; + + for (int64_t w_im = start; w_im < end; ++w_im) { + if (w_im >= 0 && w_im < output_height) { + output[w_im] += data[col_idx]; + } + col_idx++; + } + } + + output.resize(output_size - 2 * pad); +} + int main(int argc, char ** argv) { common_params params; @@ -226,16 +258,17 @@ int main(int argc, char ** argv) { const int n_hop = 320; const int n_win = 1280; const int n_pad = (n_win - n_hop)/2; + const int n_out = (n - 1)*n_hop + n_win; - std::vector hann(n_fft); + std::vector hann(n_fft); fill_hann_window(hann.size(), true, hann.data()); int n_spec = n_embd*n; - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); for (int l = 0; l < n; ++l) { for (int k = 0; k < n_embd; ++k) { @@ -245,8 +278,8 @@ int main(int argc, char ** argv) { for (int k = 0; k < n_embd/2; ++k) { for (int l = 0; l < n; ++l) { - float mag = E[(k )*n + l]; - float phi = E[(k + n_embd/2)*n + l]; + double mag = E[(k )*n + l]; + double phi = E[(k + n_embd/2)*n + l]; mag = exp(mag); @@ -265,7 +298,8 @@ int main(int argc, char ** argv) { } } - std::vector res(n*n_fft); + std::vector res (n*n_fft); + std::vector hann2(n*n_fft); const int n_thread = std::thread::hardware_concurrency(); std::vector workers(n_thread); @@ -273,6 +307,10 @@ int main(int argc, char ** argv) { workers[i] = std::thread([&, i]() { for (int l = i; l < n; l += n_thread) { irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } } }); } @@ -280,33 +318,54 @@ int main(int argc, char ** argv) { workers[i].join(); } - LOG("result (%d):\n", res.size()); - for (int i = 0; i < n_fft; ++i) { - LOG("%d - %8.5f\n", i, res[5*n_fft + i]); + //LOG("result (%d):\n", res.size()); + //for (int i = 0; i < n_fft; ++i) { + // LOG("%d - %8.5f\n", i, res[5*n_fft + i]); + //} + //LOG("\n"); + //double sum = 0.0; + //for (int i = 0; i < n_fft; ++i) { + // sum += res[5*n_fft + i]; + //} + //LOG("sum: %f\n", sum); + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, 
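// note: this pair of fold() calls is standard overlap-add (WOLA)
// synthesis: the Hann-windowed iFFT frames are summed at hop
// n_hop = 320 samples, and dividing the folded signal by the identically
// folded hann^2 envelope (env) removes the window gain so the overlapping
// frames reconstruct the waveform without amplitude ripple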
n_hop, n_pad, env); + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; } - LOG("\n"); - double sum = 0.0; - for (int i = 0; i < n_fft; ++i) { - sum += res[5*n_fft + i]; - } - LOG("sum: %f\n", sum); - { - LOG("result:\n"); - for (int i = 0; i < 10; ++i) { - LOG("%8.3f ", S[i]); - } - LOG("\n"); - for (int i = n_spec - 10; i < n_spec; ++i) { - LOG("%8.3f ", S[i]); - } - LOG("\n"); - double sum = 0.0; - for (int i = 0; i < n_spec; ++i) { - sum += S[i]; - } - LOG("sum: %f\n", sum); - } + //LOG("audio (%d):\n", audio.size()); + //for (int i = 0; i < 1000; ++i) { + // LOG("%d: %8.5f\n", i, audio[i]); + //} + //LOG("\n"); + //double sum = 0.0; + //for (int i = 0; i < 1000; ++i) { + // sum += audio[i]; + //} + //LOG("sum: %f\n", sum); + + //{ + // LOG("result:\n"); + // for (int i = 0; i < 10; ++i) { + // LOG("%8.3f ", S[i]); + // } + // LOG("\n"); + // for (int i = n_spec - 10; i < n_spec; ++i) { + // LOG("%8.3f ", S[i]); + // } + // LOG("\n"); + // double sum = 0.0; + // for (int i = 0; i < n_spec; ++i) { + // sum += S[i]; + // } + // LOG("sum: %f\n", sum); + //} fprintf(stderr, "\n"); From 191da330fc2c597d7aed1152d389e7e9849d3252 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 16:50:40 +0200 Subject: [PATCH 22/45] clean-up --- examples/tts/tts.cpp | 304 +++++++++++++++++++++++-------------------- ggml/src/ggml.c | 4 - src/llama.cpp | 5 - 3 files changed, 166 insertions(+), 147 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8eaf5a2623cc1..a3b923332be9b 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -107,26 +107,29 @@ static void irfft(int n, const double * inp_cplx, double * out_real) { } } -static void fold( - const std::vector & data, - int64_t output_size, - int64_t win_length, - int64_t hop_length, - int64_t pad, - std::vector& output -) { - int64_t output_height = output_size; - int64_t kernel_w = win_length; - int64_t stride_w = hop_length; - - int64_t width = output_size; +// +// y = torch.nn.functional.fold( +// data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), +// )[:, 0, 0, pad:-pad] +// +// data.shape = torch.Size([1, 1280, 261]) +// output_size = 84480 +// win_length = 1280 +// hop_length = 320 +// pad = 480 +// +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { + int64_t output_height = n_out; + int64_t kernel_w = n_win; + int64_t stride_w = n_hop; + int64_t width = n_out; output.resize(width, 0.0f); int64_t col_idx = 0; for (int64_t w_col = 0; w_col < width; ++w_col) { - int64_t start = w_col * stride_w - pad; - int64_t end = start + kernel_w; + int64_t start = w_col * stride_w - n_pad; + int64_t end = start + kernel_w; for (int64_t w_im = start; w_im < end; ++w_im) { if (w_im >= 0 && w_im < output_height) { @@ -136,7 +139,129 @@ static void fold( } } - output.resize(output_size - 2 * pad); + output.resize(n_out - 2 * n_pad); +} + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream 
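// note: wav_header above is the standard 44-byte RIFF/WAVE header for
// mono 16-bit PCM; save_wav16() scales each sample by 32767 and clamps it
// to the int16 range, so out-of-range samples clip instead of wrapping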
file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); + return; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + file.close(); +} + +static std::vector embd_to_audio( + const float * embd, + const std::vector & codes, + const int n_embd, + const int n_thread) { + const int n = codes.size(); + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + const int n_out = (n - 1)*n_hop + n_win; + + std::vector hann(n_fft); + + fill_hann_window(hann.size(), true, hann.data()); + + int n_spec = n_embd*n; + + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n + l] = embd[l*n_embd + k]; + } + } + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n; ++l) { + double mag = E[(k )*n + l]; + double phi = E[(k + n_embd/2)*n + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n + l) + 0] = mag*cosf(phi); + S[2*(k*n + l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + } + } + + std::vector res (n*n_fft); + std::vector hann2(n*n_fft); + + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; + } + + return audio; } int main(int argc, char ** argv) { @@ -178,7 +303,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - std::vector prompt_inp = {198, 88225, 155856, 151669, 152205, + std::vector codes = {198, 88225, 155856, 151669, 152205, 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, @@ -215,31 +340,33 @@ int main(int argc, char ** argv) { 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; { - const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true); + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); LOG_INF("prompt: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size()); + LOG_INF("%s: prompt size: %d\n", __func__, (int) codes.size()); } // remove all non-audio tokens (i.e. 
< 151672 || > 155772) - prompt_inp.erase(std::remove_if(prompt_inp.begin(), prompt_inp.end(), [](llama_token t) { return t < 151672 || t > 155772; }), prompt_inp.end()); + codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end()); { - const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true); + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); LOG_INF("prompt audio: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size()); + LOG_INF("%s: prompt audio size: %d\n", __func__, (int) codes.size()); } - for (auto & token : prompt_inp) { + for (auto & token : codes) { token -= 151672; } - llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1); + const auto t_voc_start = ggml_time_us(); + + llama_batch batch = llama_batch_init(codes.size(), 0, 1); // evaluate the initial prompt - for (size_t i = 0; i < prompt_inp.size(); ++i) { - common_batch_add(batch, prompt_inp[i], i, { 0 }, true); // TODO: all logits? + for (size_t i = 0; i < codes.size(); ++i) { + common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? } - GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size()); + GGML_ASSERT(batch.n_tokens == (int) codes.size()); if (llama_decode(ctx_cts, batch) != 0) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -248,126 +375,27 @@ int main(int argc, char ** argv) { llama_synchronize(ctx_cts); - LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f); + + const auto t_spec_start = ggml_time_us(); const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - const int n = prompt_inp.size(); - const int n_fft = 1280; - const int n_hop = 320; - const int n_win = 1280; - const int n_pad = (n_win - n_hop)/2; - const int n_out = (n - 1)*n_hop + n_win; - - std::vector hann(n_fft); - - fill_hann_window(hann.size(), true, hann.data()); - - int n_spec = n_embd*n; - - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); - - for (int l = 0; l < n; ++l) { - for (int k = 0; k < n_embd; ++k) { - E[k*n + l] = embd[l*n_embd + k]; - } - } - - for (int k = 0; k < n_embd/2; ++k) { - for (int l = 0; l < n; ++l) { - double mag = E[(k )*n + l]; - double phi = E[(k + n_embd/2)*n + l]; - - mag = exp(mag); - - if (mag > 1e2) { - mag = 1e2; - } - S[2*(k*n + l) + 0] = mag*cosf(phi); - S[2*(k*n + l) + 1] = mag*sinf(phi); - } - } + // spectral operations + // TODO: not optimized at all + auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); - for (int l = 0; l < n; ++l) { - for (int k = 0; k < n_embd/2; ++k) { - ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; - ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; - } - } + const std::string fname = "output.wav"; - std::vector res (n*n_fft); - std::vector hann2(n*n_fft); + const int n_sr = 24000; // sampling rate - const int n_thread = std::thread::hardware_concurrency(); - std::vector workers(n_thread); - for (int i = 0; i < n_thread; ++i) { - workers[i] = std::thread([&, i]() { - for (int l = i; l < n; l += n_thread) { - irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); - for (int j = 0; j < n_fft; ++j) { - res [l*n_fft + j] *= hann[j]; - hann2[l*n_fft + j] = hann[j] * hann[j]; - } - } - }); - } - for (int i = 0; i < n_thread; ++i) { - workers[i].join(); - } + LOG_INF("%s: time for spectral ops: 
%.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f); + LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); - //LOG("result (%d):\n", res.size()); - //for (int i = 0; i < n_fft; ++i) { - // LOG("%d - %8.5f\n", i, res[5*n_fft + i]); - //} - //LOG("\n"); - //double sum = 0.0; - //for (int i = 0; i < n_fft; ++i) { - // sum += res[5*n_fft + i]; - //} - //LOG("sum: %f\n", sum); - - std::vector audio; - std::vector env; - - fold(res, n_out, n_win, n_hop, n_pad, audio); - fold(hann2, n_out, n_win, n_hop, n_pad, env); - - for (size_t i = 0; i < audio.size(); ++i) { - audio[i] /= env[i]; - } + save_wav16(fname, audio, n_sr); - //LOG("audio (%d):\n", audio.size()); - //for (int i = 0; i < 1000; ++i) { - // LOG("%d: %8.5f\n", i, audio[i]); - //} - //LOG("\n"); - //double sum = 0.0; - //for (int i = 0; i < 1000; ++i) { - // sum += audio[i]; - //} - //LOG("sum: %f\n", sum); - - //{ - // LOG("result:\n"); - // for (int i = 0; i < 10; ++i) { - // LOG("%8.3f ", S[i]); - // } - // LOG("\n"); - // for (int i = n_spec - 10; i < n_spec; ++i) { - // LOG("%8.3f ", S[i]); - // } - // LOG("\n"); - // double sum = 0.0; - // for (int i = 0; i < n_spec; ++i) { - // sum += S[i]; - // } - // LOG("sum: %f\n", sum); - //} - - fprintf(stderr, "\n"); + LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str()); llama_free(ctx_ttc); llama_free_model(model_ttc); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7c0159ab49c9f..2bbe5f48257b2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3846,10 +3846,6 @@ struct ggml_tensor * ggml_conv_1d( int d0) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] - printf("a: %lld %lld %lld %lld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); - printf("b: %lld %lld %lld %lld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]); - printf("im2col: %lld %lld %lld %lld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]); - struct ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] diff --git a/src/llama.cpp b/src/llama.cpp index 07a00b2f63fb7..6397decd70c73 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17234,9 +17234,6 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); - printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); - cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); @@ -17445,8 +17442,6 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); - ggml_build_forward_expand(gf, cur); return gf; From b9a011e1237a5196688e4c72ac858e61e10ffed4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:12:59 +0200 Subject: [PATCH 23/45] tts : receive input text and generate codes --- examples/tts/tts.cpp | 433 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 391 insertions(+), 42 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index a3b923332be9b..8ba13944f3cf5 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -264,6 +264,25 @@ static std::vector embd_to_audio( return audio; } +static void prompt_add(llama_tokens & prompt, llama_token token) { + 
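// note: these prompt helpers assemble the OuteTTS input, which follows
// the layout (see the voice_data example further down):
//   <|im_start|>\n
//   <|text_start|>word<|text_sep|>word<|text_sep|>...<|text_end|>\n
//   <|audio_start|>\n
//   word<|t_X.XX|><|code_start|><|c1|><|c2|>...<|code_end|>\n
// where <|t_X.XX|> encodes the word duration and each <|cN|> is a
// vocoder codebook id that is later decoded into a waveform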
prompt.push_back(token); +} + +static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) { + prompt.insert(prompt.end(), tokens.begin(), tokens.end()); +} + +static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) { + auto tmp = common_tokenize(model, txt, add_special, parse_special); + prompt_add(prompt, tmp); +} + +static void prompt_init(llama_tokens & prompt, const llama_model * model) { + prompt.clear(); + + prompt_add(prompt, model, "<|im_start|>\n", true, true); +} + int main(int argc, char ** argv) { common_params params; @@ -273,10 +292,18 @@ int main(int argc, char ** argv) { params.n_batch = 8192; params.n_ctx = 8192; + params.sampling.top_k = 128; + params.sampling.penalty_repeat = 1.1; + params.sampling.penalty_last_n = 8; + params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, }; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; } + const int n_parallel = params.n_parallel; + const int n_predict = params.n_predict; + common_init(); // init LLM @@ -301,48 +328,371 @@ int main(int argc, char ** argv) { model_cts = llama_init_cts.model; ctx_cts = llama_init_cts.context; + std::vector smpl(n_parallel); + for (int i = 0; i < n_parallel; ++i) { + params.sampling.no_perf = (i != 0); + params.sampling.seed = params.sampling.seed + 1; + + smpl[i] = common_sampler_init(model_ttc, params.sampling); + } + + LOG_INF("%s: loading done\n", __func__); + const auto t_main_start = ggml_time_us(); - std::vector codes = {198, 88225, 155856, 151669, 152205, - 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, - 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, - 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, - 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296, - 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690, - 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061, - 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670, - 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683, - 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908, - 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359, - 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424, - 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670, - 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729, - 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669, - 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670, - 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501, - 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242, - 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360, - 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055, - 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670, - 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441, - 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831, - 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133, - 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109, - 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055, - 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729, - 
151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337, - 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153, - 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365, - 153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218, - 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464, - 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855, - 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418, - 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; + std::vector codes; + + // process prompt and generate voice codes + { + LOG_INF("%s: constructing prompt ..\n", __func__); + + std::vector prompt_inp; + + prompt_init(prompt_inp, model_ttc); + + prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true); + + // TODO: not sure if this is correct + { + std::string prompt_user = params.prompt; + std::string prompt_clean = ""; + //string_replace_all(prompt_user, " ", "<|text_sep|>"); + for (auto & c : prompt_user) { + if (c == ' ') { + prompt_clean += "<|text_sep|>"; + } else { + if (isalpha(c)) { + c = tolower(c); + } else { + continue; + } + prompt_clean += c; + } + } + + prompt_add(prompt_inp, model_ttc, prompt_clean, false, true); + } + + prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true); + + // disabled to save time on tokenizing each time + // TODO: load voices from the json files +#if 0 + const std::string voice_data = R"(<|audio_start|> +the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> 
+is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> 
+really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)"; + + auto tmp = common_tokenize(model_ttc, voice_data, false, true); + printf("\n\n"); + for (int i = 0; i < tmp.size(); ++i) { + printf("%d, ", tmp[i]); + } + printf("\n\n"); +#else + prompt_add(prompt_inp, llama_tokens { + 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, + 152460, 153375, 151670, 198, 74455, 155808, 151669, 151799, + 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, + 151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040, + 153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691, + 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, + 152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198, + 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, + 152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267, + 153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179, + 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, + 152311, 151670, 198, 1499, 155791, 151669, 152276, 152454, + 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, + 153043, 152325, 153267, 152622, 151670, 198, 4250, 155797, + 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, + 152112, 153204, 151722, 152542, 151670, 198, 19789, 155796, + 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, + 152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224, + 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, + 152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733, + 152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589, + 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, + 153376, 152272, 152433, 152325, 151941, 151670, 198, 285, + 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, + 152474, 152680, 152157, 153255, 152324, 151682, 151670, 198, + 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, + 153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685, + 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, + 151902, 152720, 153377, 152027, 152378, 152821, 
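// note: this literal token array is the pre-tokenized speaker profile,
// i.e. the output of common_tokenize() on the voice_data string in the
// #if 0 block above, inlined so each run avoids re-tokenizing it; ids in
// the 151672..155772 range are the audio codes extracted further below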
153207, 153459, + 153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609, + 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, + 152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018, + 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, + 153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457, + 152393, 153112, 152595, 151670, 198, 19098, 155808, 151669, + 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, + 153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673, + 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, + 152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, + 152111, 152746, 152377, 153471, 152309, 151670, 198, 19016, + 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, + 152939, 152536, 152091, 151815, 152733, 151672, 151670, 198, + 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, + 153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670, + 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, + 152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600, + 151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847, + 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, + 152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250, + 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, + 153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106, + 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, + 152901, 152885, 152594, 153446, 153080, 151670, 198, 14689, + 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, + 151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384, + 153364, 153188, 153246, 151670, 198, 1055, 155779, 151669, + 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, + 155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046, + 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, + 153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133, + 152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581, + 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, + 152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409, + 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, + 152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230, + 151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435, + 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, + 151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715, + 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, + 152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285, + 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, + 151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271, + 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, + 152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180, + 151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287, + 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, + 152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384, + 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, + 152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676, + 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, + 153135, 152291, 153235, 
152143, 152583, 152402, 153483, 152678, + 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, + 152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957, + 151752, 152265, 153381, 152515, 151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, + 151792, 153409, 153327, 152990, 151670, 198, 275, 155781, + 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, + 151670, 198, 94273, 155799, 151669, 152953, 152938, 153427, + 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, + 152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326, + 152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268, + 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, + 152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444, + 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, + 152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349, + 151670,}); +#endif + + // print the prompt token-by-token + + LOG("\n"); + + for (auto id : prompt_inp) { + LOG("%s", common_token_to_piece(ctx_ttc, id).c_str()); + } + + LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size()); + + LOG("\n"); + + // create a llama_batch + // we use this object to submit token data for decoding + llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel); + + std::vector seq_ids(n_parallel, 0); + for (int32_t i = 0; i < n_parallel; ++i) { + seq_ids[i] = i; + } + + // evaluate the initial prompt + for (size_t i = 0; i < prompt_inp.size(); ++i) { + common_batch_add(batch, prompt_inp[i], i, seq_ids, false); + } + GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size()); + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx_ttc, batch) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + if (n_parallel > 1) { + LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + llama_synchronize(ctx_ttc); + + LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + + const auto t_dec_start = ggml_time_us(); + + // main loop + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_past = batch.n_tokens; + int n_decode = 0; + + while (n_decode <= n_predict) { + // prepare the next batch + common_batch_clear(batch); + + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]); + + common_sampler_accept(smpl[i], new_token_id, true); + + codes.push_back(new_token_id); + + const auto * cands = common_sampler_get_candidates(smpl[i]); + + // is it an end of generation? 
-> mark the stream as finished + if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) { + std::string reason; + if (llama_token_is_eog(model_ttc, new_token_id)) { + reason = "eos"; + } else { + reason = "n_predict"; + } + + i_batch[i] = -1; + + LOG("\n"); + if (n_parallel > 1) { + LOG_CNT("\n"); + LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str()); + } + + continue; + } + + { + const float p = cands->data[cands->selected].p; + + const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size())))); + + LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m"); + //LOG_CNT("%d", i); + } + + i_batch[i] = batch.n_tokens; + + // push this new token for next evaluation + common_batch_add(batch, new_token_id, n_past, { i }, true); + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_decode += 1; + n_past += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx_ttc, batch)) { + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + } + + llama_batch_free(batch); + + LOG("\n"); + LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f); + } + + //std::vector codes = {198, 88225, 155856, 151669, 152205, + // 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, + // 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, + // 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, + // 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296, + // 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690, + // 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061, + // 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670, + // 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683, + // 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908, + // 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359, + // 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424, + // 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670, + // 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729, + // 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669, + // 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670, + // 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501, + // 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242, + // 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360, + // 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055, + // 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670, + // 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441, + // 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831, + // 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133, + // 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109, + // 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055, + // 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729, + // 151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337, + // 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153, + // 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365, + // 
153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218, + // 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464, + // 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855, + // 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418, + // 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; { const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); - LOG_INF("prompt: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt size: %d\n", __func__, (int) codes.size()); + + LOG("\n"); + LOG_INF("codes: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size()); } // remove all non-audio tokens (i.e. < 151672 || > 155772) @@ -350,8 +700,8 @@ int main(int argc, char ** argv) { { const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); - LOG_INF("prompt audio: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt audio size: %d\n", __func__, (int) codes.size()); + LOG_INF("codes audio: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size()); } for (auto & token : codes) { @@ -362,7 +712,6 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(codes.size(), 0, 1); - // evaluate the initial prompt for (size_t i = 0; i < codes.size(); ++i) { common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? } @@ -379,11 +728,11 @@ int main(int argc, char ** argv) { const auto t_spec_start = ggml_time_us(); + // spectral operations + // TODO: not optimized at all const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - // spectral operations - // TODO: not optimized at all auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); const std::string fname = "output.wav"; From db613915de3cc9e982b28294bf596a56a935c4ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:20:17 +0200 Subject: [PATCH 24/45] clip : fix new conv name --- examples/llava/clip.cpp | 6 +++--- examples/tts/tts.cpp | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ba28c07c6aeec..463b7c865b90c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -896,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] @@ -944,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_2 { // stride = 2 - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm @@ -1005,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = 
ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8ba13944f3cf5..e2306a5a8085b 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -651,6 +651,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f); } + common_perf_print(ctx_ttc, smpl[0]); + //std::vector codes = {198, 88225, 155856, 151669, 152205, // 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, // 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, From 8329e850cc17491c895a19a51a1c160539d879d0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:43:02 +0200 Subject: [PATCH 25/45] tts : minor fix --- examples/tts/convert_pt_to_hf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index c4a1185a83d47..62dcc05efc5c2 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -11,9 +11,6 @@ from safetensors.torch import save_file -# change path to script dir -os.chdir(os.path.dirname(os.path.abspath(__file__))) - # default model_path = './model.pt'; From d4fa34bdd4ec91f5f86477f3dff55e1c8dafae53 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 19:00:03 +0200 Subject: [PATCH 26/45] tts : add header + minor fixes ggml-ci --- convert_hf_to_gguf.py | 2 +- examples/tts/tts.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a86490831fdbd..c0ab0f955a496 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -297,7 +297,7 @@ def prepare_tensors(self): for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): # TODO: why do we squeeze here? 
-                #data = data_torch.squeeze().numpy()
+                # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()

                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index e2306a5a8085b..33ecb8aafe0f3 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -6,6 +6,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -354,9 +355,9 @@ int main(int argc, char ** argv) {
     // TODO: not sure if this is correct
     {
+        std::string prompt_clean;
         std::string prompt_user = params.prompt;
-        std::string prompt_clean = "";
-        //string_replace_all(prompt_user, " ", "<|text_sep|>");
+
         for (auto & c : prompt_user) {
             if (c == ' ') {
                 prompt_clean += "<|text_sep|>";

From 2221e54278eff46b6e8d5e14c1dfcd8074a27a11 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 20:31:20 +0200
Subject: [PATCH 27/45] tts : add mathematical constant

ggml-ci
---
 examples/tts/tts.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 33ecb8aafe0f3..c174efe30a524 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -4,6 +4,8 @@
 #include "log.h"
 #include "llama.h"

+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
 #include
 #include
 #include

From 906a0edb5aa1a9c6700e71b5379a1b13e0900363 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 21:33:51 +0200
Subject: [PATCH 28/45] tts : fix sampling + cut initial noise

---
 examples/tts/tts.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index c174efe30a524..b9b2e7f9e21a6 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -295,7 +295,7 @@ int main(int argc, char ** argv) {
     params.n_batch = 8192;
     params.n_ctx   = 8192;

-    params.sampling.top_k = 128;
+    params.sampling.top_k = 4;
     params.sampling.penalty_repeat = 1.1;
     params.sampling.penalty_last_n = 8;
     params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, };
@@ -744,6 +744,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const int n_sr = 24000; // sampling rate

+    // zero out first 0.25 seconds
+    for (int i = 0; i < 24000/4; ++i) {
+        audio[i] = 0.0f;
+    }
+
     LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
     LOG_INF("%s: total time:             %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
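Note on the noise cut in PATCH 28: 24000/4 works out to 6000 samples, i.e. 0.25 s at the
24 kHz output rate (n_sr = 24000 in the hunk above). A minimal standalone sketch with the
duration made explicit; the cut_initial_noise helper and its cut_t parameter are
illustrative, not part of tts.cpp:

    #include <algorithm>
    #include <vector>

    // zero out the first cut_t seconds of audio at sample rate n_sr;
    // with n_sr = 24000 and cut_t = 0.25f this clears the same 6000 samples as the patch
    static void cut_initial_noise(std::vector<float> & audio, int n_sr, float cut_t = 0.25f) {
        const size_t n_cut = std::min(audio.size(), (size_t) (cut_t * n_sr));
        std::fill(audio.begin(), audio.begin() + n_cut, 0.0f);
    }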
From 1d7c27ca93aa2e016ea754b6f230d1d5015d9613 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 21:42:53 +0200
Subject: [PATCH 29/45] tts : fixes

---
 examples/tts/tts.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index b9b2e7f9e21a6..cf04c70607be4 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -135,7 +135,7 @@ static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win,
         int64_t end = start + kernel_w;

         for (int64_t w_im = start; w_im < end; ++w_im) {
-            if (w_im >= 0 && w_im < output_height) {
+            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
                 output[w_im] += data[col_idx];
             }
             col_idx++;
@@ -291,7 +291,7 @@ int main(int argc, char ** argv) {

     params.prompt = "";

-    params.n_predict = 1024;
+    params.n_predict = 4096;
     params.n_batch   = 8192;
     params.n_ctx     = 8192;
     params.sampling.top_k = 4;
@@ -364,7 +364,7 @@ int main(int argc, char ** argv) {
             if (c == ' ') {
                 prompt_clean += "<|text_sep|>";
             } else {
-                if (isalpha(c)) {
+                if (isalpha(c) || isdigit(c)) {
                     c = tolower(c);
                 } else {
                     continue;

From 3d54be4d840b623155787d9b5fc98941a2e82bd5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 13:21:36 +0200
Subject: [PATCH 30/45] tts : update default samplers

ggml-ci
---
 examples/tts/tts.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index cf04c70607be4..82bb6d724184a 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -296,9 +296,7 @@ int main(int argc, char ** argv) {
     params.n_ctx     = 8192;

     params.sampling.top_k = 4;
-    params.sampling.penalty_repeat = 1.1;
-    params.sampling.penalty_last_n = 8;
-    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, };
+    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };

     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
         return 1;
     }

From befdcd249272313d654189b124b81d54532d4083 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 13:37:36 +0200
Subject: [PATCH 31/45] tts : text pre-processing

---
 examples/tts/tts.cpp | 165 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 146 insertions(+), 19 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 82bb6d724184a..57956e5a17075 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -7,12 +7,14 @@
 #include
-#include
 #include
-#include
-#include
 #include
+#include
+#include
+#include
 #include
+#include

 //
 // Terminal utils
@@ -267,6 +269,143 @@ static std::vector<float> embd_to_audio(
     return audio;
 }

+static const std::map<int, std::string> ones = {
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words
+static std::string convert_less_than_thousand(int num) {
+    std::string result;
+
+    if (num >= 100) {
+        result += ones.at(num / 100) + " hundred ";
+        num %= 100;
+    }
+
+    if (num >= 20) {
+        result += tens.at(num / 10);
+        if (num % 10 > 0) {
+            result += "-" + ones.at(num % 10);
+        }
+    } else if (num > 0) {
+        result += ones.at(num);
+    }
+
+    return result;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    try {
+        size_t decimal_pos = number_str.find('.');
+        std::string integer_part = number_str.substr(0, decimal_pos);
+
+        int int_number = std::stoi(integer_part);
+        std::string result;
+
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            if (int_number >= 1000000000) {
+                int billions = int_number / 1000000000;
+                result += convert_less_than_thousand(billions) + " billion ";
+                int_number %= 1000000000;
+            }
+
+            if (int_number >= 1000000) {
+                int millions = int_number / 1000000;
+                result += convert_less_than_thousand(millions) + " million ";
+                int_number %= 1000000;
+            }
+
+            if (int_number >= 1000) {
+                int thousands = int_number / 1000;
+                result += convert_less_than_thousand(thousands) + " thousand ";
+                int_number %= 1000;
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand(int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            std::string decimal_part =
number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception& e) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text) {
+
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token same as in line 365
+
+        for (auto & c : prompt_user) {
+            if (c == ' ') {
+                prompt_clean += "<|text_sep|>";
+    */
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
 static void prompt_add(llama_tokens & prompt, llama_token token) {
     prompt.push_back(token);
 }
@@ -353,23 +492,11 @@ int main(int argc, char ** argv) {

     prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);

-    // TODO: not sure if this is correct
+    // convert the input text into the necessary format expected by OuteTTS
     {
-        std::string prompt_clean;
-        std::string prompt_user = params.prompt;
-
-        for (auto & c : prompt_user) {
-            if (c == ' ') {
-                prompt_clean += "<|text_sep|>";
-            } else {
-                if (isalpha(c) || isdigit(c)) {
-                    c = tolower(c);
-                } else {
-                    continue;
-                }
-                prompt_clean += c;
-            }
-        }
+        std::string prompt_clean = process_text(params.prompt);
+
+        LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());

         prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
     }
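A quick sanity check of PATCH 31's pipeline, traced by hand through the regex passes; this
driver is illustrative only and assumes process_text() from above is visible in the
translation unit:

    #include <cstdio>
    #include <string>

    int main() {
        // "3.50" -> "three point five zero"; punctuation is stripped; spaces become <|text_sep|>
        const std::string out = process_text("Hello world, it costs $3.50!");

        // expected:
        // hello<|text_sep|>world<|text_sep|>it<|text_sep|>costs<|text_sep|>three<|text_sep|>point<|text_sep|>five<|text_sep|>zero
        printf("%s\n", out.c_str());

        return 0;
    }

From e70f140c04bcd94696ab35e48df6062148d5b0ca Mon Sep 17 00:00:00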
2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 13:51:09 +0200 Subject: [PATCH 32/45] tts : outetts-voc -> wavtokenizer-dec --- convert_hf_to_gguf.py | 6 +- examples/tts/convert_pt_to_hf.py | 4 +- gguf-py/gguf/constants.py | 214 +++++++++++++++---------------- gguf-py/gguf/tensor_mapping.py | 36 +++--- src/llama.cpp | 136 ++++++++++---------- 5 files changed, 198 insertions(+), 198 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c0ab0f955a496..8fc4c4f5655f4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2032,9 +2032,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data -@Model.register("OuteTTSVocoder") -class OuteTTSVocoderModel(Model): - model_arch = gguf.MODEL_ARCH.OUTETTS_VOC +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 62dcc05efc5c2..8d68290e2beb3 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -1,5 +1,5 @@ # convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format -# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTSS vocoder +# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder # # TODO: this script is LLM-generated and probably very inefficient and should be rewritten @@ -144,7 +144,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): config = { "architectures": [ - "OuteTTSVocoder" + "WavTokenizerDec" ], "hidden_size": 1282, "vocab_size": 4096, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f1f44c7d2c77a..af2a4f4f4d15a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -209,59 +209,59 @@ class GGUFType: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() - JINA_BERT_V2 = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - QWEN2VL = auto() - PHI2 = auto() - PHI3 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - MINICPM3 = auto() - GEMMA = auto() - GEMMA2 = auto() - STARCODER2 = auto() - RWKV6 = auto() - MAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() - DBRX = auto() - OLMO = auto() - OLMO2 = auto() - OLMOE = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK = auto() - DEEPSEEK2 = auto() - CHATGLM = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - NEMOTRON = auto() - EXAONE = auto() - GRANITE = auto() - GRANITE_MOE = auto() - CHAMELEON = auto() - OUTETTS_VOC = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + JINA_BERT_V2 = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + QWEN2VL = auto() + PHI2 = auto() + PHI3 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + MINICPM3 = 
auto() + GEMMA = auto() + GEMMA2 = auto() + STARCODER2 = auto() + RWKV6 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + DBRX = auto() + OLMO = auto() + OLMO2 = auto() + OLMOE = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK = auto() + DEEPSEEK2 = auto() + CHATGLM = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + NEMOTRON = auto() + EXAONE = auto() + GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() + WAVTOKENIZER_DEC = auto() class MODEL_TENSOR(IntEnum): @@ -390,59 +390,59 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.QWEN2VL: "qwen2vl", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.MINICPM3: "minicpm3", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV6: "rwkv6", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO2: "olmo2", - MODEL_ARCH.OLMOE: "olmoe", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK: "deepseek", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.NEMOTRON: "nemotron", - MODEL_ARCH.EXAONE: "exaone", - MODEL_ARCH.GRANITE: "granite", - MODEL_ARCH.GRANITE_MOE: "granitemoe", - MODEL_ARCH.CHAMELEON: "chameleon", - MODEL_ARCH.OUTETTS_VOC: "outetts-voc", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", + MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.OPENELM: "openelm", + 
MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1406,7 +1406,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - MODEL_ARCH.OUTETTS_VOC: [ + MODEL_ARCH.WAVTOKENIZER_DEC: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.CONV1D, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5bf1f514a04f6..296f1ca054cf2 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -42,7 +42,7 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv - "backbone.norm", # outetts + "backbone.norm", # wavtokenizer ), # Position embeddings @@ -61,7 +61,7 @@ class TensorNameMap: "lm_head.linear", # phi2 "output_layer", # chatglm "head", # rwkv - "head.out", # outetts + "head.out", # wavtokenizer ), # Output norm @@ -82,7 +82,7 @@ class TensorNameMap: "transformer.norm", # openelm "model.norm", # nemotron "rwkv.ln_out", # rwkv - "backbone.final_layer_norm", # outetts + "backbone.final_layer_norm", # wavtokenizer ), # Rope frequencies @@ -705,63 +705,63 @@ class TensorNameMap: ############################################################################# MODEL_TENSOR.CONV_NEXT_DW: ( - "backbone.convnext.{bid}.dwconv", # outetts + "backbone.convnext.{bid}.dwconv", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_NORM: ( - "backbone.convnext.{bid}.norm", # outetts + "backbone.convnext.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_PW1: ( - "backbone.convnext.{bid}.pwconv1", # outetts + "backbone.convnext.{bid}.pwconv1", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_PW2: ( - "backbone.convnext.{bid}.pwconv2", # outetts + "backbone.convnext.{bid}.pwconv2", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_GAMMA: ( - "backbone.convnext.{bid}.gamma", # outetts + "backbone.convnext.{bid}.gamma", # wavtokenizer ), MODEL_TENSOR.POS_NET_CONV1: ( - "backbone.pos_net.{bid}.conv1", # outetts + "backbone.pos_net.{bid}.conv1", # wavtokenizer ), MODEL_TENSOR.POS_NET_CONV2: ( - "backbone.pos_net.{bid}.conv2", # outetts + "backbone.pos_net.{bid}.conv2", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM: ( - "backbone.pos_net.{bid}.norm", # outetts + "backbone.pos_net.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM1: ( - "backbone.pos_net.{bid}.norm1", # outetts + "backbone.pos_net.{bid}.norm1", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM2: ( - "backbone.pos_net.{bid}.norm2", # outetts + "backbone.pos_net.{bid}.norm2", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_NORM: ( - "backbone.pos_net.{bid}.norm", # outetts + "backbone.pos_net.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_Q: ( - "backbone.pos_net.{bid}.q", # outetts + "backbone.pos_net.{bid}.q", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_K: ( - "backbone.pos_net.{bid}.k", # outetts + "backbone.pos_net.{bid}.k", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_V: ( - "backbone.pos_net.{bid}.v", # outetts + "backbone.pos_net.{bid}.v", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_OUT: ( - "backbone.pos_net.{bid}.proj_out", # outetts + 
"backbone.pos_net.{bid}.proj_out", # wavtokenizer ), } diff --git a/src/llama.cpp b/src/llama.cpp index 6397decd70c73..9aa5ad17a5bdb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -197,65 +197,65 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, - LLM_ARCH_OUTETTS_VOC, + LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GROK, "grok" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, - { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_QWEN2MOE, "qwen2moe" }, - { LLM_ARCH_QWEN2VL, "qwen2vl" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PHI3, "phi3" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_MINICPM3, "minicpm3" }, - { LLM_ARCH_GEMMA, "gemma" }, - { LLM_ARCH_GEMMA2, "gemma2" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, - { LLM_ARCH_COMMAND_R, "command-r" }, - { LLM_ARCH_DBRX, "dbrx" }, - { LLM_ARCH_OLMO, "olmo" }, - { LLM_ARCH_OLMO2, "olmo2" }, - { LLM_ARCH_OLMOE, "olmoe" }, - { LLM_ARCH_OPENELM, "openelm" }, - { LLM_ARCH_ARCTIC, "arctic" }, - { LLM_ARCH_DEEPSEEK, "deepseek" }, - { LLM_ARCH_DEEPSEEK2, "deepseek2" }, - { LLM_ARCH_CHATGLM, "chatglm" }, - { LLM_ARCH_BITNET, "bitnet" }, - { LLM_ARCH_T5, "t5" }, - { LLM_ARCH_T5ENCODER, "t5encoder" }, - { LLM_ARCH_JAIS, "jais" }, - { LLM_ARCH_NEMOTRON, "nemotron" }, - { LLM_ARCH_EXAONE, "exaone" }, - { LLM_ARCH_RWKV6, "rwkv6" }, - { LLM_ARCH_GRANITE, "granite" }, - { LLM_ARCH_GRANITE_MOE, "granitemoe" }, - { LLM_ARCH_CHAMELEON, "chameleon" }, - { LLM_ARCH_OUTETTS_VOC, "outetts-voc" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_QWEN2MOE, "qwen2moe" }, + { LLM_ARCH_QWEN2VL, "qwen2vl" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, + { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, + { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMO2, "olmo2" }, + { LLM_ARCH_OLMOE, "olmoe" }, + { LLM_ARCH_OPENELM, "openelm" }, + { LLM_ARCH_ARCTIC, "arctic" }, + { 
LLM_ARCH_DEEPSEEK, "deepseek" }, + { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_CHATGLM, "chatglm" }, + { LLM_ARCH_BITNET, "bitnet" }, + { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, + { LLM_ARCH_JAIS, "jais" }, + { LLM_ARCH_NEMOTRON, "nemotron" }, + { LLM_ARCH_EXAONE, "exaone" }, + { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, + { LLM_ARCH_GRANITE_MOE, "granitemoe" }, + { LLM_ARCH_CHAMELEON, "chameleon" }, + { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -1612,7 +1612,7 @@ static const std::map> LLM_TENSOR_N }, }, { - LLM_ARCH_OUTETTS_VOC, + LLM_ARCH_WAVTOKENIZER_DEC, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, @@ -3063,7 +3063,7 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - // outetts vocoder + // wavtokenizer decoder // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; @@ -6443,7 +6443,7 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); } break; @@ -9545,7 +9545,7 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); @@ -16142,7 +16142,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_encoder() { + struct ggml_cgraph * build_t5_enc() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -16274,7 +16274,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * build_t5_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -17224,7 +17224,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_outetts_voc() { + struct ggml_cgraph * build_wavtokenizer_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_tensor * cur; @@ -17692,14 +17692,14 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_T5: { if (lctx.is_encoding) { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } else { - result = llm.build_t5_decoder(); + result = llm.build_t5_dec(); } } break; case LLM_ARCH_T5ENCODER: { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } break; case LLM_ARCH_JAIS: { @@ -17721,9 +17721,9 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_chameleon(); } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { - result = llm.build_outetts_voc(); + result = llm.build_wavtokenizer_dec(); } break; default: GGML_ABORT("fatal error"); @@ -20904,7 +20904,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head 
values From c096bbd8ddfc778738498bd005e8ada156bd9406 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 15:07:38 +0200 Subject: [PATCH 33/45] tts : remove hardcoded constants ggml-ci --- convert_hf_to_gguf.py | 6 +- examples/tts/convert_pt_to_hf.py | 4 + gguf-py/gguf/constants.py | 3 + gguf-py/gguf/gguf_writer.py | 12 +++ src/llama.cpp | 151 ++++++++++++++++++------------- 5 files changed, 110 insertions(+), 66 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8fc4c4f5655f4..bf6da96017058 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2055,7 +2055,11 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_posnet_length (self.hparams["n_embd_posnet"]) + self.gguf_writer.add_convnext_length (self.hparams["n_embd_convnext"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) @Model.register("Qwen2MoeForCausalLM") diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 8d68290e2beb3..4938515af62ad 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -147,6 +147,10 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "WavTokenizerDec" ], "hidden_size": 1282, + "n_embd_features": 512, + "n_embd_posnet": 768, + "n_embd_convnext": 768, + "n_ff": 2304, "vocab_size": 4096, "n_head": 1, "layer_norm_epsilon": 1e-6, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index af2a4f4f4d15a..e8a31f1144434 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,9 @@ class LLM: VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" + POSNET_LENGTH = "{arch}.posnet_length" + CONVNEXT_LENGTH = "{arch}.convnext_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 65a64e10dd33f..8266ba99fd06a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -631,6 +631,18 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_features_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) + + def add_posnet_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.POSNET_LENGTH.format(arch=self.arch), length) + + def add_convnext_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.CONVNEXT_LENGTH.format(arch=self.arch), length) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/src/llama.cpp b/src/llama.cpp index 9aa5ad17a5bdb..f0c4b5e624066 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -77,6 +77,7 @@ #endif // bump if necessary +#define LLAMA_MAX_EMBD 8 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -275,6 +276,9 @@ enum llm_kv { 
LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, + LLM_KV_FEATURES_LENGTH, + LLM_KV_POSNET_LENGTH, + LLM_KV_CONVNEXT_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, @@ -392,6 +396,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, + { LLM_KV_POSNET_LENGTH, "%s.posnet_length" }, + { LLM_KV_CONVNEXT_LENGTH, "%s.convnext_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, @@ -2545,6 +2552,11 @@ struct llama_hparams { uint32_t n_vocab_type = 0; // for BERT-style token types uint32_t n_rel_attn_bkts = 0; + // for WavTokenizer + uint32_t n_embd_features = 0; + uint32_t n_embd_posnet = 0; + uint32_t n_embd_convnext = 0; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; @@ -5684,6 +5696,12 @@ static void llm_load_hparams( ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) { + ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); + ml.get_key(LLM_KV_POSNET_LENGTH, hparams.n_embd_posnet); + ml.get_key(LLM_KV_CONVNEXT_LENGTH, hparams.n_embd_convnext); + } + GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); if (hparams.n_expert > 0) { @@ -5692,7 +5710,7 @@ static void llm_load_hparams( GGML_ASSERT(hparams.n_expert_used == 0); } - // zero-out the per-layer hparams + // zero-out the array hparams std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); @@ -7577,7 +7595,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_IM2COL: { - int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; @@ -9547,104 +9565,107 @@ static bool llm_load_tensors( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); + const int64_t n_embd_features = hparams.n_embd_features; + const int64_t n_embd_posnet = hparams.n_embd_posnet; + const int64_t n_embd_convnext = hparams.n_embd_convnext; + + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd_features, n_vocab}, 0); - model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); - model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd_posnet}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {n_embd_posnet}, 0); - 
model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {768}, 0); + model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {n_embd_posnet}, 0); + model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, 768, 768}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {768}, 0); + model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {768}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {768}, 0); + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {n_embd_posnet}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {n_embd_posnet}, 0); - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {768}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {768}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {n_embd_posnet}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, 768, 768}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {768}, 0); + model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {768}, 0); - model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {768}, 0); + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {n_embd_posnet}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {n_embd_posnet}, 0); - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {768}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {768}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {n_embd_posnet}, 0); + 
model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {768}, 0); + model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {768}, 0); + model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {768}, 0); + model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {768}, 0); + model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {n_embd_posnet}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {768}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {768}, 0); + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {n_embd_posnet}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, 768, 768}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {768}, 0); + model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {768}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {768}, 0); + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {n_embd_posnet}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, 768, 768}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {768}, 0); + model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {n_embd_posnet}, 0); - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {768}, 0); - 
model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {768}, 0); + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {n_embd_posnet}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, 768, 768}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {768}, 0); + model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {768}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {768}, 0); + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {n_embd_posnet}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {n_embd_posnet}, 0); - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {n_embd_posnet}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {n_embd_posnet}, 0); for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, 768}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {768}, 0); + layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {n_embd_convnext}, 0); - layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {768}, 0); - layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {768}, 0); + layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); + layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); - // TODO: n_ff - layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {768, 2304}, 0); - layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {2304}, 0); + layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {n_embd_convnext, n_ff}, 0); + layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {n_ff}, 0); - layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {2304, 768}, 0); - layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {768}, 0); + layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {n_ff, n_embd_convnext}, 0); + layer.convnext_pw2_b = 
create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {n_embd_convnext}, 0);

-                    layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {768}, 0);
+                    layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {n_embd_convnext}, 0);
                 }

                 // output
-                model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
-                model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {768}, 0);
+                model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_convnext}, 0);
+                model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd_convnext}, 0);

-                model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0);
+                model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd_convnext, n_embd}, 0);
                 model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
             } break;
         default:
@@ -17317,7 +17338,7 @@ struct llm_build_context {

                 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

-                kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(768)), 0.0f);
+                kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(model.hparams.n_embd_posnet)), 0.0f);

                 cur = ggml_mul_mat(ctx0, kq, v);
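The soft-max change above keeps the standard scaled dot-product attention factor 1/sqrt(d)
but derives d from the loaded hyperparameters instead of the literal 768. For the
WavTokenizer checkpoint converted earlier (n_embd_posnet = 768) the two are numerically
identical: 1/sqrt(768) is approximately 0.03608. A self-contained check (names are
illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_embd_posnet = 768; // PosNet width written by the converter for this model
        const float scale = 1.0f / sqrtf((float) n_embd_posnet);
        printf("attn scale = %.5f\n", scale); // prints: attn scale = 0.03608
        return 0;
    }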
"n_head": 1, "layer_norm_epsilon": 1e-6, + "group_norm_epsilon": 1e-6, + "group_norm_groups": 32, "max_position_embeddings": 8192, # ? "num_hidden_layers": 12 } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e8a31f1144434..139d0d15f7e06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -125,6 +125,8 @@ class Attention: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8266ba99fd06a..5bb2a8af87e25 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -631,9 +631,6 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_embedding_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_features_length(self, length: int) -> None: self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) @@ -739,6 +736,12 @@ def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_group_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value) + + def add_group_norm_groups(self, value: int) -> None: + self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py index 762067814224e..f04d5acce2793 100755 --- a/gguf-py/tests/test_quants.py +++ b/gguf-py/tests/test_quants.py @@ -136,7 +136,7 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") sum_diff_bits = np.sum(diff_bits) - logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)") + logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)") return False diff --git a/src/llama.cpp b/src/llama.cpp index f0c4b5e624066..4c298d4062cc3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -310,6 +310,8 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + LLM_KV_ATTENTION_GROUPNORM_EPS, + LLM_KV_ATTENTION_GROUPNORM_GROUPS, LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK, @@ -430,6 +432,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { 
LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, @@ -2571,6 +2575,9 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float f_norm_group_eps; + + uint32_t n_norm_groups; float f_attn_logit_softcapping = 50.0f; float f_final_logit_softcapping = 30.0f; @@ -6463,7 +6470,9 @@ static void llm_load_hparams( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); } break; default: (void)0; } @@ -9575,79 +9584,79 @@ static bool llm_load_tensors( model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {n_embd_posnet}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, n_embd_posnet}, 0); - model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {n_embd_posnet}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {1, n_embd_posnet}, 0); + model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {1, n_embd_posnet}, 0); model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {n_embd_posnet}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {1, n_embd_posnet}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {1, n_embd_posnet}, 0); model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {1, n_embd_posnet}, 0); - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {n_embd_posnet}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {1, n_embd_posnet}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {1, n_embd_posnet}, 0); model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {n_embd_posnet}, 0); - model.posnet_1_norm2_b = 
create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {1, n_embd_posnet}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {1, n_embd_posnet}, 0); model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {1, n_embd_posnet}, 0); - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {1, n_embd_posnet}, 0); + model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {1, n_embd_posnet}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {n_embd_posnet}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {1, n_embd_posnet}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {1, n_embd_posnet}, 0); model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {n_embd_posnet}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {1, n_embd_posnet}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {1, 
n_embd_posnet}, 0); model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {1, n_embd_posnet}, 0); - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {n_embd_posnet}, 0); - model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {1, n_embd_posnet}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {1, n_embd_posnet}, 0); model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {n_embd_posnet}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {1, n_embd_posnet}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {1, n_embd_posnet}, 0); model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {1, n_embd_posnet}, 0); - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {n_embd_posnet}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {n_embd_posnet}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {1, n_embd_posnet}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {1, n_embd_posnet}, 0); for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {n_embd_convnext}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {1, n_embd_convnext}, 0); layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); @@ -10033,9 +10042,8 @@ static struct ggml_tensor * llm_build_norm( case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; case LLM_NORM_GROUP: { - // TODO: these reshapes should be removed, fix ggml_group_norm cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, 32, 1e-6); // TODO: add groups, eps params + cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); } break; } @@ -17256,31 +17264,31 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); - cur = 
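// [editor's note] The llm_build_norm() hunk above wires the two new hparams
// (n_norm_groups, f_norm_group_eps) into the group-norm path instead of the
// hard-coded 32 / 1e-6. A minimal sketch of that path, assuming a valid
// ggml_context and a 2D activation tensor (the helper name is hypothetical;
// the body mirrors the LLM_NORM_GROUP case in this patch):

static struct ggml_tensor * build_group_norm_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,      // 2D activations
        uint32_t              n_groups, // hparams.n_norm_groups
        float                 eps) {    // hparams.f_norm_group_eps
    // ggml_group_norm expects a 3D tensor, hence the reshapes around it
    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    cur = ggml_group_norm(ctx, cur, n_groups, eps);
    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    return cur;
}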
ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + cur = ggml_add(ctx0, cur, model.conv_1d_b); inpL = cur; // resnet block 0 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), + model.posnet_0_norm1, + model.posnet_0_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_0_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), + model.posnet_0_norm2, + model.posnet_0_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_0_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17290,24 +17298,24 @@ struct llm_build_context { // resnet block 1 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), + model.posnet_1_norm1, + model.posnet_1_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_1_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), + model.posnet_1_norm2, + model.posnet_1_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_1_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17317,8 +17325,8 @@ struct llm_build_context { // attention block { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_2_attn_norm, 1, model.posnet_2_attn_norm->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_2_attn_norm_b, 1, model.posnet_2_attn_norm_b->ne[0]), + model.posnet_2_attn_norm, + model.posnet_2_attn_norm_b, LLM_NORM_GROUP, cb, 0); struct ggml_tensor * q; @@ -17329,9 +17337,9 @@ struct llm_build_context { k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); - q = ggml_add(ctx0, q, ggml_reshape_2d(ctx0, model.posnet_2_attn_q_b, 1, model.posnet_2_attn_q_b->ne[0])); - k = ggml_add(ctx0, k, ggml_reshape_2d(ctx0, model.posnet_2_attn_k_b, 1, model.posnet_2_attn_k_b->ne[0])); - v = ggml_add(ctx0, v, ggml_reshape_2d(ctx0, model.posnet_2_attn_v_b, 1, model.posnet_2_attn_v_b->ne[0])); + q = ggml_add(ctx0, q, model.posnet_2_attn_q_b); + k = ggml_add(ctx0, k, model.posnet_2_attn_k_b); + v = ggml_add(ctx0, v, 
model.posnet_2_attn_v_b); q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); @@ -17343,7 +17351,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, kq, v); cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_2_attn_o_b, 1, model.posnet_2_attn_o_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_2_attn_o_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17353,24 +17361,24 @@ struct llm_build_context { // resnet block 3 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_3_norm1, 1, model.posnet_3_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_3_norm1_b, 1, model.posnet_3_norm1_b->ne[0]), + model.posnet_3_norm1, + model.posnet_3_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv1_b, 1, model.posnet_3_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_3_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_3_norm2, 1, model.posnet_3_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_3_norm2_b, 1, model.posnet_3_norm2_b->ne[0]), + model.posnet_3_norm2, + model.posnet_3_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv2_b, 1, model.posnet_3_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_3_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17380,24 +17388,24 @@ struct llm_build_context { // resnet block 4 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_4_norm1, 1, model.posnet_4_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_4_norm1_b, 1, model.posnet_4_norm1_b->ne[0]), + model.posnet_4_norm1, + model.posnet_4_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv1_b, 1, model.posnet_4_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_4_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_4_norm2, 1, model.posnet_4_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_4_norm2_b, 1, model.posnet_4_norm2_b->ne[0]), + model.posnet_4_norm2, + model.posnet_4_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv2_b, 1, model.posnet_4_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_4_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17405,8 +17413,8 @@ struct llm_build_context { // normalize block 5 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_5_norm, 1, model.posnet_5_norm->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_5_norm_b, 1, model.posnet_5_norm_b->ne[0]), + model.posnet_5_norm, + model.posnet_5_norm_b, LLM_NORM_GROUP, cb, 0); } @@ -17425,7 +17433,7 @@ struct llm_build_context { cur = inpL; cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.layers[il].convnext_dw_b, 1, model.layers[il].convnext_dw_b->ne[0])); + cur = ggml_add(ctx0, cur, 
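// [editor's note] In the attention block above, the q/k/v/output projections
// are ggml_conv_1d_ph() calls with 1-wide kernels ({1, n_embd, n_embd}
// weights), i.e. per-position linear maps over the channel dimension, and
// the block computes single-head scaled dot-product attention over the time
// axis with scale 1/sqrt(n_embd_posnet). The recurring pattern
// ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur) in the resnet blocks is the
// SiLU/Swish activation, x * sigmoid(x).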
model.layers[il].convnext_dw_b); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); From 980d6310322859cc957d77dbfcdee92b62260302 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 19:21:50 +0200 Subject: [PATCH 35/45] llama : refactor wavtokenizer tensors ggml-ci --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 12 +- examples/tts/convert_pt_to_hf.py | 27 +- examples/tts/tts.cpp | 4 + gguf-py/gguf/constants.py | 100 ++--- gguf-py/gguf/gguf_writer.py | 14 +- gguf-py/gguf/tensor_mapping.py | 50 +-- src/llama.cpp | 694 ++++++++++++------------------- 8 files changed, 394 insertions(+), 509 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 3d03c676c2b65..93c15ecdcb4d2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -842,7 +842,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--sampling-seq"}, "SEQUENCE", + {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ad423d0b2625b..7bf67a268183c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -326,8 +326,8 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.POS_NET_NORM1, - gguf.MODEL_TENSOR.POS_NET_NORM2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -2059,12 +2059,16 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_posnet_length (self.hparams["n_embd_posnet"]) - self.gguf_writer.add_convnext_length (self.hparams["n_embd_convnext"]) self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 99fef33d27815..adba21a3661a8 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -74,12 +74,13 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): new_key = key new_key = new_key.replace('state_dict.', '') + new_key = new_key.replace('pos_net', 'posnet') - # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight" - if new_key.startswith("backbone.pos_net."): - match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key) + # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight" + if new_key.startswith("backbone.posnet."): + match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key) if match: - new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}" + new_key = 
f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}" # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight" if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": @@ -99,7 +100,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): new_key = new_key.replace("gamma", "gamma.weight") # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias - if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.pos_net") or new_key.startswith("backbone.embed.bias")): + if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")): value = value.unsqueeze(1) if new_key.endswith("dwconv.bias"): @@ -155,8 +156,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): ], "hidden_size": 1282, "n_embd_features": 512, - "n_embd_posnet": 768, - "n_embd_convnext": 768, "n_ff": 2304, "vocab_size": 4096, "n_head": 1, @@ -164,7 +163,19 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "group_norm_epsilon": 1e-6, "group_norm_groups": 32, "max_position_embeddings": 8192, # ? - "num_hidden_layers": 12 + "n_layer": 12, + "posnet": { + "n_embd": 768, + "n_layer": 6 + }, + "convnext": { + "n_embd": 768, + "n_layer": 12 + }, + #"n_embd_posnet": 768, + #"n_embd_convnext": 768, + #"n_layer_posnet": 6, + #"n_layer_convnext": 12 } with open(path_dst + '/config.json', 'w') as f: diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 57956e5a17075..aa5508be382d4 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -476,6 +476,10 @@ int main(int argc, char ** argv) { smpl[i] = common_sampler_init(model_ttc, params.sampling); } + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0])); + LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str()); + LOG_INF("%s: loading done\n", __func__); const auto t_main_start = ggml_time_us(); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 139d0d15f7e06..a40df974d1fca 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -91,8 +91,6 @@ class LLM: CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" FEATURES_LENGTH = "{arch}.features_length" - POSNET_LENGTH = "{arch}.posnet_length" - CONVNEXT_LENGTH = "{arch}.convnext_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -160,6 +158,14 @@ class SSM: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -377,21 +383,21 @@ class MODEL_TENSOR(IntEnum): CLS = auto() # classifier CLS_OUT = auto() # classifier output projection CONV1D = auto() - CONV_NEXT_DW = auto() - CONV_NEXT_NORM = auto() - CONV_NEXT_PW1 = auto() - CONV_NEXT_PW2 = auto() - CONV_NEXT_GAMMA = auto() - POS_NET_CONV1 = auto() - POS_NET_CONV2 = auto() - POS_NET_NORM = auto() - POS_NET_NORM1 = 
auto() - POS_NET_NORM2 = auto() - POS_NET_ATTN_NORM = auto() - POS_NET_ATTN_Q = auto() - POS_NET_ATTN_K = auto() - POS_NET_ATTN_V = auto() - POS_NET_ATTN_OUT = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -558,21 +564,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CLS: "cls", MODEL_TENSOR.CLS_OUT: "cls.output", MODEL_TENSOR.CONV1D: "conv1d", - MODEL_TENSOR.CONV_NEXT_DW: "conv_next.{bid}.dw", - MODEL_TENSOR.CONV_NEXT_NORM: "conv_next.{bid}.norm", - MODEL_TENSOR.CONV_NEXT_PW1: "conv_next.{bid}.pw1", - MODEL_TENSOR.CONV_NEXT_PW2: "conv_next.{bid}.pw2", - MODEL_TENSOR.CONV_NEXT_GAMMA: "conv_next.{bid}.gamma", - MODEL_TENSOR.POS_NET_CONV1: "pos_net.{bid}.conv1", - MODEL_TENSOR.POS_NET_CONV2: "pos_net.{bid}.conv2", - MODEL_TENSOR.POS_NET_NORM: "pos_net.{bid}.norm", - MODEL_TENSOR.POS_NET_NORM1: "pos_net.{bid}.norm1", - MODEL_TENSOR.POS_NET_NORM2: "pos_net.{bid}.norm2", - MODEL_TENSOR.POS_NET_ATTN_NORM: "pos_net.{bid}.attn_norm", - MODEL_TENSOR.POS_NET_ATTN_Q: "pos_net.{bid}.attn_q", - MODEL_TENSOR.POS_NET_ATTN_K: "pos_net.{bid}.attn_k", - MODEL_TENSOR.POS_NET_ATTN_V: "pos_net.{bid}.attn_v", - MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1415,23 +1421,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.CONV1D, - MODEL_TENSOR.CONV_NEXT_DW, - MODEL_TENSOR.CONV_NEXT_NORM, - MODEL_TENSOR.CONV_NEXT_PW1, - MODEL_TENSOR.CONV_NEXT_PW2, - MODEL_TENSOR.CONV_NEXT_GAMMA, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.CONVNEXT_GAMMA, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.POS_NET_CONV1, - MODEL_TENSOR.POS_NET_CONV2, - MODEL_TENSOR.POS_NET_NORM, - MODEL_TENSOR.POS_NET_NORM1, - MODEL_TENSOR.POS_NET_NORM2, - MODEL_TENSOR.POS_NET_ATTN_NORM, - MODEL_TENSOR.POS_NET_ATTN_Q, - MODEL_TENSOR.POS_NET_ATTN_K, - MODEL_TENSOR.POS_NET_ATTN_V, - MODEL_TENSOR.POS_NET_ATTN_OUT, + MODEL_TENSOR.POSNET_CONV1, + MODEL_TENSOR.POSNET_CONV2, + MODEL_TENSOR.POSNET_NORM, + MODEL_TENSOR.POSNET_NORM1, + MODEL_TENSOR.POSNET_NORM2, + MODEL_TENSOR.POSNET_ATTN_NORM, + MODEL_TENSOR.POSNET_ATTN_Q, + MODEL_TENSOR.POSNET_ATTN_K, + MODEL_TENSOR.POSNET_ATTN_V, + MODEL_TENSOR.POSNET_ATTN_OUT, ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index 5bb2a8af87e25..3023b539ae82b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -634,11 +634,17 @@ def add_embedding_length(self, length: int) -> None: def add_features_length(self, length: int) -> None: self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) - def add_posnet_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.POSNET_LENGTH.format(arch=self.arch), length) + def add_posnet_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_convnext_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.CONVNEXT_LENGTH.format(arch=self.arch), length) + def add_posnet_block_count(self, length: int) -> None: + self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length) + + def add_convnext_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_convnext_block_count(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 296f1ca054cf2..82cdb121a1f26 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -704,64 +704,64 @@ class TensorNameMap: ), ############################################################################# - MODEL_TENSOR.CONV_NEXT_DW: ( + MODEL_TENSOR.CONVNEXT_DW: ( "backbone.convnext.{bid}.dwconv", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_NORM: ( + MODEL_TENSOR.CONVNEXT_NORM: ( "backbone.convnext.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_PW1: ( + MODEL_TENSOR.CONVNEXT_PW1: ( "backbone.convnext.{bid}.pwconv1", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_PW2: ( + MODEL_TENSOR.CONVNEXT_PW2: ( "backbone.convnext.{bid}.pwconv2", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_GAMMA: ( + MODEL_TENSOR.CONVNEXT_GAMMA: ( "backbone.convnext.{bid}.gamma", # wavtokenizer ), - MODEL_TENSOR.POS_NET_CONV1: ( - "backbone.pos_net.{bid}.conv1", # wavtokenizer + MODEL_TENSOR.POSNET_CONV1: ( + "backbone.posnet.{bid}.conv1", # wavtokenizer ), - MODEL_TENSOR.POS_NET_CONV2: ( - "backbone.pos_net.{bid}.conv2", # wavtokenizer + MODEL_TENSOR.POSNET_CONV2: ( + "backbone.posnet.{bid}.conv2", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM: ( - "backbone.pos_net.{bid}.norm", # wavtokenizer + MODEL_TENSOR.POSNET_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM1: ( - "backbone.pos_net.{bid}.norm1", # wavtokenizer + MODEL_TENSOR.POSNET_NORM1: ( + "backbone.posnet.{bid}.norm1", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM2: ( - "backbone.pos_net.{bid}.norm2", # wavtokenizer + MODEL_TENSOR.POSNET_NORM2: ( + "backbone.posnet.{bid}.norm2", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_NORM: ( - "backbone.pos_net.{bid}.norm", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_Q: ( - "backbone.pos_net.{bid}.q", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_Q: ( + "backbone.posnet.{bid}.q", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_K: ( - "backbone.pos_net.{bid}.k", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_K: ( + "backbone.posnet.{bid}.k", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_V: ( - "backbone.pos_net.{bid}.v", # wavtokenizer + 
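# [editor's note] A hedged usage sketch for the new sub-section setters added
# to GGUFWriter above, using the values from this patch's config.json (the
# writer instance is assumed to target the wavtokenizer-dec architecture):
#
#   writer.add_posnet_embedding_length(768)    # "%s.posnet.embedding_length"
#   writer.add_posnet_block_count(6)           # "%s.posnet.block_count"
#   writer.add_convnext_embedding_length(768)  # "%s.convnext.embedding_length"
#   writer.add_convnext_block_count(12)        # "%s.convnext.block_count"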
MODEL_TENSOR.POSNET_ATTN_V: ( + "backbone.posnet.{bid}.v", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_OUT: ( - "backbone.pos_net.{bid}.proj_out", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_OUT: ( + "backbone.posnet.{bid}.proj_out", # wavtokenizer ), } diff --git a/src/llama.cpp b/src/llama.cpp index 4c298d4062cc3..2d16ce72ba4b4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -277,8 +277,6 @@ enum llm_kv { LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_FEATURES_LENGTH, - LLM_KV_POSNET_LENGTH, - LLM_KV_CONVNEXT_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, @@ -375,6 +373,12 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + LLM_KV_POSNET_EMBEDDING_LENGTH, + LLM_KV_POSNET_BLOCK_COUNT, + + LLM_KV_CONVNEXT_EMBEDDING_LENGTH, + LLM_KV_CONVNEXT_BLOCK_COUNT, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -399,8 +403,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, - { LLM_KV_POSNET_LENGTH, "%s.posnet_length" }, - { LLM_KV_CONVNEXT_LENGTH, "%s.convnext_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, @@ -464,6 +466,12 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, + { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" }, + + { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" }, + { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -623,11 +631,11 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_CONV1D, - LLM_TENSOR_CONV_NEXT_DW, - LLM_TENSOR_CONV_NEXT_NORM, - LLM_TENSOR_CONV_NEXT_PW1, - LLM_TENSOR_CONV_NEXT_PW2, - LLM_TENSOR_CONV_NEXT_GAMMA, + LLM_TENSOR_CONVNEXT_DW, + LLM_TENSOR_CONVNEXT_NORM, + LLM_TENSOR_CONVNEXT_PW1, + LLM_TENSOR_CONVNEXT_PW2, + LLM_TENSOR_CONVNEXT_GAMMA, LLM_TENSOR_POS_NET_CONV1, LLM_TENSOR_POS_NET_CONV2, LLM_TENSOR_POS_NET_NORM, @@ -1628,23 +1636,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_CONV1D, "conv1d" }, - { LLM_TENSOR_CONV_NEXT_DW, "conv_next.%d.dw" }, - { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.%d.norm" }, - { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.%d.pw1" }, - { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.%d.pw2" }, - { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.%d.gamma" }, + { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" }, + { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" }, + { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" }, + { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" }, + { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, - { LLM_TENSOR_POS_NET_CONV2, "pos_net.%d.conv2" }, - { LLM_TENSOR_POS_NET_NORM, "pos_net.%d.norm" }, - { LLM_TENSOR_POS_NET_NORM1, "pos_net.%d.norm1" }, - { LLM_TENSOR_POS_NET_NORM2, "pos_net.%d.norm2" }, - { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" }, - { LLM_TENSOR_POS_NET_ATTN_Q, "pos_net.%d.attn_q" }, - { LLM_TENSOR_POS_NET_ATTN_K, 
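// [editor's note] Note the asymmetry in the llama.cpp hunks below: the C++
// enum identifiers keep the old LLM_TENSOR_POS_NET_* spelling, while their
// serialized name strings move to "posnet.%d.*" in lockstep with the Python
// MODEL_TENSOR.POSNET_* rename, so both sides keep producing and consuming
// the same GGUF tensor names.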
"pos_net.%d.attn_k" }, - { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.%d.attn_v" }, - { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.%d.attn_output" }, + { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, }, }, { @@ -2537,6 +2545,16 @@ static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; +struct llama_hparams_posnet { + uint32_t n_embd; + uint32_t n_layer; +}; + +struct llama_hparams_convnext { + uint32_t n_embd; + uint32_t n_layer; +}; + struct llama_hparams { bool vocab_only; bool rope_finetuned; @@ -2546,6 +2564,7 @@ struct llama_hparams { uint32_t n_vocab = 0; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; + uint32_t n_embd_features = 0; uint32_t n_layer; uint32_t n_rot; uint32_t n_swa = 0; // sliding window attention (SWA) @@ -2557,9 +2576,8 @@ struct llama_hparams { uint32_t n_rel_attn_bkts = 0; // for WavTokenizer - uint32_t n_embd_features = 0; - uint32_t n_embd_posnet = 0; - uint32_t n_embd_convnext = 0; + struct llama_hparams_posnet posnet; + struct llama_hparams_convnext convnext; std::array n_head_arr; std::array n_head_kv_arr; @@ -2623,66 +2641,6 @@ struct llama_hparams { enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; - bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_swa != other.n_swa) return true; - if (this->n_embd_head_k != other.n_embd_head_k) return true; - if (this->n_embd_head_v != other.n_embd_head_v) return true; - if (this->n_expert != other.n_expert) return true; - if (this->n_expert_used != other.n_expert_used) return true; - - if (this->n_head_arr != other.n_head_arr) return true; - if (this->n_head_kv_arr != other.n_head_kv_arr) return true; - if (this->n_ff_arr != other.n_ff_arr) return true; - - if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true; - if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; - if (this->n_lora_q != other.n_lora_q) return true; - if (this->n_lora_kv != other.n_lora_kv) return true; - if (this->n_ff_exp != other.n_ff_exp) return true; - if (this->n_ff_shexp != other.n_ff_shexp) return true; - if (this->n_expert_shared != other.n_expert_shared) return true; - - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true; - if (std::equal(std::begin(this->rope_sections), - std::end(this->rope_sections), - std::begin(other.rope_sections))) return true; - - if (this->ssm_d_conv != other.ssm_d_conv) return true; - if (this->ssm_d_inner != other.ssm_d_inner) return true; - if (this->ssm_d_state != other.ssm_d_state) return true; - if (this->ssm_dt_rank != other.ssm_dt_rank) return true; - 
if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; - - if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true; - if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true; - if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true; - if (this->wkv_head_size != other.wkv_head_size) return true; - - if (this->dec_start_token_id != other.dec_start_token_id) return true; - - const float EPSILON = 1e-9f; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; - if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; - if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; - if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; - if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; - - return false; - } - uint32_t n_head(uint32_t il = 0) const { if (il < n_layer) { return n_head_arr[il]; @@ -2735,21 +2693,21 @@ struct llama_hparams { if (wkv_head_size != 0) { // for RWKV models return 2 * n_embd; - } else { - // TODO: maybe support other convolution strides than 1 - // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; } + + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? 
ssm_d_conv - 1 : 0) * ssm_d_inner; } uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings if (wkv_head_size != 0) { // corresponds to RWKV's wkv_states size return n_embd * wkv_head_size; - } else { - // corresponds to Mamba's ssm_states size - return ssm_d_state * ssm_d_inner; } + + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; } }; @@ -2787,6 +2745,57 @@ struct llama_cparams { void * cb_eval_user_data; }; +struct llama_layer_posnet { + // resnet + struct ggml_tensor * norm1 = nullptr; + struct ggml_tensor * norm1_b = nullptr; + + struct ggml_tensor * conv1 = nullptr; + struct ggml_tensor * conv1_b = nullptr; + + struct ggml_tensor * norm2 = nullptr; + struct ggml_tensor * norm2_b = nullptr; + + struct ggml_tensor * conv2 = nullptr; + struct ggml_tensor * conv2_b = nullptr; + + // attention + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + + struct ggml_tensor * attn_q = nullptr; + struct ggml_tensor * attn_q_b = nullptr; + + struct ggml_tensor * attn_k = nullptr; + struct ggml_tensor * attn_k_b = nullptr; + + struct ggml_tensor * attn_v = nullptr; + struct ggml_tensor * attn_v_b = nullptr; + + struct ggml_tensor * attn_o = nullptr; + struct ggml_tensor * attn_o_b = nullptr; + + // normalize + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; +}; + +struct llama_layer_convnext { + struct ggml_tensor * dw; + struct ggml_tensor * dw_b; + + struct ggml_tensor * norm; + struct ggml_tensor * norm_b; + + struct ggml_tensor * pw1; + struct ggml_tensor * pw1_b; + + struct ggml_tensor * pw2; + struct ggml_tensor * pw2_b; + + struct ggml_tensor * gamma; +}; + // TODO: separate into "llama_layer_enc" and "llama_layer_dec" struct llama_layer { llama_layer() { @@ -2938,20 +2947,9 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale; struct ggml_tensor * ffn_down_scale; - // convnext - struct ggml_tensor * convnext_dw; - struct ggml_tensor * convnext_dw_b; - - struct ggml_tensor * convnext_norm; - struct ggml_tensor * convnext_norm_b; - - struct ggml_tensor * convnext_pw1; - struct ggml_tensor * convnext_pw1_b; + struct llama_layer_posnet posnet; - struct ggml_tensor * convnext_pw2; - struct ggml_tensor * convnext_pw2_b; - - struct ggml_tensor * convnext_gamma; + struct llama_layer_convnext convnext; }; // very similar to llama_batch, @@ -3082,85 +3080,9 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - // wavtokenizer decoder - // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; - struct ggml_tensor * hann_window = nullptr; - - // resnet 0 - struct ggml_tensor * posnet_0_norm1 = nullptr; - struct ggml_tensor * posnet_0_norm1_b = nullptr; - - struct ggml_tensor * posnet_0_conv1 = nullptr; - struct ggml_tensor * posnet_0_conv1_b = nullptr; - - struct ggml_tensor * posnet_0_norm2 = nullptr; - struct ggml_tensor * posnet_0_norm2_b = nullptr; - - struct ggml_tensor * posnet_0_conv2 = nullptr; - struct ggml_tensor * posnet_0_conv2_b = nullptr; - - // resnet 1 - struct ggml_tensor * posnet_1_norm1 = nullptr; - struct ggml_tensor * posnet_1_norm1_b = nullptr; - - struct ggml_tensor * posnet_1_conv1 = nullptr; - struct ggml_tensor * posnet_1_conv1_b = nullptr; - - struct ggml_tensor * posnet_1_norm2 = nullptr; - struct ggml_tensor * posnet_1_norm2_b = nullptr; - - struct ggml_tensor * posnet_1_conv2 = nullptr; - struct ggml_tensor * posnet_1_conv2_b = nullptr; - - // attn 2 - 
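// [editor's note] The llama_layer_posnet and llama_layer_convnext structs
// introduced above replace the long list of flat per-index members
// (posnet_0_norm1, posnet_1_conv2_b, ...) that llama_model carried for the
// wavtokenizer decoder. Tensors now live in model.layers[i].posnet and
// model.layers[i].convnext, which is what lets the tensor loading and the
// graph-build code later in this patch collapse into loops over the layer
// index.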
struct ggml_tensor * posnet_2_attn_norm = nullptr; - struct ggml_tensor * posnet_2_attn_norm_b = nullptr; - - struct ggml_tensor * posnet_2_attn_q = nullptr; - struct ggml_tensor * posnet_2_attn_q_b = nullptr; - - struct ggml_tensor * posnet_2_attn_k = nullptr; - struct ggml_tensor * posnet_2_attn_k_b = nullptr; - - struct ggml_tensor * posnet_2_attn_v = nullptr; - struct ggml_tensor * posnet_2_attn_v_b = nullptr; - - struct ggml_tensor * posnet_2_attn_o = nullptr; - struct ggml_tensor * posnet_2_attn_o_b = nullptr; - - // resnet 3 - struct ggml_tensor * posnet_3_norm1 = nullptr; - struct ggml_tensor * posnet_3_norm1_b = nullptr; - - struct ggml_tensor * posnet_3_conv1 = nullptr; - struct ggml_tensor * posnet_3_conv1_b = nullptr; - - struct ggml_tensor * posnet_3_norm2 = nullptr; - struct ggml_tensor * posnet_3_norm2_b = nullptr; - - struct ggml_tensor * posnet_3_conv2 = nullptr; - struct ggml_tensor * posnet_3_conv2_b = nullptr; - - // resnet 4 - struct ggml_tensor * posnet_4_norm1 = nullptr; - struct ggml_tensor * posnet_4_norm1_b = nullptr; - - struct ggml_tensor * posnet_4_conv1 = nullptr; - struct ggml_tensor * posnet_4_conv1_b = nullptr; - - struct ggml_tensor * posnet_4_norm2 = nullptr; - struct ggml_tensor * posnet_4_norm2_b = nullptr; - - struct ggml_tensor * posnet_4_conv2 = nullptr; - struct ggml_tensor * posnet_4_conv2_b = nullptr; - - // resnet 5 - struct ggml_tensor * posnet_5_norm = nullptr; - struct ggml_tensor * posnet_5_norm_b = nullptr; - std::vector layers; // gguf metadata @@ -5705,8 +5627,12 @@ static void llm_load_hparams( if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); - ml.get_key(LLM_KV_POSNET_LENGTH, hparams.n_embd_posnet); - ml.get_key(LLM_KV_CONVNEXT_LENGTH, hparams.n_embd_convnext); + + ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); + ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); + + ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); + ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); } GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); @@ -7493,11 +7419,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, - {LLM_TENSOR_CONV_NEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -7738,7 +7664,8 @@ static bool llm_load_tensors( model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer; + bool use_mmap_buffer = true; // build a list of buffer types for the CPU and GPU 
devices @@ -9574,107 +9501,105 @@ static bool llm_load_tensors( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - const int64_t n_embd_features = hparams.n_embd_features; - const int64_t n_embd_posnet = hparams.n_embd_posnet; - const int64_t n_embd_convnext = hparams.n_embd_convnext; - - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd_features, n_vocab}, 0); - - model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd_posnet}, 0); - model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); - - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, n_embd_posnet}, 0); - - model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {1, n_embd_posnet}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {1, n_embd_posnet}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), 
{hparams.n_embd_features, n_vocab}, 0); - model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {1, n_embd_posnet}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {1, n_embd_posnet}, 0); + // posnet + { + const int64_t n_embd = hparams.posnet.n_embd; + + for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { + auto & layer = model.layers[i].posnet; + + // posnet: + // + // - resnet + // - resnet + // - attn + // - resnet + // - resnet + // - norm + // + switch (i) { + case 0: + case 1: + case 3: + case 4: + { + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + } break; + case 2: + { + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, 
n_embd, n_embd}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + } break; + case 5: + { + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + } - model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {1, n_embd_posnet}, 0); + GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {1, n_embd_posnet}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0); - model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {1, n_embd_posnet}, 0); + // convnext + { + const int64_t n_embd = hparams.convnext.n_embd; - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {1, n_embd_posnet}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {1, n_embd_posnet}, 0); + for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { + auto & layer = model.layers[i].convnext; - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); - layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {1, n_embd_convnext}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); - layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); - layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); - layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {n_embd_convnext, n_ff}, 0); - layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {n_ff}, 0); + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); - layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {n_ff, n_embd_convnext}, 0); - layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {n_embd_convnext}, 0); + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + } - layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {n_embd_convnext}, 0); + // output + model.output_norm = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); } - // output - model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_convnext}, 0); - model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_convnext}, 0); - - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd_convnext, n_embd}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); } break; default: @@ -17266,156 +17191,82 @@ struct llm_build_context { cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, model.conv_1d_b); - inpL = cur; - - // resnet block 0 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_0_norm1, - model.posnet_0_norm1_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_0_conv1_b); - - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_0_norm2, - model.posnet_0_norm2_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_0_conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } - - inpL = cur; - - // resnet block 1 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_1_norm1, - model.posnet_1_norm1_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_1_conv1_b); - - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_1_norm2, - model.posnet_1_norm2_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_1_conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; - inpL = cur; - - // attention block - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_2_attn_norm, - model.posnet_2_attn_norm_b, - LLM_NORM_GROUP, cb, 0); - - struct ggml_tensor * q; - struct ggml_tensor * k; - struct ggml_tensor * v; - - q = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_q, cur, 1, 1); - k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); - v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); - - q = ggml_add(ctx0, q, model.posnet_2_attn_q_b); - k = ggml_add(ctx0, k, model.posnet_2_attn_k_b); - v = ggml_add(ctx0, v, model.posnet_2_attn_v_b); - - q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); - k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - - kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(model.hparams.n_embd_posnet)), 0.0f); - - cur = ggml_mul_mat(ctx0, kq, v); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_2_attn_o_b); - - cur = ggml_add(ctx0, cur, inpL); - } - - inpL = cur; + inpL = cur; - // resnet block 3 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_3_norm1, - model.posnet_3_norm1_b, - LLM_NORM_GROUP, cb, 0); + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + 
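// [editor's note] The switch below mirrors the fixed posnet schedule that the
// tensor-creation code above documents (layers 0, 1, 3, 4: resnet; layer 2:
// attention; layer 5: final group norm). Each resnet case is: group norm ->
// SiLU -> conv1 -> group norm -> SiLU -> conv2 -> residual add with the
// block input held in inpL.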
cur = llm_build_norm(ctx0, cur, hparams, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, cb, 0); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_3_conv1_b); + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_3_norm2, - model.posnet_3_norm2_b, - LLM_NORM_GROUP, cb, 0); + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, cb, 0); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_3_conv2_b); + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); - cur = ggml_add(ctx0, cur, inpL); - } + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, cb, 0); - inpL = cur; + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; - // resnet block 4 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_4_norm1, - model.posnet_4_norm1_b, - LLM_NORM_GROUP, cb, 0); + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); - cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_4_conv1_b); + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_4_norm2, - model.posnet_4_norm2_b, - LLM_NORM_GROUP, cb, 0); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); - cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_4_conv2_b); + cur = ggml_mul_mat(ctx0, kq, v); - cur = ggml_add(ctx0, cur, inpL); - } + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); - // normalize block 5 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_5_norm, - model.posnet_5_norm_b, - LLM_NORM_GROUP, cb, 0); + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, cb, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; } cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -17429,27 +17280,30 @@ struct llm_build_context { inpL = cur; - for (int il = 0; il < n_layer; ++il) { + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + cur = inpL; - cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.layers[il].convnext_dw_b); + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cur = llm_build_norm(ctx0, cur, 
hparams, - model.layers[il].convnext_norm, - model.layers[il].convnext_norm_b, + layer.norm, + layer.norm_b, LLM_NORM, cb, -1); cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].convnext_pw1, model.layers[il].convnext_pw1_b, NULL, - NULL, NULL, NULL, - model.layers[il].convnext_pw2, model.layers[il].convnext_pw2_b, NULL, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cur = ggml_mul(ctx0, cur, model.layers[il].convnext_gamma); + cur = ggml_mul(ctx0, cur, layer.gamma); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); From 35259e53353fc2b59384ae51342c96e07cb81192 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 19:33:35 +0200 Subject: [PATCH 36/45] cont ggml-ci --- examples/tts/convert_pt_to_hf.py | 4 - src/llama.cpp | 267 +++++++++++++++---------------- 2 files changed, 131 insertions(+), 140 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index adba21a3661a8..8909a65fd1e13 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -172,10 +172,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "n_embd": 768, "n_layer": 12 }, - #"n_embd_posnet": 768, - #"n_embd_convnext": 768, - #"n_layer_posnet": 6, - #"n_layer_convnext": 12 } with open(path_dst + '/config.json', 'w') as f: diff --git a/src/llama.cpp b/src/llama.cpp index 2d16ce72ba4b4..019def4a8ac3e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2747,191 +2747,185 @@ struct llama_cparams { struct llama_layer_posnet { // resnet - struct ggml_tensor * norm1 = nullptr; + struct ggml_tensor * norm1 = nullptr; struct ggml_tensor * norm1_b = nullptr; - struct ggml_tensor * conv1 = nullptr; + struct ggml_tensor * conv1 = nullptr; struct ggml_tensor * conv1_b = nullptr; - struct ggml_tensor * norm2 = nullptr; + struct ggml_tensor * norm2 = nullptr; struct ggml_tensor * norm2_b = nullptr; - struct ggml_tensor * conv2 = nullptr; + struct ggml_tensor * conv2 = nullptr; struct ggml_tensor * conv2_b = nullptr; // attention - struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm = nullptr; struct ggml_tensor * attn_norm_b = nullptr; - struct ggml_tensor * attn_q = nullptr; + struct ggml_tensor * attn_q = nullptr; struct ggml_tensor * attn_q_b = nullptr; - struct ggml_tensor * attn_k = nullptr; + struct ggml_tensor * attn_k = nullptr; struct ggml_tensor * attn_k_b = nullptr; - struct ggml_tensor * attn_v = nullptr; + struct ggml_tensor * attn_v = nullptr; struct ggml_tensor * attn_v_b = nullptr; - struct ggml_tensor * attn_o = nullptr; + struct ggml_tensor * attn_o = nullptr; struct ggml_tensor * attn_o_b = nullptr; // normalize - struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm = nullptr; struct ggml_tensor * norm_b = nullptr; }; struct llama_layer_convnext { - struct ggml_tensor * dw; - struct ggml_tensor * dw_b; + struct ggml_tensor * dw = nullptr; + struct ggml_tensor * dw_b = nullptr; - struct ggml_tensor * norm; - struct ggml_tensor * norm_b; + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; - struct ggml_tensor * pw1; - struct ggml_tensor * pw1_b; + struct ggml_tensor * pw1 = nullptr; + struct ggml_tensor * pw1_b = nullptr; - struct ggml_tensor * pw2; - struct ggml_tensor * pw2_b; + struct ggml_tensor * pw2 = nullptr; + struct ggml_tensor * pw2_b = nullptr; - struct ggml_tensor * gamma; + struct ggml_tensor * gamma = nullptr; }; -// TODO: separate into "llama_layer_enc" and 
"llama_layer_dec" struct llama_layer { - llama_layer() { - // initialize all pointers to NULL - std::memset(this, 0, sizeof(*this)); - } - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - struct ggml_tensor * attn_out_norm; - struct ggml_tensor * attn_out_norm_b; - struct ggml_tensor * attn_q_a_norm; - struct ggml_tensor * attn_kv_a_norm; - struct ggml_tensor * attn_sub_norm; - struct ggml_tensor * attn_post_norm; - struct ggml_tensor * ffn_sub_norm; - struct ggml_tensor * attn_norm_cross; - struct ggml_tensor * attn_norm_enc; + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + struct ggml_tensor * attn_norm_2 = nullptr; + struct ggml_tensor * attn_norm_2_b = nullptr; + struct ggml_tensor * attn_q_norm = nullptr; + struct ggml_tensor * attn_q_norm_b = nullptr; + struct ggml_tensor * attn_k_norm = nullptr; + struct ggml_tensor * attn_k_norm_b = nullptr; + struct ggml_tensor * attn_out_norm = nullptr; + struct ggml_tensor * attn_out_norm_b = nullptr; + struct ggml_tensor * attn_q_a_norm = nullptr; + struct ggml_tensor * attn_kv_a_norm = nullptr; + struct ggml_tensor * attn_sub_norm = nullptr; + struct ggml_tensor * attn_post_norm = nullptr; + struct ggml_tensor * ffn_sub_norm = nullptr; + struct ggml_tensor * attn_norm_cross = nullptr; + struct ggml_tensor * attn_norm_enc = nullptr; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wkv_a_mqa; - struct ggml_tensor * wkv_b; - struct ggml_tensor * wq_cross; - struct ggml_tensor * wk_cross; - struct ggml_tensor * wv_cross; - struct ggml_tensor * wo_cross; - struct ggml_tensor * wq_enc; - struct ggml_tensor * wk_enc; - struct ggml_tensor * wv_enc; - struct ggml_tensor * wo_enc; + struct ggml_tensor * wq = nullptr; + struct ggml_tensor * wk = nullptr; + struct ggml_tensor * wv = nullptr; + struct ggml_tensor * wo = nullptr; + struct ggml_tensor * wqkv = nullptr; + struct ggml_tensor * wq_a = nullptr; + struct ggml_tensor * wq_b = nullptr; + struct ggml_tensor * wkv_a_mqa = nullptr; + struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wq_cross = nullptr; + struct ggml_tensor * wk_cross = nullptr; + struct ggml_tensor * wv_cross = nullptr; + struct ggml_tensor * wo_cross = nullptr; + struct ggml_tensor * wq_enc = nullptr; + struct ggml_tensor * wk_enc = nullptr; + struct ggml_tensor * wv_enc = nullptr; + struct ggml_tensor * wo_enc = nullptr; // attention bias - struct ggml_tensor * bq; - struct ggml_tensor * bk; - struct ggml_tensor * bv; - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; + struct ggml_tensor * bq = nullptr; + struct ggml_tensor * bk = nullptr; + struct ggml_tensor * bv = nullptr; + struct ggml_tensor * bo = nullptr; + struct ggml_tensor * bqkv = nullptr; // relative position bias - struct ggml_tensor * attn_rel_b; - struct ggml_tensor * attn_rel_b_enc; - struct ggml_tensor * attn_rel_b_cross; + struct ggml_tensor * attn_rel_b = nullptr; + struct ggml_tensor * attn_rel_b_enc = nullptr; + struct ggml_tensor * attn_rel_b_cross = nullptr; // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - struct ggml_tensor * 
ffn_post_norm; - struct ggml_tensor * layer_out_norm; - struct ggml_tensor * layer_out_norm_b; - struct ggml_tensor * ffn_norm_exps; - struct ggml_tensor * ffn_norm_enc; + struct ggml_tensor * ffn_norm = nullptr; + struct ggml_tensor * ffn_norm_b = nullptr; + struct ggml_tensor * ffn_post_norm = nullptr; + struct ggml_tensor * layer_out_norm = nullptr; + struct ggml_tensor * layer_out_norm_b = nullptr; + struct ggml_tensor * ffn_norm_exps = nullptr; + struct ggml_tensor * ffn_norm_enc = nullptr; // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - struct ggml_tensor * ffn_gate_enc; - struct ggml_tensor * ffn_down_enc; - struct ggml_tensor * ffn_up_enc; + struct ggml_tensor * ffn_gate = nullptr; // w1 + struct ggml_tensor * ffn_down = nullptr; // w2 + struct ggml_tensor * ffn_up = nullptr; // w3 + struct ggml_tensor * ffn_gate_enc = nullptr; + struct ggml_tensor * ffn_down_enc = nullptr; + struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp; - struct ggml_tensor * ffn_gate_exps; - struct ggml_tensor * ffn_down_exps; - struct ggml_tensor * ffn_up_exps ; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; // ff shared expert (shexp) - struct ggml_tensor * ffn_gate_inp_shexp; - struct ggml_tensor * ffn_gate_shexp; - struct ggml_tensor * ffn_down_shexp; - struct ggml_tensor * ffn_up_shexp; + struct ggml_tensor * ffn_gate_inp_shexp = nullptr; + struct ggml_tensor * ffn_gate_shexp = nullptr; + struct ggml_tensor * ffn_down_shexp = nullptr; + struct ggml_tensor * ffn_up_shexp = nullptr; // ff bias - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 - struct ggml_tensor * ffn_act; + struct ggml_tensor * ffn_gate_b = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; // b2 + struct ggml_tensor * ffn_up_b = nullptr; // b3 + struct ggml_tensor * ffn_act = nullptr; // mamba proj - struct ggml_tensor * ssm_in; - struct ggml_tensor * ssm_x; - struct ggml_tensor * ssm_dt; - struct ggml_tensor * ssm_out; + struct ggml_tensor * ssm_in = nullptr; + struct ggml_tensor * ssm_x = nullptr; + struct ggml_tensor * ssm_dt = nullptr; + struct ggml_tensor * ssm_out = nullptr; // mamba - struct ggml_tensor * ssm_conv1d; - struct ggml_tensor * ssm_a; - struct ggml_tensor * ssm_d; + struct ggml_tensor * ssm_conv1d = nullptr; + struct ggml_tensor * ssm_a = nullptr; + struct ggml_tensor * ssm_d = nullptr; // mamba bias - struct ggml_tensor * ssm_conv1d_b; - struct ggml_tensor * ssm_dt_b; + struct ggml_tensor * ssm_conv1d_b = nullptr; + struct ggml_tensor * ssm_dt_b = nullptr; // rwkv - struct ggml_tensor * time_mix_w1; - struct ggml_tensor * time_mix_w2; - struct ggml_tensor * time_mix_lerp_x; - struct ggml_tensor * time_mix_lerp_w; - struct ggml_tensor * time_mix_lerp_k; - struct ggml_tensor * time_mix_lerp_v; - struct ggml_tensor * time_mix_lerp_r; - struct ggml_tensor * time_mix_lerp_g; - - struct ggml_tensor * time_mix_first; - struct ggml_tensor * time_mix_decay; - struct ggml_tensor * time_mix_decay_w1; - struct ggml_tensor * time_mix_decay_w2; - struct ggml_tensor * time_mix_key; - struct ggml_tensor * time_mix_value; - struct ggml_tensor * time_mix_receptance; - struct ggml_tensor * time_mix_gate; - - struct ggml_tensor * time_mix_ln; - struct ggml_tensor * time_mix_ln_b; - struct ggml_tensor * 
time_mix_output; - - struct ggml_tensor * channel_mix_lerp_k; - struct ggml_tensor * channel_mix_lerp_r; - - struct ggml_tensor * channel_mix_key; - struct ggml_tensor * channel_mix_receptance; - struct ggml_tensor * channel_mix_value; + struct ggml_tensor * time_mix_w1 = nullptr; + struct ggml_tensor * time_mix_w2 = nullptr; + struct ggml_tensor * time_mix_lerp_x = nullptr; + struct ggml_tensor * time_mix_lerp_w = nullptr; + struct ggml_tensor * time_mix_lerp_k = nullptr; + struct ggml_tensor * time_mix_lerp_v = nullptr; + struct ggml_tensor * time_mix_lerp_r = nullptr; + struct ggml_tensor * time_mix_lerp_g = nullptr; + + struct ggml_tensor * time_mix_first = nullptr; + struct ggml_tensor * time_mix_decay = nullptr; + struct ggml_tensor * time_mix_decay_w1 = nullptr; + struct ggml_tensor * time_mix_decay_w2 = nullptr; + struct ggml_tensor * time_mix_key = nullptr; + struct ggml_tensor * time_mix_value = nullptr; + struct ggml_tensor * time_mix_receptance = nullptr; + struct ggml_tensor * time_mix_gate = nullptr; + + struct ggml_tensor * time_mix_ln = nullptr; + struct ggml_tensor * time_mix_ln_b = nullptr; + struct ggml_tensor * time_mix_output = nullptr; + + struct ggml_tensor * channel_mix_lerp_k = nullptr; + struct ggml_tensor * channel_mix_lerp_r = nullptr; + + struct ggml_tensor * channel_mix_key = nullptr; + struct ggml_tensor * channel_mix_receptance = nullptr; + struct ggml_tensor * channel_mix_value = nullptr; // long rope factors struct ggml_tensor * rope_long = nullptr; @@ -2939,13 +2933,13 @@ struct llama_layer { struct ggml_tensor * rope_freqs = nullptr; // bitnet scale - struct ggml_tensor * wq_scale; - struct ggml_tensor * wk_scale; - struct ggml_tensor * wv_scale; - struct ggml_tensor * wo_scale; - struct ggml_tensor * ffn_gate_scale; - struct ggml_tensor * ffn_up_scale; - struct ggml_tensor * ffn_down_scale; + struct ggml_tensor * wq_scale = nullptr; + struct ggml_tensor * wk_scale = nullptr; + struct ggml_tensor * wv_scale = nullptr; + struct ggml_tensor * wo_scale = nullptr; + struct ggml_tensor * ffn_gate_scale = nullptr; + struct ggml_tensor * ffn_up_scale = nullptr; + struct ggml_tensor * ffn_down_scale = nullptr; struct llama_layer_posnet posnet; @@ -3167,6 +3161,7 @@ struct llama_sbatch { // batch indices of the output std::vector out_ids; std::vector seq; + const llama_batch * batch = nullptr; // buffers for the ubatch From 2033fb7eefa6919404642900d8e4f10a7b235aef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 20:39:46 +0200 Subject: [PATCH 37/45] cont [no ci] --- src/llama.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 019def4a8ac3e..8f16d2d4fc853 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -77,7 +77,6 @@ #endif // bump if necessary -#define LLAMA_MAX_EMBD 8 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -3074,8 +3073,8 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - struct ggml_tensor * conv_1d = nullptr; - struct ggml_tensor * conv_1d_b = nullptr; + struct ggml_tensor * conv1d = nullptr; + struct ggml_tensor * conv1d_b = nullptr; std::vector layers; @@ -9498,8 +9497,8 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); - model.conv_1d_b = 
create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); + model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); // posnet { @@ -17183,8 +17182,8 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.conv_1d_b); + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); // posnet for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { From 824fa750d4b441c4266a0581efbc32194f1bfe42 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 10:25:17 +0200 Subject: [PATCH 38/45] llama : update WavTokenizer to non-causal attn --- convert_hf_to_gguf.py | 2 ++ src/llama.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7bf67a268183c..4a0b00f69c699 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2069,6 +2069,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + self.gguf_writer.add_causal_attention(False) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/src/llama.cpp b/src/llama.cpp index 8f16d2d4fc853..94160d534435f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6393,6 +6393,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); } break; default: (void)0; } From d291c742534fdb6f4626283b9b4c2ad105d5a803 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 21:45:25 +0200 Subject: [PATCH 39/45] llama : handle no-vocab detokenization --- src/llama-vocab.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index e38e598532345..7f2725f94be13 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1867,6 +1867,10 @@ int32_t llama_detokenize_impl( int32_t text_len_max, bool remove_special, bool unparse_special) { + if (vocab.type == LLAMA_VOCAB_TYPE_NONE) { + return 0; + } + GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. 
Call llama_vocab::init_tokenizer() first."); int32_t avail = text_len_max; From 5038abe1ee2be71473975d70cd317b33707b2a59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 10:27:52 +0200 Subject: [PATCH 40/45] tts : add Python example for OuteTTS (wip) --- examples/tts/tts-outetts.py | 168 ++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 examples/tts/tts-outetts.py diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py new file mode 100644 index 0000000000000..86a7806573a96 --- /dev/null +++ b/examples/tts/tts-outetts.py @@ -0,0 +1,168 @@ +import sys +import json +import struct +import requests +import re + +def process_text(text: str): + text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed + text = re.sub(r'[-_/,\.\\]', ' ', text) + text = re.sub(r'[^a-z\s]', '', text) + text = re.sub(r'\s+', ' ', text).strip() + return text.split() + +# usage: +# python tts-outetts.py http://server-llm:port http://server-dec:port "text" + +if len(sys.argv) <= 3: + print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"") + exit(1) + +host_llm = sys.argv[1] +host_dec = sys.argv[2] +text = sys.argv[3] + +prefix = """<|im_start|> +<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>""" + +words = process_text(text) +words = "<|text_sep|>".join([i.strip() for i in words]) +words += "<|text_end|>\n" + +# voice data +# TODO: load from json +#suffix = """<|audio_start|> +#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> 
+#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> 
+#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>""" + +# TODO: tokenization is slow for some reason - here is pre-tokenized input +suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455, + 155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413, + 152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400, + 152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163, + 153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321, + 153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751, + 152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791, + 151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325, + 153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198, + 19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810, + 152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946, + 151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317, + 152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325, + 151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680, + 152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144, + 153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720, + 153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958, + 152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 
152162, 152186, 153071, + 152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380, + 153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808, + 151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034, + 152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718, + 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198, + 19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733, + 151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333, + 151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801, + 152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992, + 153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428, + 153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122, + 152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594, + 153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673, + 151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779, + 151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241, + 152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434, + 153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223, + 152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748, + 152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988, + 152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390, + 152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558, + 152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341, + 153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295, + 152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385, + 152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593, + 152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938, + 152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941, + 152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583, + 152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442, + 152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 
151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198, + 275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799, + 151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257, + 152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669, + 152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467, + 152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759, + 153318, 153165, 153349, 151670, ] + +response = requests.post( + host_llm + "/completion", + json={ + "prompt": [prefix + words, *suffix], + "n_predict": 1024, + "cache_prompt": True, + "samplers": ["top_k"], + "top_k": 16, + "seed": 1003, + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) +#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n")) +#print(json.dumps(response_json["timings"], indent=4)) +print(json.dumps(response_json["tokens"], indent=4)) + +codes = response_json["tokens"] + +codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772] + +response = requests.post( + host_dec + "/embeddings", + json={ + "input": [*codes], + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) + +embd = response_json["data"][0]["embedding"] + +print(len(embd)) + + + From edb7896b495fbeddebe5d33601b6a844c4833acb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 14:21:06 +0200 Subject: [PATCH 41/45] tts : extend python example to generate spectrogram ggml-ci --- examples/tts/tts-outetts.py | 12 ++- examples/tts/tts.cpp | 188 +++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 82 deletions(-) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index 86a7806573a96..3c085dd5437de 100644 --- a/examples/tts/tts-outetts.py +++ b/examples/tts/tts-outetts.py @@ -143,7 +143,7 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) #print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n")) #print(json.dumps(response_json["timings"], indent=4)) -print(json.dumps(response_json["tokens"], indent=4)) +#print(json.dumps(response_json["tokens"], indent=4)) codes = response_json["tokens"] @@ -160,9 +160,15 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) +# spectrogram embd = response_json["data"][0]["embedding"] -print(len(embd)) - +n_codes = len(embd) +n_embd = len(embd[0]) +print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) +# post-process the spectrogram to convert to audio +# TODO: see the tts.cpp:embd_to_audio() and implement it in Python +print('converting to audio ...') +print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index aa5508be382d4..9c3d58f891703 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -63,7 +63,47 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -static void fill_hann_window(int length, bool periodic, double * output) { +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + 
uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); + return; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + file.close(); +} + +static void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { offset = 0; @@ -74,31 +114,31 @@ static void fill_hann_window(int length, bool periodic, double * output) { } // very poor-man fft -static void twiddle(double * real, double * imag, int k, int N) { - double angle = 2 * M_PI * k / N; +static void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; *real = cos(angle); *imag = sin(angle); } -static void irfft(int n, const double * inp_cplx, double * out_real) { +static void irfft(int n, const float * inp_cplx, float * out_real) { int N = n / 2 + 1; - std::vector real_input(N); - std::vector imag_input(N); + std::vector real_input(N); + std::vector imag_input(N); for (int i = 0; i < N; ++i) { real_input[i] = inp_cplx[2 * i]; imag_input[i] = inp_cplx[2 * i + 1]; } - std::vector real_output(n); - std::vector imag_output(n); + std::vector real_output(n); + std::vector imag_output(n); for (int k = 0; k < n; ++k) { real_output[k] = 0.0f; imag_output[k] = 0.0f; for (int m = 0; m < N; ++m) { - double twiddle_real; - double twiddle_imag; + float twiddle_real; + float twiddle_imag; twiddle(&twiddle_real, &twiddle_imag, k * m, n); @@ -123,7 +163,7 @@ static void irfft(int n, const double * inp_cplx, double * out_real) { // hop_length = 320 // pad = 480 // -static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { int64_t output_height = n_out; int64_t kernel_w = n_win; int64_t stride_w = n_hop; @@ -147,103 +187,63 @@ static void fold(const std::vector & data, int64_t n_out, int64_t n_win, output.resize(n_out - 2 * n_pad); } -struct wav_header { - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t chunk_size; - char wave[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_chunk_size = 16; - uint16_t audio_format = 1; // PCM - uint16_t num_channels = 1; // Mono - uint32_t sample_rate; - uint32_t byte_rate; - uint16_t block_align; - uint16_t bits_per_sample = 16; - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size; -}; - -static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { - std::ofstream file(fname, std::ios::binary); - if (!file) { 
- LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); - return; - } - - wav_header header; - header.sample_rate = sample_rate; - header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); - header.block_align = header.num_channels * (header.bits_per_sample / 8); - header.data_size = data.size() * (header.bits_per_sample / 8); - header.chunk_size = 36 + header.data_size; - - file.write(reinterpret_cast(&header), sizeof(header)); - - for (const auto & sample : data) { - int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); - file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); - } - - file.close(); -} - -static std::vector embd_to_audio( +// TODO: not optimized at all +static std::vector embd_to_audio( const float * embd, - const std::vector & codes, + const int n_codes, const int n_embd, const int n_thread) { - const int n = codes.size(); const int n_fft = 1280; const int n_hop = 320; const int n_win = 1280; const int n_pad = (n_win - n_hop)/2; - const int n_out = (n - 1)*n_hop + n_win; + const int n_out = (n_codes - 1)*n_hop + n_win; - std::vector hann(n_fft); + std::vector hann(n_fft); fill_hann_window(hann.size(), true, hann.data()); - int n_spec = n_embd*n; + int n_spec = n_embd*n_codes; - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); - for (int l = 0; l < n; ++l) { + for (int l = 0; l < n_codes; ++l) { for (int k = 0; k < n_embd; ++k) { - E[k*n + l] = embd[l*n_embd + k]; + E[k*n_codes + l] = embd[l*n_embd + k]; } } for (int k = 0; k < n_embd/2; ++k) { - for (int l = 0; l < n; ++l) { - double mag = E[(k )*n + l]; - double phi = E[(k + n_embd/2)*n + l]; + for (int l = 0; l < n_codes; ++l) { + float mag = E[(k )*n_codes + l]; + float phi = E[(k + n_embd/2)*n_codes + l]; mag = exp(mag); if (mag > 1e2) { mag = 1e2; } - S[2*(k*n + l) + 0] = mag*cosf(phi); - S[2*(k*n + l) + 1] = mag*sinf(phi); + S[2*(k*n_codes + l) + 0] = mag*cosf(phi); + S[2*(k*n_codes + l) + 1] = mag*sinf(phi); } } - for (int l = 0; l < n; ++l) { + for (int l = 0; l < n_codes; ++l) { for (int k = 0; k < n_embd/2; ++k) { - ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; - ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1]; } } - std::vector res (n*n_fft); - std::vector hann2(n*n_fft); + std::vector res (n_codes*n_fft); + std::vector hann2(n_codes*n_fft); std::vector workers(n_thread); for (int i = 0; i < n_thread; ++i) { workers[i] = std::thread([&, i]() { - for (int l = i; l < n; l += n_thread) { + for (int l = i; l < n_codes; l += n_thread) { irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); for (int j = 0; j < n_fft; ++j) { res [l*n_fft + j] *= hann[j]; @@ -256,8 +256,8 @@ static std::vector embd_to_audio( workers[i].join(); } - std::vector audio; - std::vector env; + std::vector audio; + std::vector env; fold(res, n_out, n_win, n_hop, n_pad, audio); fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once @@ -844,12 +844,14 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 const auto t_voc_start = ggml_time_us(); - llama_batch batch = llama_batch_init(codes.size(), 0, 1); + const int n_codes = codes.size(); + + llama_batch batch = llama_batch_init(n_codes, 0, 1); for (size_t i = 0; i < codes.size(); ++i) { common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all 
logits? } - GGML_ASSERT(batch.n_tokens == (int) codes.size()); + GGML_ASSERT(batch.n_tokens == n_codes); if (llama_decode(ctx_cts, batch) != 0) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -862,12 +864,40 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 const auto t_spec_start = ggml_time_us(); +#if 1 // spectral operations - // TODO: not optimized at all const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); + auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads); + +#else + // read the spectrogram from a file for debugging purposes + std::vector audio; + { + std::ifstream fin("out.bin", std::ios::binary); + if (!fin) { + LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin"); + return 1; + } + + std::vector embd; + + int n_codes; + int n_embd; + + fin.read(reinterpret_cast(&n_codes), sizeof(int)); + fin.read(reinterpret_cast(&n_embd), sizeof(int)); + + embd.resize(n_codes * n_embd); + fin.read(reinterpret_cast(embd.data()), n_codes * n_embd * sizeof(float)); + fin.close(); + + LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd); + + audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads); + } +#endif const std::string fname = "output.wav"; From 2a1a6f6326403032602897d9f48552f182c08be6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 14:10:59 +0200 Subject: [PATCH 42/45] server : fix rebase artifacts --- examples/server/server.cpp | 4 ++-- examples/server/tests/unit/test_embedding.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 57db582d72754..5ed4e8d274428 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -687,6 +687,8 @@ struct server_task_result_cmpl_partial : server_task_result { json second_ret = json{ {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, + {"delta", json { + {"content", content}}} }})}, {"created", t}, {"id", oaicompat_cmpl_id}, @@ -702,7 +704,6 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json { {"content", content}, - {"tokens", tokens} }}, }}); } @@ -1016,7 +1017,6 @@ struct server_slot { n_prompt_tokens = 0; last_nl_pos = 0; generated_text = ""; - generated_tokens = {}; has_new_line = false; truncated = false; stop = STOP_TYPE_NONE; diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index af6d148537452..e32d745829605 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -92,10 +92,6 @@ def test_embedding_pooling_none(): for x in res.body[0]['embedding']: assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON - # make sure embedding vector is not normalized - for x in res.body[0]['embedding']: - assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON - def test_embedding_pooling_none_oai(): global server From 29df666d44efd6a53f15b3a7c4875b5d9483a9b2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 14:13:09 +0200 Subject: [PATCH 43/45] tts : enable "return_tokens" in Python example ggml-ci --- examples/tts/tts-outetts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index 3c085dd5437de..bebd38ef4a5fe 100644 --- a/examples/tts/tts-outetts.py +++ 
b/examples/tts/tts-outetts.py @@ -132,6 +132,7 @@ def process_text(text: str): "prompt": [prefix + words, *suffix], "n_predict": 1024, "cache_prompt": True, + "return_tokens": True, "samplers": ["top_k"], "top_k": 16, "seed": 1003, From a95191c468f5fa81ca4ed080e2dd63bf6be0f744 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 17:50:57 +0200 Subject: [PATCH 44/45] tts : minor fixes --- examples/tts/tts-outetts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index bebd38ef4a5fe..0f81192fca52d 100644 --- a/examples/tts/tts-outetts.py +++ b/examples/tts/tts-outetts.py @@ -1,6 +1,6 @@ import sys -import json -import struct +#import json +#import struct import requests import re @@ -162,7 +162,7 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) # spectrogram -embd = response_json["data"][0]["embedding"] +embd = response_json[0]["embedding"] n_codes = len(embd) n_embd = len(embd[0]) From c0df192838f51507e06b7293030b43232cd2670f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 19:22:56 +0200 Subject: [PATCH 45/45] common : support HF download for vocoder --- common/arg.cpp | 50 +++++++++++++++++++++++++++++++------------- common/common.cpp | 7 ++++--- common/common.h | 6 +++++- examples/tts/tts.cpp | 7 ++++++- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 93c15ecdcb4d2..e5ddd8318f787 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -119,29 +119,33 @@ std::string common_arg::to_string() { // utils // -static void common_params_handle_model_default(common_params & params) { - if (!params.hf_repo.empty()) { +static void common_params_handle_model_default( + std::string & model, + std::string & model_url, + std::string & hf_repo, + std::string & hf_file) { + if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { + if (hf_file.empty()) { + if (model.empty()) { throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - params.hf_file = params.model; - } else if (params.model.empty()) { + hf_file = model; + } else if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = params.hf_repo + "_" + params.hf_file; + std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename string_replace_all(filename, "/", "_"); - params.model = fs_get_cache_file(filename); + model = fs_get_cache_file(filename); } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); + } else if (!model_url.empty()) { + if (model.empty()) { + auto f = string_split(model_url, '#').front(); f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); + model = fs_get_cache_file(string_split(f, '/').back()); } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; + } else if (model.empty()) { + model = DEFAULT_MODEL_PATH; } } @@ -276,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model_default(params); + // TODO: refactor model params in a common struct + 
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); if (params.escape) { string_process_escapes(params.prompt); @@ -1581,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(common_arg( + {"-hfrv", "--hf-repo-v"}, "REPO", + "Hugging Face model repository for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO_V")); + add_opt(common_arg( + {"-hffv", "--hf-file-v"}, "FILE", + "Hugging Face model file for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", diff --git a/common/common.cpp b/common/common.cpp index 05d3ba766e38b..20be9291161ca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1095,7 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 -static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; while (remaining_attempts > 0) { @@ -1119,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { @@ -1192,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa std::string etag; std::string last_modified; }; + common_load_model_from_url_headers headers; + { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); diff --git a/common/common.h b/common/common.h index c09c4eb0d3628..1d2bd932c211d 100644 --- a/common/common.h +++ b/common/common.h @@ -175,7 +175,11 @@ struct common_params_speculative { }; struct common_params_vocoder { - std::string model = ""; // vocoder model for producing audio // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + + std::string model = ""; // model path // NOLINT + std::string model_url = ""; // model url to download // NOLINT }; struct common_params { diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 9c3d58f891703..7f36b80f0dee1 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -461,7 +461,12 @@ int main(int argc, char ** argv) { model_ttc = llama_init_ttc.model; ctx_ttc = 
llama_init_ttc.context; - params.model = params.vocoder.model; + // TODO: refactor in a common struct + params.model = params.vocoder.model; + params.model_url = params.vocoder.model_url; + params.hf_repo = params.vocoder.hf_repo; + params.hf_file = params.vocoder.hf_file; + params.embedding = true; common_init_result llama_init_cts = common_init_from_params(params);
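
Note on the remaining TODO in tts-outetts.py: patch 41 prints the spectrogram dimensions but leaves the spectrogram-to-audio conversion unimplemented on the Python side ("TODO: see the tts.cpp:embd_to_audio() and implement it in Python"). Below is a minimal NumPy sketch of what such a port of tts.cpp:embd_to_audio() and save_wav16() could look like. The constants (n_fft = 1280, n_hop = 320, n_win = 1280) and the log-magnitude/phase layout are taken from the C++ code in patch 41; the final 1/N normalization of the naive inverse transform, the overlap-add epsilon, and the 24 kHz sample rate are assumptions, since those parts are truncated or absent in this excerpt. This is an untested illustration, not part of the patch series.

import wave
import numpy as np

def embd_to_audio(embd, n_codes, n_embd):
    # expects n_embd == n_fft + 2 (1282), so that n_embd/2 == n_fft/2 + 1 bins
    E = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)

    n_fft, n_hop, n_win = 1280, 320, 1280
    n_pad = (n_win - n_hop) // 2            # 480
    n_out = (n_codes - 1) * n_hop + n_win

    hann = np.hanning(n_fft + 1)[:-1]       # periodic Hann, as in fill_hann_window()

    # first half of each embedding row is log-magnitude, second half is phase
    mag = np.minimum(np.exp(E[:, :n_embd // 2]), 1e2)
    phi = E[:, n_embd // 2:]
    spec = mag * (np.cos(phi) + 1j * np.sin(phi))       # (n_codes, n_fft/2 + 1)

    # naive inverse transform mirroring tts.cpp:irfft(): sum over the stored
    # bins only; the trailing division by N is assumed from the upstream code
    N = n_fft // 2 + 1
    tw = np.exp(2j * np.pi * np.outer(np.arange(n_fft), np.arange(N)) / n_fft)
    frames = (spec @ tw.T).real / N                     # (n_codes, n_fft)

    res   = frames * hann                               # windowed frames
    hann2 = np.tile(hann * hann, (n_codes, 1))          # window envelope

    def fold(rows):
        # overlap-add with hop n_hop, then trim n_pad samples on both sides,
        # matching the net effect of tts.cpp:fold()
        out = np.zeros(n_out)
        for l in range(n_codes):
            out[l * n_hop : l * n_hop + n_win] += rows[l]
        return out[n_pad : n_out - n_pad]

    audio = fold(res)
    env   = fold(hann2)
    return audio / np.maximum(env, 1e-8)                # epsilon is an assumption

def save_wav16(path, audio, sample_rate=24000):         # 24 kHz is an assumption
    # 16-bit mono PCM, analogous to tts.cpp:save_wav16()
    pcm = np.clip(np.asarray(audio) * 32767.0, -32768, 32767).astype('<i2')
    with wave.open(path, 'wb') as f:
        f.setnchannels(1)   # mono
        f.setsampwidth(2)   # 16-bit samples
        f.setframerate(sample_rate)
        f.writeframes(pcm.tobytes())

Usage, continuing tts-outetts.py after the embedding response is parsed:

audio = embd_to_audio(embd, n_codes, n_embd)
save_wav16('output.wav', audio)

Separately, with patch 45 applied the vocoder model no longer has to be a local file: the new -hfrv/--hf-repo-v and -hffv/--hf-file-v arguments (and the LLAMA_ARG_HF_REPO_V / LLAMA_ARG_HF_FILE_V environment variables) download it from Hugging Face, mirroring the existing -hfr/-hff handling for the main model.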