From 89eaf5036a07b394bea4c7b35f97d171bee2e87b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 21:03:24 +0200
Subject: [PATCH 01/45] server : add "tokens" output

ggml-ci
---
 examples/server/server.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5ed4e8d274428..57db582d72754 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -687,8 +687,6 @@ struct server_task_result_cmpl_partial : server_task_result {
            json second_ret = json{
                {"choices", json::array({json{{"finish_reason", nullptr},
                                              {"index", 0},
-                                             {"delta", json {
-                                                 {"content", content}}}
                                              }})},
                {"created", t},
                {"id", oaicompat_cmpl_id},
@@ -704,6 +702,7 @@ struct server_task_result_cmpl_partial : server_task_result {
                {"delta", json {
                    {"content", content},
+                   {"tokens", tokens}
                }},
            }});
        }
@@ -1017,6 +1016,7 @@ struct server_slot {
        n_prompt_tokens  = 0;
        last_nl_pos      = 0;
        generated_text   = "";
+       generated_tokens = {};
        has_new_line     = false;
        truncated        = false;
        stop             = STOP_TYPE_NONE;
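With this patch each streamed delta carries the raw token ids next to the decoded text. A minimal consumer sketch, assuming nlohmann::json (which the server already uses) and the chunk layout shown in the diff above; read_delta is a hypothetical helper, not server code:

    #include <nlohmann/json.hpp>
    #include <string>
    #include <vector>

    using json = nlohmann::json;

    // pull both the text delta and the raw token ids out of one streamed chunk
    static void read_delta(const json & chunk, std::string & text, std::vector<int32_t> & toks) {
        const json & delta = chunk.at("choices").at(0).at("delta");
        if (delta.contains("content") && delta["content"].is_string()) {
            text += delta["content"].get<std::string>();
        }
        if (delta.contains("tokens")) {
            for (const auto & t : delta["tokens"]) {
                toks.push_back(t.get<int32_t>());
            }
        }
    }
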
From 06e85401b0f265f8ffa76144d31c1c7fdca45b6b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 10:56:20 +0200
Subject: [PATCH 02/45] server : output embeddings for all tokens when pooling
 = none

ggml-ci
---
 examples/server/tests/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index 277125e88b534..da95c830b036d 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -275,7 +275,7 @@ def tinyllama2() -> ServerProcess:
        return server

    @staticmethod
-    def bert_bge_small() -> ServerProcess:
+    def bert_bge_small(pooling = 'last') -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
@@ -286,6 +286,7 @@ def bert_bge_small() -> ServerProcess:
        server.n_slots = 2
        server.seed = 42
        server.server_embeddings = True
+        server.pooling = pooling
        return server

    @staticmethod

From 1b18b2d7b0d55da16b3c1b1d8abd94e5be85bf91 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 11:45:18 +0200
Subject: [PATCH 03/45] server : be explicit about the pooling type in the
 tests

ggml-ci
---
 examples/server/tests/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py
index da95c830b036d..277125e88b534 100644
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@@ -275,7 +275,7 @@ def tinyllama2() -> ServerProcess:
        return server

    @staticmethod
-    def bert_bge_small(pooling = 'last') -> ServerProcess:
+    def bert_bge_small() -> ServerProcess:
        server = ServerProcess()
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
@@ -286,7 +286,6 @@ def bert_bge_small() -> ServerProcess:
        server.n_slots = 2
        server.seed = 42
        server.server_embeddings = True
-        server.pooling = pooling
        return server

    @staticmethod

From e65556f1748451847b1b84382348e150a200bb34 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Dec 2024 13:36:32 +0200
Subject: [PATCH 04/45] server : do not normalize embeddings when there is no
 pooling

ggml-ci
---
 examples/server/tests/unit/test_embedding.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py
index e32d745829605..af6d148537452 100644
--- a/examples/server/tests/unit/test_embedding.py
+++ b/examples/server/tests/unit/test_embedding.py
@@ -92,6 +92,10 @@ def test_embedding_pooling_none():
    for x in res.body[0]['embedding']:
        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON

+    # make sure embedding vector is not normalized
+    for x in res.body[0]['embedding']:
+        assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON
+

def test_embedding_pooling_none_oai():
    global server
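The test above checks the new behavior end to end: with pooling "none" the rows are raw per-token hidden states and are not L2-normalized. The same invariant can be checked on the client side; a sketch, assuming emb holds one row of the returned embedding matrix:

    #include <cmath>
    #include <vector>

    // with pooling "none" the rows are raw hidden states, so their L2 norm
    // should generally differ from 1 (normalization is skipped)
    static bool looks_normalized(const std::vector<float> & emb, float eps = 1e-3f) {
        double sum = 0.0;
        for (float v : emb) {
            sum += (double) v * v;
        }
        return std::fabs(std::sqrt(sum) - 1.0) < eps;
    }
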
From f16996515861463b6a7dd1c4ce85aee52e74b96a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 14:40:03 +0200
Subject: [PATCH 05/45] llama : add OuteTTS support (wip)

---
 convert_hf_to_gguf.py            |  26 ++++++
 examples/tts/convert_pt_to_hf.py | 141 +++++++++++++++++++++++++++++++
 gguf-py/gguf/constants.py        |  65 ++++++++++++++
 gguf-py/gguf/tensor_mapping.py   |  83 ++++++++++++++++++
 4 files changed, 315 insertions(+)
 create mode 100644 examples/tts/convert_pt_to_hf.py

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9dc1673bc2c06..2a9ed6a71fcc4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -689,6 +689,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:

        return res
        # Marker: End get_vocab_base_pre

+    def _set_vocab_none(self) -> None:
+        self.gguf_writer.add_tokenizer_model("none")
+
    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
@@ -2027,6 +2030,29 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:

                yield name, data

+
+@Model.register("OuteTTSVocoder")
+class OuteTTSVocoderModel(Model):
+    model_arch = gguf.MODEL_ARCH.OUTETTS_VOC
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if \
+                name.endswith("codebook.cluster_size") or \
+                name.endswith("codebook.embed_avg") or \
+                name.endswith("codebook.inited"):
+            logger.debug(f"Skipping {name!r}")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def set_vocab(self):
+        self._set_vocab_none()
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_block_count(self.block_count)
+
+
 @Model.register("Qwen2MoeForCausalLM")
 class Qwen2MoeModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN2MOE

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
new file mode 100644
index 0000000000000..c77aee6a8c0e2
--- /dev/null
+++ b/examples/tts/convert_pt_to_hf.py
@@ -0,0 +1,141 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTS vocoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten

import torch
import json
import os
import sys
import re

from safetensors.torch import save_file

# change path to script dir
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# default
model_path = './model.pt'

# read from CLI
if len(sys.argv) > 1:
    model_path = sys.argv[1]

# get the directory of the input model
path_dst = os.path.dirname(model_path)

print(f"Loading model from {model_path}")

model = torch.load(model_path, map_location='cpu')

#print(model)

# print all keys
for key in model.keys():
    print(key)
    if key == 'hyper_parameters':
        #print(model[key])
        # dump as json pretty
        print(json.dumps(model[key], indent=4))
    #if key != 'state_dict' and key != 'optimizer_states':
    #    print(model[key])

# Check if the loaded model is a state_dict or a model instance
if isinstance(model, torch.nn.Module):
    state_dict = model.state_dict()
else:
    state_dict = model

# Print the structure of the state_dict to understand its format
print("State dictionary keys:")
for key in state_dict.keys():
    print(key)

# Ensure the state_dict is flat and contains only torch.Tensor objects
def flatten_state_dict(state_dict, parent_key='', sep='.'):
    items = []
    items_new = []

    for k, v in state_dict.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, torch.Tensor):
            items.append((new_key, v))
        elif isinstance(v, dict):
            items.extend(flatten_state_dict(v, new_key, sep=sep).items())
        return dict(items)

    size_total_mb = 0

    for key, value in list(items):
        # keep only what we need for inference
        if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
           not key.startswith('state_dict.backbone.') and \
           not key.startswith('state_dict.head.'):
            print('Skipping key: ', key)
            continue

        new_key = key

        new_key = new_key.replace('state_dict.', '')

        # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight"
        if new_key.startswith("backbone.pos_net."):
            match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key)
            if match:
                new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}"

        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

        size_total_mb += size_mb

        #print(key, '->', new_key, ': ', value)
        #print(key, '->', new_key)

        items_new.append((new_key, value))

    print(f"Total size: {size_total_mb:8.2f} MB")

    return dict(items_new)

flattened_state_dict = flatten_state_dict(state_dict)


# Convert the model to the safetensors format
output_path = path_dst + '/model.safetensors'
save_file(flattened_state_dict, output_path)

print(f"Model has been successfully converted and saved to {output_path}")

# Calculate the total size of the .safetensors file
total_size = os.path.getsize(output_path)

# Create the weight map
weight_map = {
    "model.safetensors": ["*"]  # Assuming all weights are in one file
}

# Create metadata for the index.json file
metadata = {
    "total_size": total_size,
    "weight_map": weight_map
}

# Save the metadata to index.json
index_path = path_dst + '/index.json'
with open(index_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata has been saved to {index_path}")

config = {
    "architectures": [
        "OuteTTSVocoder"
    ],
    "num_hidden_layers": 12
}

with open(path_dst + '/config.json', 'w') as f:
    json.dump(config, f, indent=4)

print(f"Config has been saved to {path_dst + '/config.json'}")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c2c7cad14e500..37d8bce4778ae 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -261,11 +261,13 @@ class MODEL_ARCH(IntEnum):
    GRANITE     = auto()
    GRANITE_MOE = auto()
    CHAMELEON   = auto()
+    OUTETTS_VOC = auto()


class MODEL_TENSOR(IntEnum):
    TOKEN_EMBD       = auto()
    TOKEN_EMBD_NORM  = auto()
+    TOKEN_EMBD_SHIFT = auto()
    TOKEN_TYPES      = auto()
    POS_EMBD         = auto()
    OUTPUT           = auto()
@@ -370,6 +372,24 @@ class MODEL_TENSOR(IntEnum):
    ENC_OUTPUT_NORM = auto()
    CLS             = auto()  # classifier
    CLS_OUT         = auto()  # classifier output projection
+    CONV_NEXT_DW      = auto()
+    CONV_NEXT_NORM    = auto()
+    CONV_NEXT_SHIFT   = auto()
+    CONV_NEXT_PW1     = auto()
+    CONV_NEXT_PW2     = auto()
+    CONV_NEXT_GAMMA   = auto()
+    POS_NET_CONV1     = auto()
+    POS_NET_CONV2     = auto()
+    POS_NET_NORM      = auto()
+    POS_NET_NORM1     = auto()
+    POS_NET_NORM2     = auto()
+    POS_NET_ATTN_NORM = auto()
+    POS_NET_ATTN_Q    = auto()
+    POS_NET_ATTN_K    = auto()
+    POS_NET_ATTN_V    = auto()
+    POS_NET_ATTN_OUT  = auto()
+    QNTZ_CBOOK_EMBD   = auto()
+    HANN_WINDOW       = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -425,11 +445,13 @@ class MODEL_TENSOR(IntEnum):
    MODEL_ARCH.GRANITE:     "granite",
    MODEL_ARCH.GRANITE_MOE: "granitemoe",
    MODEL_ARCH.CHAMELEON:   "chameleon",
+    MODEL_ARCH.OUTETTS_VOC: "outetts-voc",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.TOKEN_EMBD:       "token_embd",
    MODEL_TENSOR.TOKEN_EMBD_NORM:  "token_embd_norm",
+    MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift",
    MODEL_TENSOR.TOKEN_TYPES:      "token_types",
    MODEL_TENSOR.POS_EMBD:         "position_embd",
    MODEL_TENSOR.OUTPUT_NORM:      "output_norm",
@@ -534,6 +556,24 @@ class MODEL_TENSOR(IntEnum):
    MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
    MODEL_TENSOR.CLS:             "cls",
    MODEL_TENSOR.CLS_OUT:         "cls.output",
+    MODEL_TENSOR.CONV_NEXT_DW:      "conv_next.{bid}.dw",
+    MODEL_TENSOR.CONV_NEXT_NORM:    "conv_next.{bid}.norm",
+    MODEL_TENSOR.CONV_NEXT_SHIFT:   "conv_next.{bid}.shift",
+    MODEL_TENSOR.CONV_NEXT_PW1:     "conv_next.{bid}.pw1",
+    MODEL_TENSOR.CONV_NEXT_PW2:     "conv_next.{bid}.pw2",
+    MODEL_TENSOR.CONV_NEXT_GAMMA:   "conv_next.{bid}.gamma",
+    MODEL_TENSOR.POS_NET_CONV1:     "pos_net.{bid}.conv1",
+    MODEL_TENSOR.POS_NET_CONV2:     "pos_net.{bid}.conv2",
+    MODEL_TENSOR.POS_NET_NORM:      "pos_net.{bid}.norm",
+    MODEL_TENSOR.POS_NET_NORM1:     "pos_net.{bid}.norm1",
+    MODEL_TENSOR.POS_NET_NORM2:     "pos_net.{bid}.norm2",
+    MODEL_TENSOR.POS_NET_ATTN_NORM: "pos_net.{bid}.attn_norm",
+    MODEL_TENSOR.POS_NET_ATTN_Q:    "pos_net.{bid}.attn_q",
+    MODEL_TENSOR.POS_NET_ATTN_K:    "pos_net.{bid}.attn_k",
+    MODEL_TENSOR.POS_NET_ATTN_V:    "pos_net.{bid}.attn_v",
+    MODEL_TENSOR.POS_NET_ATTN_OUT:  "pos_net.{bid}.attn_output",
+    MODEL_TENSOR.QNTZ_CBOOK_EMBD:   "qntz.cbook.{bid}.embd",
+    MODEL_TENSOR.HANN_WINDOW:       "hann_window",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1372,6 +1412,31 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.OUTETTS_VOC: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.TOKEN_EMBD_SHIFT,
+        MODEL_TENSOR.CONV_NEXT_DW,
+        MODEL_TENSOR.CONV_NEXT_NORM,
+        MODEL_TENSOR.CONV_NEXT_SHIFT,
+        MODEL_TENSOR.CONV_NEXT_PW1,
+        MODEL_TENSOR.CONV_NEXT_PW2,
+        MODEL_TENSOR.CONV_NEXT_GAMMA,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.POS_NET_CONV1,
+        MODEL_TENSOR.POS_NET_CONV2,
+        MODEL_TENSOR.POS_NET_NORM,
+        MODEL_TENSOR.POS_NET_NORM1,
+        MODEL_TENSOR.POS_NET_NORM2,
+        MODEL_TENSOR.POS_NET_ATTN_NORM,
+        MODEL_TENSOR.POS_NET_ATTN_Q,
+        MODEL_TENSOR.POS_NET_ATTN_K,
+        MODEL_TENSOR.POS_NET_ATTN_V,
+        MODEL_TENSOR.POS_NET_ATTN_OUT,
+        MODEL_TENSOR.QNTZ_CBOOK_EMBD,
+        MODEL_TENSOR.HANN_WINDOW,
+    ],
    # TODO
}
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 573d0282ea599..39eeea434468d 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -28,6 +28,7 @@ class TensorNameMap:
            "transformer.token_embeddings",  # openelm
            "shared",                        # t5
            "rwkv.embeddings",               # rwkv
+            "backbone.embed",                # outetts
        ),

        # Token type embeddings
@@ -42,6 +43,11 @@ class TensorNameMap:
            "emb_ln",                # nomic-bert
            "transformer.norm",      # openelm
            "rwkv.blocks.0.pre_ln",  # rwkv
+            "backbone.norm.scale",   # outetts
+        ),
+
+        MODEL_TENSOR.TOKEN_EMBD_SHIFT: (
+            "backbone.norm.shift",  # outetts
        ),

        # Position embeddings
@@ -60,6 +66,7 @@ class TensorNameMap:
            "lm_head.linear",  # phi2
            "output_layer",    # chatglm
            "head",            # rwkv
+            "head.out",        # outetts
        ),

        # Output norm
@@ -80,6 +87,7 @@ class TensorNameMap:
            "transformer.norm",           # openelm
            "model.norm",                 # nemotron
            "rwkv.ln_out",                # rwkv
+            "backbone.final_layer_norm",  # outetts
        ),

        # Rope frequencies
@@ -90,6 +98,10 @@ class TensorNameMap:

        MODEL_TENSOR.ROPE_FACTORS_LONG:  (),
        MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
+
+        MODEL_TENSOR.HANN_WINDOW: (
+            "head.istft.window",  # outetts
+        ),
    }

    block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -681,6 +693,8 @@ class TensorNameMap:
            "encoder.block.{bid}.layer.1.DenseReluDense.wo",  # t5
        ),

+        ############################################################################
+        # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg
        MODEL_TENSOR.ENC_OUTPUT_NORM: (
            "encoder.final_layer_norm",  # t5
        ),
@@ -693,6 +707,75 @@ class TensorNameMap:
        MODEL_TENSOR.CLS_OUT: (
            "classifier.out_proj",  # roberta
        ),
+        #############################################################################
+
+        MODEL_TENSOR.CONV_NEXT_DW: (
+            "backbone.convnext.{bid}.dwconv",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_NORM: (
+            "backbone.convnext.{bid}.norm.scale",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_SHIFT: (
+            "backbone.convnext.{bid}.norm.shift",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_PW1: (
+            "backbone.convnext.{bid}.pwconv1",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_PW2: (
+            "backbone.convnext.{bid}.pwconv2",  # outetts
+        ),
+
+        MODEL_TENSOR.CONV_NEXT_GAMMA: (
+            "backbone.convnext.{bid}.gamma",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_CONV1: (
+            "backbone.pos_net.{bid}.conv1",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_CONV2: (
+            "backbone.pos_net.{bid}.conv2",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM: (
+            "backbone.pos_net.{bid}.norm",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM1: (
+            "backbone.pos_net.{bid}.norm1",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_NORM2: (
+            "backbone.pos_net.{bid}.norm2",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_NORM: (
+            "backbone.pos_net.{bid}.norm",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_Q: (
+            "backbone.pos_net.{bid}.q",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_K: (
+            "backbone.pos_net.{bid}.k",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_V: (
+            "backbone.pos_net.{bid}.v",  # outetts
+        ),
+
+        MODEL_TENSOR.POS_NET_ATTN_OUT: (
+            "backbone.pos_net.{bid}.proj_out",  # outetts
+        ),
+
+        MODEL_TENSOR.QNTZ_CBOOK_EMBD: (
+            "feature_extractor.encodec.quantizer.vq.layers.{bid}._codebook.embed",  # outetts
+        ),
    }

    # architecture-specific block mappings
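A note on the naming convention this patch establishes: the Python side keeps a "{bid}" placeholder in per-block tensor names, while the C++ loader in the later patches of this series keeps the same patterns with "%d" and formats the block id in. A hypothetical sketch of the correspondence (tensor_name is not a real llama.cpp function):

    #include <cstdio>
    #include <string>

    // mirror of the gguf-py template "pos_net.{bid}.conv1": llama.cpp stores
    // the pattern as "pos_net.%d.conv1" and formats the block id at load time
    static std::string tensor_name(const char * pattern, int bid, const char * suffix) {
        char buf[256];
        snprintf(buf, sizeof(buf), pattern, bid);
        return std::string(buf) + "." + suffix;
    }

    // tensor_name("pos_net.%d.conv1", 0, "weight") -> "pos_net.0.conv1.weight"
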
"vocoder model for audio generation (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index ec0e49f6f1806..c09c4eb0d3628 100644 --- a/common/common.h +++ b/common/common.h @@ -80,6 +80,7 @@ enum llama_example { LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, + LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_COUNT, }; @@ -159,6 +160,7 @@ struct common_params_sampling { struct common_params_speculative { std::vector devices; // devices to use for offloading + int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -172,6 +174,10 @@ struct common_params_speculative { std::string model = ""; // draft model for speculative decoding // NOLINT }; +struct common_params_vocoder { + std::string model = ""; // vocoder model for producing audio // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -214,8 +220,9 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_params_sampling sampling; + struct common_params_sampling sampling; struct common_params_speculative speculative; + struct common_params_vocoder vocoder; std::string model = ""; // model path // NOLINT std::string model_alias = ""; // model alias // NOLINT diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2a9ed6a71fcc4..4b51a2ad99423 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -221,17 +221,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -2050,7 +2050,8 @@ def set_vocab(self): self._set_vocab_none() def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) @Model.register("Qwen2MoeForCausalLM") diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 21b31392e81d0..66cfab2c3b796 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -51,6 +51,7 @@ else() add_subdirectory(speculative) 
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2a9ed6a71fcc4..4b51a2ad99423 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -221,17 +221,17 @@ def set_gguf_parameters(self):
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        logger.info(f"gguf: embedding length = {n_embd}")
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            logger.info(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
            logger.info(f"gguf: feed forward length = {n_ff}")

-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-        logger.info(f"gguf: head count = {n_head}")
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            logger.info(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
@@ -2050,7 +2050,8 @@ def set_vocab(self):
        self._set_vocab_none()

    def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.block_count)
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 21b31392e81d0..66cfab2c3b796 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -51,6 +51,7 @@ else()
        add_subdirectory(speculative)
        add_subdirectory(speculative-simple)
        add_subdirectory(tokenize)
+        add_subdirectory(tts)
        add_subdirectory(gen-docs)
        if (NOT GGML_BACKEND_DL)
            # these examples use the backends directly and cannot be built with dynamic loading

diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt
new file mode 100644
index 0000000000000..c72bd814c3b31
--- /dev/null
+++ b/examples/tts/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-tts)
add_executable(${TARGET} tts.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index c77aee6a8c0e2..a652bae4361a2 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -84,6 +84,10 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
            if match:
                new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}"

+        # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
+        if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
+            new_key = "backbone.embedding.weight"
+
        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

@@ -132,6 +136,9 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
    "architectures": [
        "OuteTTSVocoder"
    ],
+    "hidden_size": 512,
+    "vocab_size": 4096,
+    "max_position_embeddings": 8192, # ?
    "num_hidden_layers": 12
}

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
new file mode 100644
index 0000000000000..768015a5271c2
--- /dev/null
+++ b/examples/tts/tts.cpp
@@ -0,0 +1,186 @@
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <algorithm>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

//
// Terminal utils
//

#define SQR(X)    ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40

/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 */
static int rgb2xterm256(int r, int g, int b) {
    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
    int av, ir, ig, ib, il, qr, qg, qb, ql;
    av = r * .299 + g * .587 + b * .114 + .5;
    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
    qr = cube[(ir = UNCUBE(r))];
    qg = cube[(ig = UNCUBE(g))];
    qb = cube[(ib = UNCUBE(b))];
    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;
    return il + 0350;
}

static std::string set_xterm256_foreground(int r, int g, int b) {
    int x = rgb2xterm256(r, g, b);
    std::ostringstream oss;
    oss << "\033[38;5;" << x << "m";
    return oss.str();
}

const std::vector<std::string> k_colors = {
    set_xterm256_foreground(220,   5,  12),
    set_xterm256_foreground(232,  96,  28),
    set_xterm256_foreground(241, 147,  45),
    set_xterm256_foreground(246, 193,  65),
    set_xterm256_foreground(247, 240,  86),
    set_xterm256_foreground(144, 201, 135),
    set_xterm256_foreground( 78, 178, 101),
};
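// A hedged usage sketch, not part of the original patch: print text in an
// approximate 24-bit color using the helper above, then reset the terminal.
// The reset sequence "\033[0m" is plain ANSI, nothing defined in this file.
static void print_colored(const std::string & text, int r, int g, int b) {
    printf("%s%s\033[0m", set_xterm256_foreground(r, g, b).c_str(), text.c_str());
}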
static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s -m model.gguf -p \"Hello!\"\n", argv[0]);
    LOG("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    params.prompt = "";

    params.n_predict = 1024;
    params.n_batch   = 8192;
    params.n_ctx     = 8192;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
        return 1;
    }

    common_init();

    // init LLM

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model_ttc = NULL; // text-to-codes
    llama_model * model_cts = NULL; // codes-to-speech

    llama_context * ctx_ttc = NULL;
    llama_context * ctx_cts = NULL;

    common_init_result llama_init_ttc = common_init_from_params(params);
    model_ttc = llama_init_ttc.model;
    ctx_ttc   = llama_init_ttc.context;

    params.model = params.vocoder.model;

    common_init_result llama_init_cts = common_init_from_params(params);
    model_cts = llama_init_cts.model;
    ctx_cts   = llama_init_cts.context;

    const auto t_main_start = ggml_time_us();
    std::vector<llama_token> prompt_inp = {198, 88225, 155856, 151669, 152205,
        153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695,
        153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010,
        153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286,
        152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296,
        153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690,
        153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061,
        153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670,
        198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683,
        152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908,
        151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359,
        153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424,
        151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670,
        198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729,
        152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669,
        153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670,
        198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501,
        152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242,
        153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360,
        153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055,
        152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670,
        198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441,
        152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831,
        153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133,
        153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109,
        152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055,
        155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729,
        151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337,
        153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153,
        153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365,
        153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218,
        152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464,
        152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855,
        152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418,
        153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645};

    {
        const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true);
        LOG_INF("prompt: '%s'\n", inp_txt.c_str());
        LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size());
    }

    // remove all non-audio tokens (i.e. < 151672 || > 155772)
    prompt_inp.erase(std::remove_if(prompt_inp.begin(), prompt_inp.end(), [](llama_token t) { return t < 151672 || t > 155772; }), prompt_inp.end());

    {
        const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true);
        LOG_INF("prompt audio: '%s'\n", inp_txt.c_str());
        LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
    }


    llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);

    // evaluate the initial prompt
    for (size_t i = 0; i < prompt_inp.size(); ++i) {
        common_batch_add(batch, prompt_inp[i], i, { 0 }, true); // TODO: all logits?
    }
    GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());

    if (llama_decode(ctx_ttc, batch) != 0) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

    llama_synchronize(ctx_ttc);

    LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

    const float * embd = llama_get_embeddings(ctx_ttc);

    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");

    fprintf(stderr, "\n");

    llama_free(ctx_ttc);
    llama_free_model(model_ttc);

    llama_free(ctx_cts);
    llama_free_model(model_cts);

    llama_backend_free();

    return 0;
}
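The hardcoded prompt above is a precomputed OuteTTS "codes" sequence, and the erase call keeps only the ids in the audio-token block before anything is decoded. A hedged helper capturing that step plus the rebase that a later patch in this series adds (the bounds 151672..155772 are taken verbatim from the filter above):

    #include <vector>
    #include "llama.h"

    // keep only audio tokens and shift them into the vocoder codebook range,
    // mirroring the erase above and the "token -= 151672" loop added later
    static std::vector<llama_token> extract_audio_codes(const std::vector<llama_token> & inp) {
        std::vector<llama_token> codes;
        for (llama_token t : inp) {
            if (t >= 151672 && t <= 155772) {
                codes.push_back(t - 151672);
            }
        }
        return codes;
    }
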
MODEL_TENSOR.HANN_WINDOW: "hann_window", } @@ -1416,6 +1416,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_EMBD_SHIFT, + MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, MODEL_TENSOR.CONV_NEXT_SHIFT, @@ -1434,7 +1435,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K, MODEL_TENSOR.POS_NET_ATTN_V, MODEL_TENSOR.POS_NET_ATTN_OUT, - MODEL_TENSOR.QNTZ_CBOOK_EMBD, MODEL_TENSOR.HANN_WINDOW, ], # TODO diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 39eeea434468d..4355ccf111456 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -28,7 +28,7 @@ class TensorNameMap: "transformer.token_embeddings", # openelm "shared", # t5 "rwkv.embeddings", # rwkv - "backbone.embed", # outetts + "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" # outetts ), # Token type embeddings @@ -102,6 +102,10 @@ class TensorNameMap: MODEL_TENSOR.HANN_WINDOW: ( "head.istft.window", # outetts ), + + MODEL_TENSOR.CONV1D: ( + "backbone.embed", # roberta + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { @@ -772,10 +776,6 @@ class TensorNameMap: MODEL_TENSOR.POS_NET_ATTN_OUT: ( "backbone.pos_net.{bid}.proj_out", # outetts ), - - MODEL_TENSOR.QNTZ_CBOOK_EMBD: ( - "feature_extractor.encodec.quantizer.vq.layers.{bid}._codebook.embed", # outetts - ), } # architecture-specific block mappings diff --git a/src/llama.cpp b/src/llama.cpp index b7b04a41d99e6..eefedab8bb8ce 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -197,6 +197,7 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, + LLM_ARCH_OUTETTS_VOC, LLM_ARCH_UNKNOWN, }; @@ -253,6 +254,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, + { LLM_ARCH_OUTETTS_VOC, "outetts-voc" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -503,6 +505,7 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, + LLM_TENSOR_TOKEN_EMBD_SHIFT, LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, @@ -609,6 +612,24 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, + LLM_TENSOR_CONV1D, + LLM_TENSOR_CONV_NEXT_DW, + LLM_TENSOR_CONV_NEXT_NORM, + LLM_TENSOR_CONV_NEXT_SHIFT, + LLM_TENSOR_CONV_NEXT_PW1, + LLM_TENSOR_CONV_NEXT_PW2, + LLM_TENSOR_CONV_NEXT_GAMMA, + LLM_TENSOR_POS_NET_CONV1, + LLM_TENSOR_POS_NET_CONV2, + LLM_TENSOR_POS_NET_NORM, + LLM_TENSOR_POS_NET_NORM1, + LLM_TENSOR_POS_NET_NORM2, + LLM_TENSOR_POS_NET_ATTN_NORM, + LLM_TENSOR_POS_NET_ATTN_Q, + LLM_TENSOR_POS_NET_ATTN_K, + LLM_TENSOR_POS_NET_ATTN_V, + LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_HANN_WINDOW, }; static const std::map> LLM_TENSOR_NAMES = { @@ -1593,6 +1614,34 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, + { + LLM_ARCH_OUTETTS_VOC, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_EMBD_SHIFT, "token_embd_shift" }, + { LLM_TENSOR_CONV1D, "conv1d" }, + { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, + { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, + { LLM_TENSOR_CONV_NEXT_SHIFT, "conv_next.shift" }, + { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.pw1" }, + { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.pw2" }, + { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.gamma" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, 
+ { LLM_TENSOR_POS_NET_CONV1, "pos_net.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "pos_net.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "pos_net.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "pos_net.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "pos_net.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "pos_net.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "pos_net.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.attn_output" }, + { LLM_TENSOR_HANN_WINDOW, "hann_window" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2489,7 +2538,7 @@ struct llama_hparams { bool use_par_res; bool swin_norm; - uint32_t n_vocab; + uint32_t n_vocab = 0; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; uint32_t n_layer; @@ -3005,6 +3054,9 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; + // quantizer + struct ggml_tensor * qntz_cbook_embd = nullptr; + std::vector layers; // gguf metadata @@ -5519,7 +5571,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -5545,8 +5597,8 @@ static void llm_load_hparams( std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -6320,7 +6372,7 @@ static void llm_load_vocab( ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); - if (tokenizer_model == "no_vocab") { + if (tokenizer_model == "no_vocab" || tokenizer_model == "none") { vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens @@ -9336,9 +9388,9 @@ static bool llm_load_tensors( } break; case LLM_ARCH_CHAMELEON: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed @@ -9367,6 +9419,10 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_OUTETTS_VOC: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + } break; default: throw std::runtime_error("unknown architecture"); } @@ -20383,6 +20439,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: + case LLM_ARCH_OUTETTS_VOC: return 
From aac7e0495379217f69c59465a17d5fbb4a6699dd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 18:23:10 +0200
Subject: [PATCH 07/45] extract features

---
 examples/tts/convert_pt_to_hf.py |   1 +
 examples/tts/tts.cpp             |  15 ++++-
 src/llama.cpp                    | 104 +++++++++++++++++++++++--------
 3 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index a652bae4361a2..389d2de50efe9 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -138,6 +138,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
    ],
    "hidden_size": 512,
    "vocab_size": 4096,
+    "n_head": 1,
    "max_position_embeddings": 8192, # ?
    "num_hidden_layers": 12
}

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 768015a5271c2..d3fee7373ea3b 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -88,6 +88,7 @@ int main(int argc, char ** argv) {
    ctx_ttc = llama_init_ttc.context;

    params.model = params.vocoder.model;
+    params.embedding = true;

    common_init_result llama_init_cts = common_init_from_params(params);
    model_cts = llama_init_cts.model;
    ctx_cts   = llama_init_cts.context;
@@ -146,6 +147,9 @@ int main(int argc, char ** argv) {
        LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
    }

+    for (auto & token : prompt_inp) {
+        token -= 151672;
+    }

    llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);
@@ -155,22 +159,27 @@ int main(int argc, char ** argv) {
    }
    GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());

-    if (llama_decode(ctx_ttc, batch) != 0) {
+    if (llama_decode(ctx_cts, batch) != 0) {
        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
    }

-    llama_synchronize(ctx_ttc);
+    llama_synchronize(ctx_cts);

    LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

-    const float * embd = llama_get_embeddings(ctx_ttc);
+    const float * embd = llama_get_embeddings(ctx_cts);

    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");
+    double sum = 0.0;
+    for (int i = 0; i < 261*512; ++i) {
+        sum += embd[i];
+    }
+    LOG("sum: %f\n", sum);

    fprintf(stderr, "\n");
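With params.embedding = true and no pooling, the context returns one embedding row per input token; the 261*512 checksum above matches the 261 audio codes in the hardcoded prompt and the vocoder's hidden_size of 512 from its config. A sketch of walking that buffer row by row (assumption: the row count equals the number of decoded tokens):

    #include "llama.h"

    // per-token embeddings are laid out row-major: n_tokens rows of n_embd floats
    static double embd_checksum(llama_context * ctx, int n_tokens, int n_embd) {
        const float * embd = llama_get_embeddings(ctx);
        double sum = 0.0;
        for (int i = 0; i < n_tokens; ++i) {
            for (int j = 0; j < n_embd; ++j) {
                sum += embd[i*n_embd + j];
            }
        }
        return sum;
    }
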
diff --git a/src/llama.cpp b/src/llama.cpp
index eefedab8bb8ce..3262b0218eb1b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3614,7 +3614,9 @@ static bool llama_kv_cache_init(

    const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);

    cache.has_shift = false;
@@ -3655,10 +3657,12 @@ static bool llama_kv_cache_init(
    cache.k_l.reserve(n_layer);
    cache.v_l.reserve(n_layer);

-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
        ggml_backend_buffer_type_t buft;
        if (offload) {
            auto * dev = model.dev_layer.at(i).dev;
@@ -5032,7 +5036,8 @@ struct llama_model_loader {

    void done_getting_tensors() const {
        if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+            // TODO: TEMPORARY DISABLED
+            //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
        }
    }
@@ -9422,6 +9427,10 @@ static bool llm_load_tensors(
        case LLM_ARCH_OUTETTS_VOC:
            {
                model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                // output
+                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
+                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
            } break;
        default:
            throw std::runtime_error("unknown architecture");
@@ -16991,6 +17000,30 @@ struct llm_build_context {

        return gf;
    }
+
+    struct ggml_cgraph * build_outetts_voc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = inpL;
+
+        //cur = llm_build_norm(ctx0, cur, hparams,
+        //        model.output_norm, NULL,
+        //        LLM_NORM_RMS, cb, -1);
+        //cb(cur, "result_norm", -1);
+
+        //// lm_head
+        //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        //cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -17266,13 +17299,18 @@ static struct ggml_cgraph * llama_build_graph(
            {
                result = llm.build_chameleon();
            } break;
+        case LLM_ARCH_OUTETTS_VOC:
+            {
+                result = llm.build_outetts_voc();
+            } break;
        default:
            GGML_ABORT("fatal error");
    }

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-        result = llm.append_pooling(result);
+        // TODO: TEMPORARY DISABLED
+        //result = llm.append_pooling(result);
    }

    llm.free();
@@ -17357,30 +17395,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
    }

    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");

-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;

-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
            }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
        }
    }
@@ -18029,9 +18072,14 @@ static int llama_decode_internal(

        ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

+        struct ggml_tensor * res  = nullptr;
+        struct ggml_tensor * embd = nullptr;
+
+// TODO: TEMPORARY DISABLED
+if (model.arch != LLM_ARCH_OUTETTS_VOC) {
        // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
-        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+        res  = ggml_graph_node(gf, -1);
+        embd = ggml_graph_node(gf, -2);

        if (lctx.n_outputs == 0) {
            // no output
@@ -18051,6 +18099,10 @@ static int llama_decode_internal(
            embd = nullptr; // do not extract embeddings when not needed
            GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
        }
+} else {
+    res  = nullptr;
+    embd = ggml_graph_node(gf, -1);
+}
        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

        ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
From 6ef14091c07b12d5c3dd3d02adb0e4000d433307 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 19:18:04 +0200
Subject: [PATCH 08/45] first conv

---
 examples/tts/tts.cpp |  8 +++++++-
 ggml/src/ggml.c      |  2 +-
 src/llama.cpp        | 40 ++++++++++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index d3fee7373ea3b..17dc0eff2cd52 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -170,13 +170,19 @@ int main(int argc, char ** argv) {

    const float * embd = llama_get_embeddings(ctx_cts);

+    int n = 768*261;
+
    LOG("result:\n");
    for (int i = 0; i < 10; ++i) {
        LOG("%8.3f ", embd[i]);
    }
    LOG("\n");
+    for (int i = n - 10; i < n; ++i) {
+        LOG("%8.3f ", embd[i]);
+    }
+    LOG("\n");
    double sum = 0.0;
-    for (int i = 0; i < 261*512; ++i) {
+    for (int i = 0; i < n; ++i) {
        sum += embd[i];
    }
    LOG("sum: %f\n", sum);

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 0efd2b2ebf780..3665488b30c40 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3874,7 +3874,7 @@ struct ggml_tensor * ggml_im2col(
        int d1,
        bool is_2D,
        enum ggml_type dst_type) {
-    if(is_2D) {
+    if (is_2D) {
        GGML_ASSERT(a->ne[2] == b->ne[2]);
    } else {
        GGML_ASSERT(a->ne[1] == b->ne[1]);
llama_hparams & hparams, ggml_tensor * w ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); } break; + case GGML_OP_IM2COL: + { + int n_embd = hparams.n_embd; + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); + op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); + } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); } @@ -9428,6 +9435,9 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -9671,7 +9681,7 @@ static struct ggml_tensor * llm_build_inp_embd( inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); inpL = lctx.inp_embd; ggml_set_input(lctx.inp_embd); } @@ -17009,7 +17019,13 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - cur = inpL; + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); + printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); + cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, // model.output_norm, NULL, @@ -17309,7 +17325,7 @@ static struct ggml_cgraph * llama_build_graph( // add on pooling layer if (lctx.cparams.embeddings) { - // TODO: TEMPORARY DISABLED + // TODO: TEMPORARY DISABLED [OUTETTS] //result = llm.append_pooling(result); } @@ -17798,7 +17814,13 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { } const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; + + // TODO: TEMPORARY !!! 
@@ -17309,7 +17325,7 @@ static struct ggml_cgraph * llama_build_graph(

    // add on pooling layer
    if (lctx.cparams.embeddings) {
-        // TODO: TEMPORARY DISABLED
+        // TODO: TEMPORARY DISABLED [OUTETTS]
        //result = llm.append_pooling(result);
    }

    llm.free();
@@ -17798,7 +17814,13 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
+
+    // TODO: TEMPORARY !!! [OUTETTS]
+#if 0
    const size_t new_size = (logits_size + embd_size) * sizeof(float);
+#else
+    const size_t new_size = 1024*1024*32;
+#endif

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
@@ -18095,7 +18117,7 @@ static int llama_decode_internal(
        struct ggml_tensor * res  = nullptr;
        struct ggml_tensor * embd = nullptr;

-// TODO: TEMPORARY DISABLED
+// TODO: TEMPORARY DISABLED [OUTETTS]
if (model.arch != LLM_ARCH_OUTETTS_VOC) {
        // the output is always the last tensor in the graph
        res  = ggml_graph_node(gf, -1);
@@ -18170,7 +18192,9 @@ static int llama_decode_internal(
                        if (n_outputs_new) {
                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                            GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
-                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                            // TODO: TEMPORARY [OUTETTS]
+                            //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*768*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_MEAN:
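The 32 MB constant above is an explicit placeholder for the output buffer. The exact requirement on this path would be one full row per token of the vocoder's hidden size, as the extraction code above implies; a sketch of the size the hack stands in for (assumption: n_embd_voc = 768, as hardcoded in the tensor_get_async call):

    #include <cstddef>
    #include <cstdint>

    // per-token float embeddings for the vocoder output path
    static size_t vocoder_embd_size(int64_t n_tokens, int64_t n_embd_voc /* 768 here */) {
        return (size_t) n_tokens * (size_t) n_embd_voc * sizeof(float);
    }
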
From 5296c96ca8ea4677c3585467e88f78b677f39989 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 20:33:29 +0200
Subject: [PATCH 09/45] group norm

---
 src/llama.cpp | 44 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8e62ea324fe25..bfc8d6601ec3e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1629,16 +1629,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_CONV_NEXT_GAMMA,   "conv_next.gamma" },
            { LLM_TENSOR_OUTPUT_NORM,       "output_norm" },
            { LLM_TENSOR_OUTPUT,            "output" },
-            { LLM_TENSOR_POS_NET_CONV1,     "pos_net.conv1" },
-            { LLM_TENSOR_POS_NET_CONV2,     "pos_net.conv2" },
+            { LLM_TENSOR_POS_NET_CONV1,     "pos_net.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2,     "pos_net.%d.conv2" },
            { LLM_TENSOR_POS_NET_NORM,      "pos_net.norm" },
-            { LLM_TENSOR_POS_NET_NORM1,     "pos_net.norm1" },
-            { LLM_TENSOR_POS_NET_NORM2,     "pos_net.norm2" },
-            { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.attn_norm" },
-            { LLM_TENSOR_POS_NET_ATTN_Q,    "pos_net.attn_q" },
-            { LLM_TENSOR_POS_NET_ATTN_K,    "pos_net.attn_k" },
-            { LLM_TENSOR_POS_NET_ATTN_V,    "pos_net.attn_v" },
-            { LLM_TENSOR_POS_NET_ATTN_OUT,  "pos_net.attn_output" },
+            { LLM_TENSOR_POS_NET_NORM1,     "pos_net.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2,     "pos_net.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q,    "pos_net.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K,    "pos_net.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V,    "pos_net.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT,  "pos_net.%d.attn_output" },
            { LLM_TENSOR_HANN_WINDOW,       "hann_window" },
        },
    },
@@ -3054,9 +3054,13 @@ struct llama_model {
    // outetts vocoder
    struct ggml_tensor * conv_1d   = nullptr;
    struct ggml_tensor * conv_1d_b = nullptr;

+    struct ggml_tensor * posnet_0_norm1   = nullptr;
+    struct ggml_tensor * posnet_0_norm1_b = nullptr;
+
    std::vector<llama_layer> layers;

    // gguf metadata
@@ -7357,6 +7361,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
    // this tensor is loaded for T5, but never used
    {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
    {LLM_TENSOR_CONV1D,                 {LLM_TENSOR_LAYER_INPUT,     GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM1,          {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
};

// checks if the weight tensor can be used with the specified buffer type and device
@@ -9443,6 +9448,9 @@ static bool llm_load_tensors(
                model.conv_1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
                model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {768}, 0);

+                model.posnet_0_norm1   = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0);
+                model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias",   0), {768}, 0);
+
                // output
                model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
                model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
@@ -9661,6 +9669,7 @@ enum llm_ffn_gate_type {
enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
+    LLM_NORM_GROUP,
};

static struct ggml_tensor * llm_build_inp_embd(
@@ -9802,8 +9811,15 @@ static struct ggml_tensor * llm_build_norm(
        const llm_build_cb & cb,
        int il) {
    switch (type) {
-        case LLM_NORM:     cur = ggml_norm    (ctx, cur, hparams.f_norm_eps);     break;
-        case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM:       cur = ggml_norm     (ctx, cur, hparams.f_norm_eps);     break;
+        case LLM_NORM_RMS:   cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP:
+            {
+                // TODO: these reshapes should be removed, fix ggml_group_norm
+                cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+                cur = ggml_group_norm(ctx, cur, 32, 1e-6); // TODO: add groups, eps params
+                cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
+            } break;
    }

    if (mw || mb) {
@@ -17025,6 +17041,12 @@ struct llm_build_context {
        cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1);
        cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0]));

+        cur = llm_build_norm(ctx0, cur, hparams,
+                ggml_reshape_2d(ctx0, model.posnet_0_norm1,   1, model.posnet_0_norm1->ne[0]),
+                ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]),
+                LLM_NORM_GROUP, cb, 0);
+
        printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);

        //cur = llm_build_norm(ctx0, cur, hparams,
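ggml_group_norm normalizes over the channel dimension of a 3d/4d tensor, which is why the LLM_NORM_GROUP case above lifts a [T, C] activation to [T, 1, C] first. A standalone sketch of the same dance, mirroring the case added above (the TODO there suggests the reshapes should eventually move into ggml_group_norm itself):

    #include "ggml.h"

    // apply ggml_group_norm to a [T, C] activation by lifting it to 3d,
    // using the same 32 groups and 1e-6 eps as the LLM_NORM_GROUP case
    static struct ggml_tensor * group_norm_2d(struct ggml_context * ctx, struct ggml_tensor * cur) {
        cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
        cur = ggml_group_norm(ctx, cur, 32, 1e-6f);
        return ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    }
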
GGML_OP_IM2COL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9446,6 +9458,15 @@ static bool llm_load_tensors( model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {768}, 0); + model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, 768, 768}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {768}, 0); + + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {768}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {768}, 0); + + model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17047,6 +17068,11 @@ struct llm_build_context { ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From 13dd8941a4cf0a300045935670808119d02a1e97 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 20:50:13 +0200 Subject: [PATCH 11/45] resnet --- src/llama.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index bfc8d6601ec3e..47bfbf9b3dd02 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3070,6 +3070,18 @@ struct llama_model { struct ggml_tensor * posnet_0_conv2 = nullptr; struct ggml_tensor * posnet_0_conv2_b = nullptr; + struct ggml_tensor * posnet_1_norm1 = nullptr; + struct ggml_tensor * posnet_1_norm1_b = nullptr; + + struct ggml_tensor * posnet_1_conv1 = nullptr; + struct ggml_tensor * posnet_1_conv1_b = nullptr; + + struct ggml_tensor * posnet_1_norm2 = nullptr; + struct ggml_tensor * posnet_1_norm2_b = nullptr; + + struct ggml_tensor * posnet_1_conv2 = nullptr; + struct ggml_tensor * posnet_1_conv2_b = nullptr; + std::vector layers; // gguf metadata @@ -9467,6 +9479,18 @@ static bool llm_load_tensors( model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {768}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {768}, 0); + + model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, 768, 768}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {768}, 0); + + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {768}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {768}, 0); + + model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); + 
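// [editor's sketch, not part of the patch series] The PosNet residual blocks
// wired up in these patches all share one pattern: GroupNorm -> SiLU -> conv1d,
// done twice, plus a skip connection. A self-contained helper capturing one
// norm/SiLU/conv pass (the helper name and this factoring are hypothetical;
// activations are assumed to be [T, 768] as in the graph code above):
static struct ggml_tensor * posnet_norm_silu_conv(
        struct ggml_context * ctx,
        struct ggml_tensor * cur,                             // activations [T, C]
        struct ggml_tensor * nw,  struct ggml_tensor * nb,    // GroupNorm scale/shift [C]
        struct ggml_tensor * cw,  struct ggml_tensor * cwb) { // conv kernel [3, C, C] + bias [C]
    // GroupNorm via the reshape trick introduced above: ggml_group_norm groups
    // channels along ne[2], so go [T, C] -> [T, 1, C] -> norm -> [T, C]
    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    cur = ggml_group_norm(ctx, cur, 32, 1e-6f);
    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    cur = ggml_mul(ctx, cur, ggml_reshape_2d(ctx, nw, 1, nw->ne[0]));
    cur = ggml_add(ctx, cur, ggml_reshape_2d(ctx, nb, 1, nb->ne[0]));
    // SiLU, written as x * sigmoid(x) exactly as the patch does
    cur = ggml_mul(ctx, ggml_sigmoid(ctx, cur), cur);
    // same-length convolution over time ("ph" = half padding)
    cur = ggml_conv_1d_ph(ctx, cw, cur, 1, 1);
    return ggml_add(ctx, cur, ggml_reshape_2d(ctx, cwb, 1, cwb->ne[0]));
}
// one residual block is then two passes plus the skip:
//   cur = posnet_norm_silu_conv(ctx0, inp, norm1, norm1_b, conv1, conv1_b);
//   cur = posnet_norm_silu_conv(ctx0, cur, norm2, norm2_b, conv2, conv2_b);
//   cur = ggml_add(ctx0, cur, inp);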
model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17060,18 +17084,63 @@ struct llm_build_context { printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); + cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); - cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), - LLM_NORM_GROUP, cb, 0); + // resnet block 0 + { + struct ggml_tensor * cur_rnet = cur; + + cur_rnet = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur_rnet, cur); + } + + // resnet block 1 + { + struct ggml_tensor * cur_rnet = cur; + + cur_rnet = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + + cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + + cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur_rnet, 1, 1); + cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur_rnet, cur); + } printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From 3046fde4205d159ad4967f20564a01b808464aa7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 
21:59:45 +0200 Subject: [PATCH 12/45] attn --- convert_hf_to_gguf.py | 6 +- src/llama.cpp | 169 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 152 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4b51a2ad99423..ebeb3840cd587 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -296,7 +296,9 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - data = data_torch.squeeze().numpy() + # TODO: why do we squeeze here? + #data = data_torch.squeeze().numpy() + data = data_torch.numpy() # if data ends up empty, it means data_torch was a scalar tensor -> restore if len(data.shape) == 0: @@ -2044,6 +2046,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping {name!r}") return [] + print(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + return [(self.map_tensor_name(name), data_torch)] def set_vocab(self): diff --git a/src/llama.cpp b/src/llama.cpp index 47bfbf9b3dd02..cae15fde5dbf4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3055,9 +3055,11 @@ struct llama_model { struct ggml_tensor * cls_out_b = nullptr; // outetts vocoder + // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; + // resnet 0 struct ggml_tensor * posnet_0_norm1 = nullptr; struct ggml_tensor * posnet_0_norm1_b = nullptr; @@ -3070,6 +3072,7 @@ struct llama_model { struct ggml_tensor * posnet_0_conv2 = nullptr; struct ggml_tensor * posnet_0_conv2_b = nullptr; + // resnet 1 struct ggml_tensor * posnet_1_norm1 = nullptr; struct ggml_tensor * posnet_1_norm1_b = nullptr; @@ -3082,6 +3085,48 @@ struct llama_model { struct ggml_tensor * posnet_1_conv2 = nullptr; struct ggml_tensor * posnet_1_conv2_b = nullptr; + // attn 2 + struct ggml_tensor * posnet_2_attn_norm = nullptr; + struct ggml_tensor * posnet_2_attn_norm_b = nullptr; + + struct ggml_tensor * posnet_2_attn_q = nullptr; + struct ggml_tensor * posnet_2_attn_q_b = nullptr; + + struct ggml_tensor * posnet_2_attn_k = nullptr; + struct ggml_tensor * posnet_2_attn_k_b = nullptr; + + struct ggml_tensor * posnet_2_attn_v = nullptr; + struct ggml_tensor * posnet_2_attn_v_b = nullptr; + + struct ggml_tensor * posnet_2_attn_o = nullptr; + struct ggml_tensor * posnet_2_attn_o_b = nullptr; + + // resnet 3 + struct ggml_tensor * posnet_3_norm1 = nullptr; + struct ggml_tensor * posnet_3_norm1_b = nullptr; + + struct ggml_tensor * posnet_3_conv1 = nullptr; + struct ggml_tensor * posnet_3_conv1_b = nullptr; + + struct ggml_tensor * posnet_3_norm2 = nullptr; + struct ggml_tensor * posnet_3_norm2_b = nullptr; + + struct ggml_tensor * posnet_3_conv2 = nullptr; + struct ggml_tensor * posnet_3_conv2_b = nullptr; + + // resnet 4 + struct ggml_tensor * posnet_4_norm1 = nullptr; + struct ggml_tensor * posnet_4_norm1_b = nullptr; + + struct ggml_tensor * posnet_4_conv1 = nullptr; + struct ggml_tensor * posnet_4_conv1_b = nullptr; + + struct ggml_tensor * posnet_4_norm2 = nullptr; + struct ggml_tensor * posnet_4_norm2_b = nullptr; + + struct ggml_tensor * posnet_4_conv2 = nullptr; + struct ggml_tensor * posnet_4_conv2_b = nullptr; + std::vector layers; // gguf metadata @@ -7386,6 +7431,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + 
{LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9491,6 +9541,45 @@ static bool llm_load_tensors( model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {768}, 0); + model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {768}, 0); + + model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {768}, 0); + + model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {768}, 0); + + model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {768}, 0); + + model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, 768, 768}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {768}, 0); + + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {768}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {768}, 0); + + model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, 768, 768}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {768}, 0); + + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {768}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {768}, 0); + + model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, 768, 768}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {768}, 0); + + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {768}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {768}, 0); + + model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, 768, 768}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {768}, 0); + + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {768}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {768}, 0); + + model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17088,58 +17177,94 @@ struct llm_build_context { cur = ggml_conv_1d_ph(ctx0, 
model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + inpL = cur; + // resnet block 0 { - struct ggml_tensor * cur_rnet = cur; - - cur_rnet = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); - cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); - cur = ggml_add(ctx0, cur_rnet, cur); + cur = ggml_add(ctx0, cur, inpL); } + inpL = cur; + // resnet block 1 { - struct ggml_tensor * cur_rnet = cur; - - cur_rnet = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); - cur_rnet = llm_build_norm(ctx0, cur_rnet, hparams, + cur = llm_build_norm(ctx0, cur, hparams, ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), LLM_NORM_GROUP, cb, 0); - cur_rnet = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur_rnet), cur_rnet); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + inpL = cur; + + // attention block + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_2_attn_norm, 1, model.posnet_2_attn_norm->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_2_attn_norm_b, 1, model.posnet_2_attn_norm_b->ne[0]), + LLM_NORM_GROUP, cb, 
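// [editor's note, not part of the patch] In the attention block that follows,
// the q/k/v and output projections are all 1x1 convolutions over the [T, 768]
// activations, i.e. per-timestep linear maps. q and k are transposed so that
// the 768 channels run along ne[0]; ggml_mul_mat(k, q) then yields the [T, T]
// score matrix, which is normalized with the usual 1/sqrt(n_embd) scaling
// (here 1/sqrt(768)) via ggml_soft_max_ext before weighting v.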
0); + + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, ggml_reshape_2d(ctx0, model.posnet_2_attn_q_b, 1, model.posnet_2_attn_q_b->ne[0])); + k = ggml_add(ctx0, k, ggml_reshape_2d(ctx0, model.posnet_2_attn_k_b, 1, model.posnet_2_attn_k_b->ne[0])); + v = ggml_add(ctx0, v, ggml_reshape_2d(ctx0, model.posnet_2_attn_v_b, 1, model.posnet_2_attn_v_b->ne[0])); - cur_rnet = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur_rnet, 1, 1); - cur_rnet = ggml_add(ctx0, cur_rnet, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - cur = ggml_add(ctx0, cur_rnet, cur); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(768)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_2_attn_o_b, 1, model.posnet_2_attn_o_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); } printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From 435cfd788be71f16d94efb9454daf990f858a04d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 22:05:44 +0200 Subject: [PATCH 13/45] pos net --- src/llama.cpp | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index cae15fde5dbf4..31ce4b31ab738 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1631,7 +1631,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, { LLM_TENSOR_POS_NET_CONV2, "pos_net.%d.conv2" }, - { LLM_TENSOR_POS_NET_NORM, "pos_net.norm" }, + { LLM_TENSOR_POS_NET_NORM, "pos_net.%d.norm" }, { LLM_TENSOR_POS_NET_NORM1, "pos_net.%d.norm1" }, { LLM_TENSOR_POS_NET_NORM2, "pos_net.%d.norm2" }, { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" }, @@ -3127,6 +3127,10 @@ struct llama_model { struct ggml_tensor * posnet_4_conv2 = nullptr; struct ggml_tensor * posnet_4_conv2_b = nullptr; + // resnet 5 + struct ggml_tensor * posnet_5_norm = nullptr; + struct ggml_tensor * posnet_5_norm_b = nullptr; + std::vector layers; // gguf metadata @@ -7427,6 +7431,7 @@ static const std::map llm_tensor_info_mapping = { // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, + {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, @@ -9580,6 +9585,9 @@ static bool llm_load_tensors( model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + // output model.output_norm = 
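// [editor's note, not part of the patch] the final PosNet GroupNorm a few
// lines up (posnet_5_norm) is loaded through the LLM_TENSOR_POS_NET_ATTN_NORM
// name template at index 5, i.e. from "pos_net.5.attn_norm", even though it
// is a plain normalization block rather than an attention norm.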
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17267,6 +17275,69 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, inpL); } + inpL = cur; + + // resnet block 3 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_3_norm1, 1, model.posnet_3_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_3_norm1_b, 1, model.posnet_3_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv1_b, 1, model.posnet_3_conv1_b->ne[0])); + + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_3_norm2, 1, model.posnet_3_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_3_norm2_b, 1, model.posnet_3_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv2_b, 1, model.posnet_3_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + inpL = cur; + + // resnet block 4 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_4_norm1, 1, model.posnet_4_norm1->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_4_norm1_b, 1, model.posnet_4_norm1_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv1_b, 1, model.posnet_4_conv1_b->ne[0])); + + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_4_norm2, 1, model.posnet_4_norm2->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_4_norm2_b, 1, model.posnet_4_norm2_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv2_b, 1, model.posnet_4_conv2_b->ne[0])); + + cur = ggml_add(ctx0, cur, inpL); + } + + // normalize block 5 + { + cur = llm_build_norm(ctx0, cur, hparams, + ggml_reshape_2d(ctx0, model.posnet_5_norm, 1, model.posnet_5_norm->ne[0]), + ggml_reshape_2d(ctx0, model.posnet_5_norm_b, 1, model.posnet_5_norm_b->ne[0]), + LLM_NORM_GROUP, cb, 0); + } + + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From b3ba05e5bc62de2154198a8f335b58d54457e259 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 22:37:26 +0200 Subject: [PATCH 14/45] layer norm --- convert_hf_to_gguf.py | 2 +- examples/tts/convert_pt_to_hf.py | 10 ++++++++++ gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 6 +----- src/llama.cpp | 9 +++++++-- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ebeb3840cd587..a86490831fdbd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2046,7 +2046,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping {name!r}") return [] - print(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") return [(self.map_tensor_name(name), data_torch)] diff 
--git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 389d2de50efe9..4a0d4bcc8cde7 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -88,6 +88,16 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": new_key = "backbone.embedding.weight" + # these are the only rows used + # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100 + if new_key == "backbone.norm.scale.weight": + new_key = "backbone.norm.weight" + value = value[0] + + if new_key == "backbone.norm.shift.weight": + new_key = "backbone.norm.bias" + value = value[0] + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 14e68cffa522c..81e434d117004 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -267,7 +267,6 @@ class MODEL_ARCH(IntEnum): class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() - TOKEN_EMBD_SHIFT = auto() TOKEN_TYPES = auto() POS_EMBD = auto() OUTPUT = auto() @@ -451,7 +450,6 @@ class MODEL_TENSOR(IntEnum): TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift", MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -1415,7 +1413,6 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.OUTETTS_VOC: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, - MODEL_TENSOR.TOKEN_EMBD_SHIFT, MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 4355ccf111456..872205e77b21a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -43,11 +43,7 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv - "backbone.norm.scale", # outetts - ), - - MODEL_TENSOR.TOKEN_EMBD_SHIFT: ( - "backbone.norm.shift", # outetts + "backbone.norm", # outetts ), # Position embeddings diff --git a/src/llama.cpp b/src/llama.cpp index 31ce4b31ab738..6c38d9315ea79 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -505,7 +505,6 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_TOKEN_EMBD_NORM, - LLM_TENSOR_TOKEN_EMBD_SHIFT, LLM_TENSOR_TOKEN_TYPES, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, @@ -1619,7 +1618,6 @@ static const std::map> LLM_TENSOR_N { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, - { LLM_TENSOR_TOKEN_EMBD_SHIFT, "token_embd_shift" }, { LLM_TENSOR_CONV1D, "conv1d" }, { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, @@ -9519,6 +9517,9 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); @@ -17337,6 +17338,10 @@ struct llm_build_context { LLM_NORM_GROUP, cb, 0); } + 
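// [editor's note, not part of the patch] ggml_norm normalizes along ne[0],
// but at this point in the graph the activations are [T, 768], with time in
// ne[0] and channels in ne[1]. The token-embedding LayerNorm must run over
// the channel dimension, hence the ggml_transpose below (wrapped in
// ggml_cont to make the transposed view contiguous) before tok_norm and
// tok_norm_b are applied.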
cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); From fe6dd5aa61924006ce81a84ba47441fbd570c804 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:06:48 +0200 Subject: [PATCH 15/45] convnext --- examples/tts/convert_pt_to_hf.py | 12 +- ggml/include/ggml.h | 41 ++++-- ggml/src/ggml.c | 224 ++++++++++++++++++------------- gguf-py/gguf/constants.py | 3 - gguf-py/gguf/tensor_mapping.py | 6 +- src/llama.cpp | 92 +++++++++++-- 6 files changed, 253 insertions(+), 125 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 4a0d4bcc8cde7..501fc4d6a6aa4 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -90,14 +90,17 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): # these are the only rows used # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100 - if new_key == "backbone.norm.scale.weight": - new_key = "backbone.norm.weight" + if new_key.endswith("norm.scale.weight"): + new_key = new_key.replace("norm.scale.weight", "norm.weight") value = value[0] - if new_key == "backbone.norm.shift.weight": - new_key = "backbone.norm.bias" + if new_key.endswith("norm.shift.weight"): + new_key = new_key.replace("norm.shift.weight", "norm.bias") value = value[0] + if new_key.endswith("gamma"): + new_key = new_key.replace("gamma", "gamma.weight") + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") @@ -149,6 +152,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "hidden_size": 512, "vocab_size": 4096, "n_head": 1, + "layer_norm_epsilon": 1e-6, "max_position_embeddings": 8192, # ? "num_hidden_layers": 12 } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b0c1ac9ce2b89..c714fc8c837bb 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1564,17 +1564,6 @@ extern "C" { int d1, // dilation dimension 1 bool is_2D); - GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1592,6 +1581,23 @@ extern "C" { int s, // stride int d); // dilation + // depthwise + // TODO: this is very likely wrong for some cases! 
- needs more testing + GGML_API struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int d0); // dilation + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1611,7 +1617,6 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 - // kernel size is a->ne[0] x a->ne[1] // stride is equal to kernel size // padding is zero @@ -1638,6 +1643,18 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // depthwise + GGML_API struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3665488b30c40..7c0159ab49c9f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3760,13 +3760,84 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d - static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D, + enum ggml_type dst_type) { + if (is_2D) { + GGML_ASSERT(a->ne[2] == b->ne[2]); + } else { + //GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + GGML_ASSERT(b->ne[1] == a->ne[1]); + GGML_ASSERT(b->ne[3] == 1); + } + + const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + + GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); + GGML_ASSERT((OW > 0) && "b too small compared to a"); + + const int64_t ne[4] = { + is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], + OW, + is_2D ? OH : b->ne[2], + is_2D ? b->ne[3] : 1, + }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +struct ggml_tensor * ggml_im2col_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t * ne, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1, + bool is_2D) { + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_IM2COL_BACK; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_1d + +struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -3775,6 +3846,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d( int d0) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] + printf("a: %lld %lld %lld %lld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + printf("b: %lld %lld %lld %lld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]); + printf("im2col: %lld %lld %lld %lld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]); + struct ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] @@ -3796,137 +3871,75 @@ struct ggml_tensor* ggml_conv_1d_ph( return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); } -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} +// ggml_conv_1d_dw -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( +struct ggml_tensor * ggml_conv_1d_dw( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, int p0, int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]); + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]); - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); + struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a); - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->src[0] = a; - result->src[1] = b; + result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1); return result; } -// ggml_conv_depthwise +// ggml_conv_1d_dw_ph -struct ggml_tensor * ggml_conv_depthwise_2d( +struct ggml_tensor * ggml_conv_1d_dw_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { - struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); - struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, - ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] - struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + int d0) { + return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); +} - new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] - struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, 
OH, OW] +// ggml_conv_transpose_1d - return result; +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; } -// ggml_conv_2d -// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] -// a: [OC,IC, KH, KW] -// b: [N, IC, IH, IW] -// result: [N, OH, OW, IC*KH*KW] -struct ggml_tensor * ggml_im2col( +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, int s0, - int s1, int p0, - int p1, - int d0, - int d1, - bool is_2D, - enum ggml_type dst_type) { - if (is_2D) { - GGML_ASSERT(a->ne[2] == b->ne[2]); - } else { - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(b->ne[3] == 1); - } - - const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0; - const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); - GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a"); - GGML_ASSERT((OW > 0) && "b too small compared to a"); + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); const int64_t ne[4] = { - is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0], - OW, - is_2D ? OH : b->ne[2], - is_2D ? b->ne[3] : 1, + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) }; + int32_t params[] = { s0, p0, d0 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_IM2COL; + result->op = GGML_OP_CONV_TRANSPOSE_1D; result->src[0] = a; result->src[1] = b; return result; } -struct ggml_tensor * ggml_im2col_back( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t * ne, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1, - bool is_2D) { - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 
1 : 0) }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_IM2COL_BACK; - result->src[0] = a; - result->src[1] = b; - - return result; -} +// ggml_conv_2d // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] @@ -3973,6 +3986,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph( return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); } +// ggml_conv_2d_dw + +struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 81e434d117004..ea74354a43b8b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -374,7 +374,6 @@ class MODEL_TENSOR(IntEnum): CONV1D = auto() CONV_NEXT_DW = auto() CONV_NEXT_NORM = auto() - CONV_NEXT_SHIFT = auto() CONV_NEXT_PW1 = auto() CONV_NEXT_PW2 = auto() CONV_NEXT_GAMMA = auto() @@ -557,7 +556,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CONV1D: "conv1d", MODEL_TENSOR.CONV_NEXT_DW: "conv_next.{bid}.dw", MODEL_TENSOR.CONV_NEXT_NORM: "conv_next.{bid}.norm", - MODEL_TENSOR.CONV_NEXT_SHIFT: "conv_next.{bid}.shift", MODEL_TENSOR.CONV_NEXT_PW1: "conv_next.{bid}.pw1", MODEL_TENSOR.CONV_NEXT_PW2: "conv_next.{bid}.pw2", MODEL_TENSOR.CONV_NEXT_GAMMA: "conv_next.{bid}.gamma", @@ -1416,7 +1414,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV_NEXT_DW, MODEL_TENSOR.CONV_NEXT_NORM, - MODEL_TENSOR.CONV_NEXT_SHIFT, MODEL_TENSOR.CONV_NEXT_PW1, MODEL_TENSOR.CONV_NEXT_PW2, MODEL_TENSOR.CONV_NEXT_GAMMA, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 872205e77b21a..ca7bb40fef11b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -714,11 +714,7 @@ class TensorNameMap: ), MODEL_TENSOR.CONV_NEXT_NORM: ( - "backbone.convnext.{bid}.norm.scale", # outetts - ), - - MODEL_TENSOR.CONV_NEXT_SHIFT: ( - "backbone.convnext.{bid}.norm.shift", # outetts + "backbone.convnext.{bid}.norm", # outetts ), MODEL_TENSOR.CONV_NEXT_PW1: ( diff --git a/src/llama.cpp b/src/llama.cpp index 6c38d9315ea79..c94cdd383d709 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -614,7 +614,6 @@ enum llm_tensor { LLM_TENSOR_CONV1D, LLM_TENSOR_CONV_NEXT_DW, LLM_TENSOR_CONV_NEXT_NORM, - LLM_TENSOR_CONV_NEXT_SHIFT, LLM_TENSOR_CONV_NEXT_PW1, LLM_TENSOR_CONV_NEXT_PW2, LLM_TENSOR_CONV_NEXT_GAMMA, @@ -1619,12 +1618,11 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_CONV1D, 
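// [editor's note, not part of the patch] the "%d" placeholders in these name
// templates are filled in with the block index by the tn(...) helper, so the
// per-layer ConvNeXt tensors resolve to names like "conv_next.0.dw",
// "conv_next.0.norm", etc., matching what the gguf exporter writes.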
"conv1d" }, - { LLM_TENSOR_CONV_NEXT_DW, "conv_next.dw" }, - { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.norm" }, - { LLM_TENSOR_CONV_NEXT_SHIFT, "conv_next.shift" }, - { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.pw1" }, - { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.pw2" }, - { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.gamma" }, + { LLM_TENSOR_CONV_NEXT_DW, "conv_next.%d.dw" }, + { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.%d.norm" }, + { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.%d.pw1" }, + { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.%d.pw2" }, + { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.%d.gamma" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, @@ -2922,6 +2920,21 @@ struct llama_layer { struct ggml_tensor * ffn_gate_scale; struct ggml_tensor * ffn_up_scale; struct ggml_tensor * ffn_down_scale; + + // convnext + struct ggml_tensor * convnext_dw; + struct ggml_tensor * convnext_dw_b; + + struct ggml_tensor * convnext_norm; + struct ggml_tensor * convnext_norm_b; + + struct ggml_tensor * convnext_pw1; + struct ggml_tensor * convnext_pw1_b; + + struct ggml_tensor * convnext_pw2; + struct ggml_tensor * convnext_pw2_b; + + struct ggml_tensor * convnext_gamma; }; // very similar to llama_batch, @@ -6420,6 +6433,10 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OUTETTS_VOC: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + } break; default: (void)0; } @@ -7439,6 +7456,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_CONV_NEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9589,6 +9611,25 @@ static bool llm_load_tensors( model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, 768}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {768}, 0); + + layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {768}, 0); + layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {768}, 0); + + // TODO: n_ff + layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {768, 2304}, 0); + layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {2304}, 0); + + layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {2304, 768}, 0); + layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {768}, 0); + + layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {768}, 0); + } + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); 
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -17338,11 +17379,46 @@ struct llm_build_context { LLM_NORM_GROUP, cb, 0); } - cur = llm_build_norm(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), hparams, + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + for (int il = 0; il < n_layer; ++il) { + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.layers[il].convnext_dw_b, 1, model.layers[il].convnext_dw_b->ne[0])); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].convnext_norm, + model.layers[il].convnext_norm_b, + LLM_NORM, cb, -1); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].convnext_pw1, model.layers[il].convnext_pw1_b, NULL, + NULL, NULL, NULL, + model.layers[il].convnext_pw2, model.layers[il].convnext_pw2_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + + cur = ggml_mul(ctx0, cur, model.layers[il].convnext_gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); //cur = llm_build_norm(ctx0, cur, hparams, From 839035d1bbb818feb218c4a74a175fdda81f00fe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:22:12 +0200 Subject: [PATCH 16/45] head --- examples/tts/tts.cpp | 2 +- src/llama.cpp | 31 ++++++++++++++++++++----------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 17dc0eff2cd52..7f5a8a616a1bf 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -170,7 +170,7 @@ int main(int argc, char ** argv) { const float * embd = llama_get_embeddings(ctx_cts); - int n = 768*261; + int n = 1282*261; LOG("result:\n"); for (int i = 0; i < 10; ++i) { diff --git a/src/llama.cpp b/src/llama.cpp index c94cdd383d709..adecc4bffa9ac 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9631,8 +9631,11 @@ static bool llm_load_tensors( } // output - model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); + + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, llama_model_loader::TENSOR_NOT_REQUIRED); } break; default: throw std::runtime_error("unknown architecture"); @@ -17419,16 +17422,22 @@ struct llm_build_context { cur = inpL; - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output_no_bias", -1); - //cur = llm_build_norm(ctx0, cur, hparams, - // model.output_norm, NULL, - // LLM_NORM_RMS, cb, -1); - //cb(cur, "result_norm", 
-1); + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_output", -1); - //// lm_head - //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - //cb(cur, "result_output", -1); + printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); ggml_build_forward_expand(gf, cur); @@ -18588,7 +18597,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); // TODO: TEMPORARY [OUTETTS] //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*768*sizeof(float)); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: From eb1b70f42a00e0c6e83ea3ee12889d8ccf743851 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:52:07 +0200 Subject: [PATCH 17/45] hann window --- examples/tts/convert_pt_to_hf.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 1 - src/llama.cpp | 12 ++++++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 501fc4d6a6aa4..050a61fdc53b0 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -101,6 +101,9 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key.endswith("gamma"): new_key = new_key.replace("gamma", "gamma.weight") + if new_key == "head.istft.window": + new_key = "head.istft.window.weight" + size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ca7bb40fef11b..93b70a1477e6a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -28,7 +28,6 @@ class TensorNameMap: "transformer.token_embeddings", # openelm "shared", # t5 "rwkv.embeddings", # rwkv - "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" # outetts ), # Token type embeddings diff --git a/src/llama.cpp b/src/llama.cpp index adecc4bffa9ac..54cc0ec606350 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3070,6 +3070,8 @@ struct llama_model { struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; + struct ggml_tensor * hann_window = nullptr; + // resnet 0 struct ggml_tensor * posnet_0_norm1 = nullptr; struct ggml_tensor * posnet_0_norm1_b = nullptr; @@ -5121,8 +5123,7 @@ struct llama_model_loader { void done_getting_tensors() const { if (n_created != n_tensors) { - // TODO: TEMPORARY DISABLED [OUTETTS] - //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); + throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); } } @@ -7461,6 +7462,7 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_HANN_WINDOW, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9634,8 +9636,10 @@ static bool llm_load_tensors( model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output_norm_b = 
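// [editor's note, not part of the patch] The head shapes are consistent with
// an ISTFT spectrogram head: the Hann window created below has 1280 taps (the
// FFT size), and the 1282-dim output is presumably 2*(1280/2 + 1) = 1282
// values per frame, i.e. one magnitude and one phase per spectrogram bin.
// That reading is inferred from the tensor shapes, not stated in the patches.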
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED); - model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, llama_model_loader::TENSOR_NOT_REQUIRED); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0); + + model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; default: throw std::runtime_error("unknown architecture"); From a1f08ad3385212a446de6ba33b2ef54c7c671a3d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 10:59:08 +0200 Subject: [PATCH 18/45] fix n_embd + remove llama.cpp hacks --- examples/tts/convert_pt_to_hf.py | 2 +- examples/tts/tts.cpp | 3 ++- src/llama.cpp | 39 ++++++++------------------------ 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 050a61fdc53b0..d066248798b41 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -152,7 +152,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "architectures": [ "OuteTTSVocoder" ], - "hidden_size": 512, + "hidden_size": 1282, "vocab_size": 4096, "n_head": 1, "layer_norm_epsilon": 1e-6, diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 7f5a8a616a1bf..684f6b2fbd8e0 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -168,9 +168,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - int n = 1282*261; + int n = n_embd*261; LOG("result:\n"); for (int i = 0; i < 10; ++i) { diff --git a/src/llama.cpp b/src/llama.cpp index 54cc0ec606350..841e9a491b2f6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9539,12 +9539,12 @@ static bool llm_load_tensors( } break; case LLM_ARCH_OUTETTS_VOC: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0); model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); @@ -9636,8 +9636,8 @@ static bool llm_load_tensors( model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0); model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0); - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0); - model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; @@ -17432,14 +17432,12 @@ struct llm_build_context { model.output_norm, model.output_norm_b, LLM_NORM, cb, -1); - cb(cur, 
"result_norm", -1); // lm_head cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); - cb(cur, "result_output", -1); + cb(cur, "result_embd", -1); printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); @@ -17732,8 +17730,7 @@ static struct ggml_cgraph * llama_build_graph( // add on pooling layer if (lctx.cparams.embeddings) { - // TODO: TEMPORARY DISABLED [OUTETTS] - //result = llm.append_pooling(result); + result = llm.append_pooling(result); } llm.free(); @@ -18221,13 +18218,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { } const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0; - - // TODO: TEMPORARY !!! [OUTETTS] -#if 0 const size_t new_size = (logits_size + embd_size) * sizeof(float); -#else - const size_t new_size = 1024*1024*32; -#endif // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -18501,14 +18492,9 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); - struct ggml_tensor * res = nullptr; - struct ggml_tensor * embd = nullptr; - -// TODO: TEMPORARY DISABLED [OUTETTS] -if (model.arch != LLM_ARCH_OUTETTS_VOC) { // the output is always the last tensor in the graph - res = ggml_graph_node(gf, -1); - embd = ggml_graph_node(gf, -2); + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -18528,10 +18514,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } -} else { - res = nullptr; - embd = ggml_graph_node(gf, -1); -} + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); @@ -18599,9 +18582,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) { if (n_outputs_new) { GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); - // TODO: TEMPORARY [OUTETTS] - //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); - ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float)); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_MEAN: From e728cfd297f2f67693f5d4f94c8f66f405988fd2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 12:35:47 +0200 Subject: [PATCH 19/45] compute hann window --- examples/tts/convert_pt_to_hf.py | 5 +---- examples/tts/tts.cpp | 15 +++++++++++++++ gguf-py/gguf/constants.py | 3 --- gguf-py/gguf/tensor_mapping.py | 4 ---- include/llama.h | 3 --- src/llama.cpp | 27 +++++++++++---------------- 6 files changed, 27 insertions(+), 30 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index d066248798b41..c4a1185a83d47 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -70,7 +70,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): # keep only what we need for inference if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \ not key.startswith('state_dict.backbone.') 
and \ - not key.startswith('state_dict.head.'): + not key.startswith('state_dict.head.out'): print('Skipping key: ', key) continue @@ -101,9 +101,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): if new_key.endswith("gamma"): new_key = new_key.replace("gamma", "gamma.weight") - if new_key == "head.istft.window": - new_key = "head.istft.window.weight" - size_mb = value.element_size() * value.nelement() / (1024 * 1024) print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 684f6b2fbd8e0..f402ba8a2be97 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -57,6 +57,16 @@ static void print_usage(int, char ** argv) { LOG("\n"); } +void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + int main(int argc, char ** argv) { common_params params; @@ -171,6 +181,11 @@ int main(int argc, char ** argv) { const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); + const int w = 1280; + std::vector hann(w); + fill_hann_window(hann.size(), true, hann.data()); + + int n = n_embd*261; LOG("result:\n"); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ea74354a43b8b..f1f44c7d2c77a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -387,7 +387,6 @@ class MODEL_TENSOR(IntEnum): POS_NET_ATTN_K = auto() POS_NET_ATTN_V = auto() POS_NET_ATTN_OUT = auto() - HANN_WINDOW = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -569,7 +568,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K: "pos_net.{bid}.attn_k", MODEL_TENSOR.POS_NET_ATTN_V: "pos_net.{bid}.attn_v", MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output", - MODEL_TENSOR.HANN_WINDOW: "hann_window", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1429,7 +1427,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_NET_ATTN_K, MODEL_TENSOR.POS_NET_ATTN_V, MODEL_TENSOR.POS_NET_ATTN_OUT, - MODEL_TENSOR.HANN_WINDOW, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 93b70a1477e6a..5bf1f514a04f6 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -94,10 +94,6 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FACTORS_LONG: (), MODEL_TENSOR.ROPE_FACTORS_SHORT: (), - MODEL_TENSOR.HANN_WINDOW: ( - "head.istft.window", # outetts - ), - MODEL_TENSOR.CONV1D: ( "backbone.embed", # roberta ), diff --git a/include/llama.h b/include/llama.h index efbb27d21523a..a4abf395bcd93 100644 --- a/include/llama.h +++ b/include/llama.h @@ -482,9 +482,6 @@ extern "C" { // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); - // Get a llama model tensor - LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); - // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); diff --git a/src/llama.cpp b/src/llama.cpp index 841e9a491b2f6..07a00b2f63fb7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -627,7 +627,6 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, - LLM_TENSOR_HANN_WINDOW, }; static const std::map> LLM_TENSOR_NAMES = { @@ -1635,7 +1634,6 @@ static const std::map> 
LLM_TENSOR_N { LLM_TENSOR_POS_NET_ATTN_K, "pos_net.%d.attn_k" }, { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.%d.attn_v" }, { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.%d.attn_output" }, - { LLM_TENSOR_HANN_WINDOW, "hann_window" }, }, }, { @@ -3648,6 +3646,17 @@ static int llama_get_device_count(const llama_model & model) { return (int) model.devices.size(); } +static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) { + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; +} + template static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { ggml_init_params params = { @@ -7462,7 +7471,6 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_HANN_WINDOW, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -9638,8 +9646,6 @@ static bool llm_load_tensors( model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0); model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); - - model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0); } break; default: throw std::runtime_error("unknown architecture"); @@ -21021,17 +21027,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return model->n_elements; } -struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; - }); - if (it == model->tensors_by_name.end()) { - return nullptr; - } - return it->second; -} - bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: return true; From 5a1c98e8d24018cf0c6f7746ded80151a23abf56 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 14:51:17 +0200 Subject: [PATCH 20/45] fft --- examples/tts/tts.cpp | 127 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 11 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index f402ba8a2be97..1fdf756d579f3 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -9,6 +9,7 @@ #include #include #include +#include // // Terminal utils @@ -67,6 +68,45 @@ void fill_hann_window(int length, bool periodic, float * output) { } } +// very poor-man fft +void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; + *real = cos(angle); + *imag = sin(angle); +} + +void irfft(int n, float * inp_cplx, float * out_real) { + int N = n / 2 + 1; + + std::vector real_input(N); + std::vector imag_input(N); + for (int i = 0; i < N; ++i) { + real_input[i] = inp_cplx[2 * i]; + imag_input[i] = inp_cplx[2 * i + 1]; + } + + std::vector real_output(n); + std::vector imag_output(n); + + for (int k = 0; k < n; ++k) { + real_output[k] = 0.0f; + imag_output[k] = 0.0f; + for (int m = 0; m < N; ++m) { + float twiddle_real; + float twiddle_imag; + + twiddle(&twiddle_real, &twiddle_imag, k * m, n); + + real_output[k] += real_input[m] * 
twiddle_real - imag_input[m] * twiddle_imag; + imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real; + } + } + + for (int i = 0; i < n; ++i) { + out_real[i] = real_output[i] / N; + } +} + int main(int argc, char ** argv) { common_params params; @@ -181,28 +221,93 @@ int main(int argc, char ** argv) { const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - const int w = 1280; - std::vector hann(w); + const int n = prompt_inp.size(); + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + + std::vector hann(n_fft); + fill_hann_window(hann.size(), true, hann.data()); + int n_spec = n_embd*n; - int n = n_embd*261; + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); - LOG("result:\n"); - for (int i = 0; i < 10; ++i) { - LOG("%8.3f ", embd[i]); + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n + l] = embd[l*n_embd + k]; + } } - LOG("\n"); - for (int i = n - 10; i < n; ++i) { - LOG("%8.3f ", embd[i]); + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n; ++l) { + float mag = E[(k )*n + l]; + float phi = E[(k + n_embd/2)*n + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n + l) + 0] = mag*cosf(phi); + S[2*(k*n + l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + } + } + + std::vector res(n*n_fft); + + const int n_thread = std::thread::hardware_concurrency(); + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + LOG("result (%d):\n", res.size()); + for (int i = 0; i < n_fft; ++i) { + LOG("%d - %8.5f\n", i, res[5*n_fft + i]); } LOG("\n"); double sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += embd[i]; + for (int i = 0; i < n_fft; ++i) { + sum += res[5*n_fft + i]; } LOG("sum: %f\n", sum); + { + LOG("result:\n"); + for (int i = 0; i < 10; ++i) { + LOG("%8.3f ", S[i]); + } + LOG("\n"); + for (int i = n_spec - 10; i < n_spec; ++i) { + LOG("%8.3f ", S[i]); + } + LOG("\n"); + double sum = 0.0; + for (int i = 0; i < n_spec; ++i) { + sum += S[i]; + } + LOG("sum: %f\n", sum); + } + fprintf(stderr, "\n"); llama_free(ctx_ttc); From e52797162e2f9797d00ec27aca4fe39e849236bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 16:35:10 +0200 Subject: [PATCH 21/45] spectrum processing --- examples/tts/tts.cpp | 143 ++++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 42 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 1fdf756d579f3..8eaf5a2623cc1 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -58,7 +58,7 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -void fill_hann_window(int length, bool periodic, float * output) { +static void fill_hann_window(int length, bool periodic, double * output) { int offset = -1; if (periodic) { offset = 0; @@ -69,31 +69,31 @@ void fill_hann_window(int length, bool periodic, float * output) { } // very poor-man fft -void twiddle(float * real, float * imag, int k, int N) { - float angle = 2 * M_PI * k / N; +static void twiddle(double * real, double * imag, int k, int N) { + double angle 
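// note: irfft() here reconstructs each time-domain frame by direct
// evaluation of out[k] = Re( sum_{m=0..N-1} X[m] * e^(i*2*pi*k*m/n) ) / N
// with N = n/2 + 1, i.e. an O(n*N) loop over the onesided spectrum rather
// than a true FFT; this patch promotes the accumulators from float to
// double so the ~641-term sums (n_fft = 1280) stay numerically accurate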
= 2 * M_PI * k / N; *real = cos(angle); *imag = sin(angle); } -void irfft(int n, float * inp_cplx, float * out_real) { +static void irfft(int n, const double * inp_cplx, double * out_real) { int N = n / 2 + 1; - std::vector real_input(N); - std::vector imag_input(N); + std::vector real_input(N); + std::vector imag_input(N); for (int i = 0; i < N; ++i) { real_input[i] = inp_cplx[2 * i]; imag_input[i] = inp_cplx[2 * i + 1]; } - std::vector real_output(n); - std::vector imag_output(n); + std::vector real_output(n); + std::vector imag_output(n); for (int k = 0; k < n; ++k) { real_output[k] = 0.0f; imag_output[k] = 0.0f; for (int m = 0; m < N; ++m) { - float twiddle_real; - float twiddle_imag; + double twiddle_real; + double twiddle_imag; twiddle(&twiddle_real, &twiddle_imag, k * m, n); @@ -107,6 +107,38 @@ void irfft(int n, float * inp_cplx, float * out_real) { } } +static void fold( + const std::vector & data, + int64_t output_size, + int64_t win_length, + int64_t hop_length, + int64_t pad, + std::vector& output +) { + int64_t output_height = output_size; + int64_t kernel_w = win_length; + int64_t stride_w = hop_length; + + int64_t width = output_size; + + output.resize(width, 0.0f); + + int64_t col_idx = 0; + for (int64_t w_col = 0; w_col < width; ++w_col) { + int64_t start = w_col * stride_w - pad; + int64_t end = start + kernel_w; + + for (int64_t w_im = start; w_im < end; ++w_im) { + if (w_im >= 0 && w_im < output_height) { + output[w_im] += data[col_idx]; + } + col_idx++; + } + } + + output.resize(output_size - 2 * pad); +} + int main(int argc, char ** argv) { common_params params; @@ -226,16 +258,17 @@ int main(int argc, char ** argv) { const int n_hop = 320; const int n_win = 1280; const int n_pad = (n_win - n_hop)/2; + const int n_out = (n - 1)*n_hop + n_win; - std::vector hann(n_fft); + std::vector hann(n_fft); fill_hann_window(hann.size(), true, hann.data()); int n_spec = n_embd*n; - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); for (int l = 0; l < n; ++l) { for (int k = 0; k < n_embd; ++k) { @@ -245,8 +278,8 @@ int main(int argc, char ** argv) { for (int k = 0; k < n_embd/2; ++k) { for (int l = 0; l < n; ++l) { - float mag = E[(k )*n + l]; - float phi = E[(k + n_embd/2)*n + l]; + double mag = E[(k )*n + l]; + double phi = E[(k + n_embd/2)*n + l]; mag = exp(mag); @@ -265,7 +298,8 @@ int main(int argc, char ** argv) { } } - std::vector res(n*n_fft); + std::vector res (n*n_fft); + std::vector hann2(n*n_fft); const int n_thread = std::thread::hardware_concurrency(); std::vector workers(n_thread); @@ -273,6 +307,10 @@ int main(int argc, char ** argv) { workers[i] = std::thread([&, i]() { for (int l = i; l < n; l += n_thread) { irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } } }); } @@ -280,33 +318,54 @@ int main(int argc, char ** argv) { workers[i].join(); } - LOG("result (%d):\n", res.size()); - for (int i = 0; i < n_fft; ++i) { - LOG("%d - %8.5f\n", i, res[5*n_fft + i]); + //LOG("result (%d):\n", res.size()); + //for (int i = 0; i < n_fft; ++i) { + // LOG("%d - %8.5f\n", i, res[5*n_fft + i]); + //} + //LOG("\n"); + //double sum = 0.0; + //for (int i = 0; i < n_fft; ++i) { + // sum += res[5*n_fft + i]; + //} + //LOG("sum: %f\n", sum); + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, 
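// note: this pair of fold() calls is standard overlap-add (WOLA)
// synthesis: the Hann-windowed iFFT frames are summed at hop
// n_hop = 320 samples, and dividing the folded signal by the identically
// folded hann^2 envelope (env) removes the window gain so the overlapping
// frames reconstruct the waveform without amplitude ripple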
n_hop, n_pad, env); + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; } - LOG("\n"); - double sum = 0.0; - for (int i = 0; i < n_fft; ++i) { - sum += res[5*n_fft + i]; - } - LOG("sum: %f\n", sum); - { - LOG("result:\n"); - for (int i = 0; i < 10; ++i) { - LOG("%8.3f ", S[i]); - } - LOG("\n"); - for (int i = n_spec - 10; i < n_spec; ++i) { - LOG("%8.3f ", S[i]); - } - LOG("\n"); - double sum = 0.0; - for (int i = 0; i < n_spec; ++i) { - sum += S[i]; - } - LOG("sum: %f\n", sum); - } + //LOG("audio (%d):\n", audio.size()); + //for (int i = 0; i < 1000; ++i) { + // LOG("%d: %8.5f\n", i, audio[i]); + //} + //LOG("\n"); + //double sum = 0.0; + //for (int i = 0; i < 1000; ++i) { + // sum += audio[i]; + //} + //LOG("sum: %f\n", sum); + + //{ + // LOG("result:\n"); + // for (int i = 0; i < 10; ++i) { + // LOG("%8.3f ", S[i]); + // } + // LOG("\n"); + // for (int i = n_spec - 10; i < n_spec; ++i) { + // LOG("%8.3f ", S[i]); + // } + // LOG("\n"); + // double sum = 0.0; + // for (int i = 0; i < n_spec; ++i) { + // sum += S[i]; + // } + // LOG("sum: %f\n", sum); + //} fprintf(stderr, "\n"); From 191da330fc2c597d7aed1152d389e7e9849d3252 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 16:50:40 +0200 Subject: [PATCH 22/45] clean-up --- examples/tts/tts.cpp | 304 +++++++++++++++++++++++-------------------- ggml/src/ggml.c | 4 - src/llama.cpp | 5 - 3 files changed, 166 insertions(+), 147 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8eaf5a2623cc1..a3b923332be9b 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -107,26 +107,29 @@ static void irfft(int n, const double * inp_cplx, double * out_real) { } } -static void fold( - const std::vector & data, - int64_t output_size, - int64_t win_length, - int64_t hop_length, - int64_t pad, - std::vector& output -) { - int64_t output_height = output_size; - int64_t kernel_w = win_length; - int64_t stride_w = hop_length; - - int64_t width = output_size; +// +// y = torch.nn.functional.fold( +// data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), +// )[:, 0, 0, pad:-pad] +// +// data.shape = torch.Size([1, 1280, 261]) +// output_size = 84480 +// win_length = 1280 +// hop_length = 320 +// pad = 480 +// +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { + int64_t output_height = n_out; + int64_t kernel_w = n_win; + int64_t stride_w = n_hop; + int64_t width = n_out; output.resize(width, 0.0f); int64_t col_idx = 0; for (int64_t w_col = 0; w_col < width; ++w_col) { - int64_t start = w_col * stride_w - pad; - int64_t end = start + kernel_w; + int64_t start = w_col * stride_w - n_pad; + int64_t end = start + kernel_w; for (int64_t w_im = start; w_im < end; ++w_im) { if (w_im >= 0 && w_im < output_height) { @@ -136,7 +139,129 @@ static void fold( } } - output.resize(output_size - 2 * pad); + output.resize(n_out - 2 * n_pad); +} + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream 
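// note: wav_header above is the standard 44-byte RIFF/WAVE header for
// mono 16-bit PCM; save_wav16() scales each sample by 32767 and clamps it
// to the int16 range, so out-of-range samples clip instead of wrapping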
file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); + return; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + file.close(); +} + +static std::vector embd_to_audio( + const float * embd, + const std::vector & codes, + const int n_embd, + const int n_thread) { + const int n = codes.size(); + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + const int n_out = (n - 1)*n_hop + n_win; + + std::vector hann(n_fft); + + fill_hann_window(hann.size(), true, hann.data()); + + int n_spec = n_embd*n; + + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n + l] = embd[l*n_embd + k]; + } + } + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n; ++l) { + double mag = E[(k )*n + l]; + double phi = E[(k + n_embd/2)*n + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n + l) + 0] = mag*cosf(phi); + S[2*(k*n + l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + } + } + + std::vector res (n*n_fft); + std::vector hann2(n*n_fft); + + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; + } + + return audio; } int main(int argc, char ** argv) { @@ -178,7 +303,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - std::vector prompt_inp = {198, 88225, 155856, 151669, 152205, + std::vector codes = {198, 88225, 155856, 151669, 152205, 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, @@ -215,31 +340,33 @@ int main(int argc, char ** argv) { 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; { - const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true); + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); LOG_INF("prompt: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size()); + LOG_INF("%s: prompt size: %d\n", __func__, (int) codes.size()); } // remove all non-audio tokens (i.e. 
< 151672 || > 155772) - prompt_inp.erase(std::remove_if(prompt_inp.begin(), prompt_inp.end(), [](llama_token t) { return t < 151672 || t > 155772; }), prompt_inp.end()); + codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end()); { - const std::string inp_txt = common_detokenize(ctx_ttc, prompt_inp, true); + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); LOG_INF("prompt audio: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size()); + LOG_INF("%s: prompt audio size: %d\n", __func__, (int) codes.size()); } - for (auto & token : prompt_inp) { + for (auto & token : codes) { token -= 151672; } - llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1); + const auto t_voc_start = ggml_time_us(); + + llama_batch batch = llama_batch_init(codes.size(), 0, 1); // evaluate the initial prompt - for (size_t i = 0; i < prompt_inp.size(); ++i) { - common_batch_add(batch, prompt_inp[i], i, { 0 }, true); // TODO: all logits? + for (size_t i = 0; i < codes.size(); ++i) { + common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? } - GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size()); + GGML_ASSERT(batch.n_tokens == (int) codes.size()); if (llama_decode(ctx_cts, batch) != 0) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -248,126 +375,27 @@ int main(int argc, char ** argv) { llama_synchronize(ctx_cts); - LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f); + + const auto t_spec_start = ggml_time_us(); const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - const int n = prompt_inp.size(); - const int n_fft = 1280; - const int n_hop = 320; - const int n_win = 1280; - const int n_pad = (n_win - n_hop)/2; - const int n_out = (n - 1)*n_hop + n_win; - - std::vector hann(n_fft); - - fill_hann_window(hann.size(), true, hann.data()); - - int n_spec = n_embd*n; - - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); - - for (int l = 0; l < n; ++l) { - for (int k = 0; k < n_embd; ++k) { - E[k*n + l] = embd[l*n_embd + k]; - } - } - - for (int k = 0; k < n_embd/2; ++k) { - for (int l = 0; l < n; ++l) { - double mag = E[(k )*n + l]; - double phi = E[(k + n_embd/2)*n + l]; - - mag = exp(mag); - - if (mag > 1e2) { - mag = 1e2; - } - S[2*(k*n + l) + 0] = mag*cosf(phi); - S[2*(k*n + l) + 1] = mag*sinf(phi); - } - } + // spectral operations + // TODO: not optimized at all + auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); - for (int l = 0; l < n; ++l) { - for (int k = 0; k < n_embd/2; ++k) { - ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; - ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; - } - } + const std::string fname = "output.wav"; - std::vector res (n*n_fft); - std::vector hann2(n*n_fft); + const int n_sr = 24000; // sampling rate - const int n_thread = std::thread::hardware_concurrency(); - std::vector workers(n_thread); - for (int i = 0; i < n_thread; ++i) { - workers[i] = std::thread([&, i]() { - for (int l = i; l < n; l += n_thread) { - irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); - for (int j = 0; j < n_fft; ++j) { - res [l*n_fft + j] *= hann[j]; - hann2[l*n_fft + j] = hann[j] * hann[j]; - } - } - }); - } - for (int i = 0; i < n_thread; ++i) { - workers[i].join(); - } + LOG_INF("%s: time for spectral ops: 
%.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f); + LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); - //LOG("result (%d):\n", res.size()); - //for (int i = 0; i < n_fft; ++i) { - // LOG("%d - %8.5f\n", i, res[5*n_fft + i]); - //} - //LOG("\n"); - //double sum = 0.0; - //for (int i = 0; i < n_fft; ++i) { - // sum += res[5*n_fft + i]; - //} - //LOG("sum: %f\n", sum); - - std::vector audio; - std::vector env; - - fold(res, n_out, n_win, n_hop, n_pad, audio); - fold(hann2, n_out, n_win, n_hop, n_pad, env); - - for (size_t i = 0; i < audio.size(); ++i) { - audio[i] /= env[i]; - } + save_wav16(fname, audio, n_sr); - //LOG("audio (%d):\n", audio.size()); - //for (int i = 0; i < 1000; ++i) { - // LOG("%d: %8.5f\n", i, audio[i]); - //} - //LOG("\n"); - //double sum = 0.0; - //for (int i = 0; i < 1000; ++i) { - // sum += audio[i]; - //} - //LOG("sum: %f\n", sum); - - //{ - // LOG("result:\n"); - // for (int i = 0; i < 10; ++i) { - // LOG("%8.3f ", S[i]); - // } - // LOG("\n"); - // for (int i = n_spec - 10; i < n_spec; ++i) { - // LOG("%8.3f ", S[i]); - // } - // LOG("\n"); - // double sum = 0.0; - // for (int i = 0; i < n_spec; ++i) { - // sum += S[i]; - // } - // LOG("sum: %f\n", sum); - //} - - fprintf(stderr, "\n"); + LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str()); llama_free(ctx_ttc); llama_free_model(model_ttc); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7c0159ab49c9f..2bbe5f48257b2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3846,10 +3846,6 @@ struct ggml_tensor * ggml_conv_1d( int d0) { struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] - printf("a: %lld %lld %lld %lld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); - printf("b: %lld %lld %lld %lld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]); - printf("im2col: %lld %lld %lld %lld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]); - struct ggml_tensor * result = ggml_mul_mat(ctx, ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] diff --git a/src/llama.cpp b/src/llama.cpp index 07a00b2f63fb7..6397decd70c73 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17234,9 +17234,6 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); - printf("conv1d: %d %d %d\n", model.conv_1d->ne[0], model.conv_1d->ne[1], model.conv_1d->ne[2]); - cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); @@ -17445,8 +17442,6 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_embd", -1); - printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]); - ggml_build_forward_expand(gf, cur); return gf; From b9a011e1237a5196688e4c72ac858e61e10ffed4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:12:59 +0200 Subject: [PATCH 23/45] tts : receive input text and generate codes --- examples/tts/tts.cpp | 433 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 391 insertions(+), 42 deletions(-) diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index a3b923332be9b..8ba13944f3cf5 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -264,6 +264,25 @@ static std::vector embd_to_audio( return audio; } +static void prompt_add(llama_tokens & prompt, llama_token token) { + 
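// note: these prompt helpers assemble the OuteTTS input, which follows
// the layout (see the voice_data example further down):
//   <|im_start|>\n
//   <|text_start|>word<|text_sep|>word<|text_sep|>...<|text_end|>\n
//   <|audio_start|>\n
//   word<|t_X.XX|><|code_start|><|c1|><|c2|>...<|code_end|>\n
// where <|t_X.XX|> encodes the word duration and each <|cN|> is a
// vocoder codebook id that is later decoded into a waveform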
prompt.push_back(token); +} + +static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) { + prompt.insert(prompt.end(), tokens.begin(), tokens.end()); +} + +static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) { + auto tmp = common_tokenize(model, txt, add_special, parse_special); + prompt_add(prompt, tmp); +} + +static void prompt_init(llama_tokens & prompt, const llama_model * model) { + prompt.clear(); + + prompt_add(prompt, model, "<|im_start|>\n", true, true); +} + int main(int argc, char ** argv) { common_params params; @@ -273,10 +292,18 @@ int main(int argc, char ** argv) { params.n_batch = 8192; params.n_ctx = 8192; + params.sampling.top_k = 128; + params.sampling.penalty_repeat = 1.1; + params.sampling.penalty_last_n = 8; + params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, }; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { return 1; } + const int n_parallel = params.n_parallel; + const int n_predict = params.n_predict; + common_init(); // init LLM @@ -301,48 +328,371 @@ int main(int argc, char ** argv) { model_cts = llama_init_cts.model; ctx_cts = llama_init_cts.context; + std::vector smpl(n_parallel); + for (int i = 0; i < n_parallel; ++i) { + params.sampling.no_perf = (i != 0); + params.sampling.seed = params.sampling.seed + 1; + + smpl[i] = common_sampler_init(model_ttc, params.sampling); + } + + LOG_INF("%s: loading done\n", __func__); + const auto t_main_start = ggml_time_us(); - std::vector codes = {198, 88225, 155856, 151669, 152205, - 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, - 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, - 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, - 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296, - 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690, - 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061, - 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670, - 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683, - 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908, - 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359, - 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424, - 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670, - 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729, - 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669, - 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670, - 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501, - 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242, - 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360, - 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055, - 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670, - 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441, - 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831, - 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133, - 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109, - 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055, - 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729, - 
151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337, - 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153, - 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365, - 153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218, - 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464, - 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855, - 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418, - 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; + std::vector codes; + + // process prompt and generate voice codes + { + LOG_INF("%s: constructing prompt ..\n", __func__); + + std::vector prompt_inp; + + prompt_init(prompt_inp, model_ttc); + + prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true); + + // TODO: not sure if this is correct + { + std::string prompt_user = params.prompt; + std::string prompt_clean = ""; + //string_replace_all(prompt_user, " ", "<|text_sep|>"); + for (auto & c : prompt_user) { + if (c == ' ') { + prompt_clean += "<|text_sep|>"; + } else { + if (isalpha(c)) { + c = tolower(c); + } else { + continue; + } + prompt_clean += c; + } + } + + prompt_add(prompt_inp, model_ttc, prompt_clean, false, true); + } + + prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true); + + // disabled to save time on tokenizing each time + // TODO: load voices from the json files +#if 0 + const std::string voice_data = R"(<|audio_start|> +the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> 
+is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> 
+really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)"; + + auto tmp = common_tokenize(model_ttc, voice_data, false, true); + printf("\n\n"); + for (int i = 0; i < tmp.size(); ++i) { + printf("%d, ", tmp[i]); + } + printf("\n\n"); +#else + prompt_add(prompt_inp, llama_tokens { + 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, + 152460, 153375, 151670, 198, 74455, 155808, 151669, 151799, + 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, + 151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040, + 153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691, + 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, + 152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198, + 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, + 152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267, + 153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179, + 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, + 152311, 151670, 198, 1499, 155791, 151669, 152276, 152454, + 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, + 153043, 152325, 153267, 152622, 151670, 198, 4250, 155797, + 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, + 152112, 153204, 151722, 152542, 151670, 198, 19789, 155796, + 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, + 152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224, + 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, + 152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733, + 152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589, + 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, + 153376, 152272, 152433, 152325, 151941, 151670, 198, 285, + 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, + 152474, 152680, 152157, 153255, 152324, 151682, 151670, 198, + 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, + 153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685, + 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, + 151902, 152720, 153377, 152027, 152378, 152821, 
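// note: this literal token array is the pre-tokenized speaker profile,
// i.e. the output of common_tokenize() on the voice_data string in the
// #if 0 block above, inlined so each run avoids re-tokenizing it; ids in
// the 151672..155772 range are the audio codes extracted further below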
153207, 153459, + 153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609, + 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, + 152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018, + 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, + 153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457, + 152393, 153112, 152595, 151670, 198, 19098, 155808, 151669, + 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, + 153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673, + 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, + 152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, + 152111, 152746, 152377, 153471, 152309, 151670, 198, 19016, + 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, + 152939, 152536, 152091, 151815, 152733, 151672, 151670, 198, + 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, + 153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670, + 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, + 152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600, + 151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847, + 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, + 152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250, + 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, + 153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106, + 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, + 152901, 152885, 152594, 153446, 153080, 151670, 198, 14689, + 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, + 151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384, + 153364, 153188, 153246, 151670, 198, 1055, 155779, 151669, + 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, + 155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046, + 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, + 153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133, + 152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581, + 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, + 152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409, + 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, + 152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230, + 151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435, + 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, + 151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715, + 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, + 152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285, + 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, + 151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271, + 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, + 152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180, + 151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287, + 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, + 152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384, + 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, + 152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676, + 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, + 153135, 152291, 153235, 
152143, 152583, 152402, 153483, 152678, + 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, + 152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957, + 151752, 152265, 153381, 152515, 151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, + 151792, 153409, 153327, 152990, 151670, 198, 275, 155781, + 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, + 151670, 198, 94273, 155799, 151669, 152953, 152938, 153427, + 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, + 152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326, + 152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268, + 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, + 152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444, + 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, + 152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349, + 151670,}); +#endif + + // print the prompt token-by-token + + LOG("\n"); + + for (auto id : prompt_inp) { + LOG("%s", common_token_to_piece(ctx_ttc, id).c_str()); + } + + LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size()); + + LOG("\n"); + + // create a llama_batch + // we use this object to submit token data for decoding + llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel); + + std::vector seq_ids(n_parallel, 0); + for (int32_t i = 0; i < n_parallel; ++i) { + seq_ids[i] = i; + } + + // evaluate the initial prompt + for (size_t i = 0; i < prompt_inp.size(); ++i) { + common_batch_add(batch, prompt_inp[i], i, seq_ids, false); + } + GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size()); + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx_ttc, batch) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + if (n_parallel > 1) { + LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + llama_synchronize(ctx_ttc); + + LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + + const auto t_dec_start = ggml_time_us(); + + // main loop + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_past = batch.n_tokens; + int n_decode = 0; + + while (n_decode <= n_predict) { + // prepare the next batch + common_batch_clear(batch); + + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]); + + common_sampler_accept(smpl[i], new_token_id, true); + + codes.push_back(new_token_id); + + const auto * cands = common_sampler_get_candidates(smpl[i]); + + // is it an end of generation? 
-> mark the stream as finished + if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) { + std::string reason; + if (llama_token_is_eog(model_ttc, new_token_id)) { + reason = "eos"; + } else { + reason = "n_predict"; + } + + i_batch[i] = -1; + + LOG("\n"); + if (n_parallel > 1) { + LOG_CNT("\n"); + LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str()); + } + + continue; + } + + { + const float p = cands->data[cands->selected].p; + + const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size())))); + + LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m"); + //LOG_CNT("%d", i); + } + + i_batch[i] = batch.n_tokens; + + // push this new token for next evaluation + common_batch_add(batch, new_token_id, n_past, { i }, true); + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_decode += 1; + n_past += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx_ttc, batch)) { + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + } + + llama_batch_free(batch); + + LOG("\n"); + LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f); + } + + //std::vector codes = {198, 88225, 155856, 151669, 152205, + // 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, + // 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, + // 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, + // 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296, + // 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690, + // 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061, + // 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670, + // 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683, + // 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908, + // 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359, + // 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424, + // 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670, + // 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729, + // 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669, + // 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670, + // 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501, + // 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242, + // 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360, + // 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055, + // 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670, + // 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441, + // 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831, + // 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133, + // 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109, + // 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055, + // 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729, + // 151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337, + // 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153, + // 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365, + // 
153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218, + // 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464, + // 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855, + // 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418, + // 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; { const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); - LOG_INF("prompt: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt size: %d\n", __func__, (int) codes.size()); + + LOG("\n"); + LOG_INF("codes: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size()); } // remove all non-audio tokens (i.e. < 151672 || > 155772) @@ -350,8 +700,8 @@ int main(int argc, char ** argv) { { const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); - LOG_INF("prompt audio: '%s'\n", inp_txt.c_str()); - LOG_INF("%s: prompt audio size: %d\n", __func__, (int) codes.size()); + LOG_INF("codes audio: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size()); } for (auto & token : codes) { @@ -362,7 +712,6 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(codes.size(), 0, 1); - // evaluate the initial prompt for (size_t i = 0; i < codes.size(); ++i) { common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? } @@ -379,11 +728,11 @@ int main(int argc, char ** argv) { const auto t_spec_start = ggml_time_us(); + // spectral operations + // TODO: not optimized at all const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - // spectral operations - // TODO: not optimized at all auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); const std::string fname = "output.wav"; From db613915de3cc9e982b28294bf596a56a935c4ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:20:17 +0200 Subject: [PATCH 24/45] clip : fix new conv name --- examples/llava/clip.cpp | 6 +++--- examples/tts/tts.cpp | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ba28c07c6aeec..463b7c865b90c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -896,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] @@ -944,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_2 { // stride = 2 - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm @@ -1005,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = 
ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8ba13944f3cf5..e2306a5a8085b 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -651,6 +651,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f); } + common_perf_print(ctx_ttc, smpl[0]); + //std::vector codes = {198, 88225, 155856, 151669, 152205, // 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, // 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, From 8329e850cc17491c895a19a51a1c160539d879d0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 18:43:02 +0200 Subject: [PATCH 25/45] tts : minor fix --- examples/tts/convert_pt_to_hf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index c4a1185a83d47..62dcc05efc5c2 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -11,9 +11,6 @@ from safetensors.torch import save_file -# change path to script dir -os.chdir(os.path.dirname(os.path.abspath(__file__))) - # default model_path = './model.pt'; From d4fa34bdd4ec91f5f86477f3dff55e1c8dafae53 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Dec 2024 19:00:03 +0200 Subject: [PATCH 26/45] tts : add header + minor fixes ggml-ci --- convert_hf_to_gguf.py | 2 +- examples/tts/tts.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a86490831fdbd..c0ab0f955a496 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -297,7 +297,7 @@ def prepare_tensors(self): for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): # TODO: why do we squeeze here? 
-                #data = data_torch.squeeze().numpy()
+                # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()

                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index e2306a5a8085b..33ecb8aafe0f3 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -6,6 +6,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -354,9 +355,9 @@ int main(int argc, char ** argv) {
     // TODO: not sure if this is correct
     {
+        std::string prompt_clean;
         std::string prompt_user = params.prompt;
-        std::string prompt_clean = "";
-        //string_replace_all(prompt_user, " ", "<|text_sep|>");
+
         for (auto & c : prompt_user) {
             if (c == ' ') {
                 prompt_clean += "<|text_sep|>";

From 2221e54278eff46b6e8d5e14c1dfcd8074a27a11 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 20:31:20 +0200
Subject: [PATCH 27/45] tts : add mathematical constant

ggml-ci
---
 examples/tts/tts.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 33ecb8aafe0f3..c174efe30a524 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -4,6 +4,8 @@
 #include "log.h"
 #include "llama.h"

+#define _USE_MATH_DEFINES // For M_PI on MSVC
+
 #include
 #include
 #include

From 906a0edb5aa1a9c6700e71b5379a1b13e0900363 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 21:33:51 +0200
Subject: [PATCH 28/45] tts : fix sampling + cut initial noise

---
 examples/tts/tts.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index c174efe30a524..b9b2e7f9e21a6 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -295,7 +295,7 @@ int main(int argc, char ** argv) {
     params.n_batch = 8192;
     params.n_ctx   = 8192;

-    params.sampling.top_k = 128;
+    params.sampling.top_k = 4;
     params.sampling.penalty_repeat = 1.1;
     params.sampling.penalty_last_n = 8;
     params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, };
@@ -744,6 +744,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
     const int n_sr = 24000; // sampling rate

+    // zero out first 0.25 seconds
+    for (int i = 0; i < 24000/4; ++i) {
+        audio[i] = 0.0f;
+    }
+
     LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f);
     LOG_INF("%s: total time:             %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
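Note on the noise cut in PATCH 28: 24000/4 works out to 6000 samples, i.e. 0.25 s at the
24 kHz output rate (n_sr = 24000 in the hunk above). A minimal standalone sketch with the
duration made explicit; the cut_initial_noise helper and its cut_t parameter are
illustrative, not part of tts.cpp:

    #include <algorithm>
    #include <vector>

    // zero out the first cut_t seconds of audio at sample rate n_sr;
    // with n_sr = 24000 and cut_t = 0.25f this clears the same 6000 samples as the patch
    static void cut_initial_noise(std::vector<float> & audio, int n_sr, float cut_t = 0.25f) {
        const size_t n_cut = std::min(audio.size(), (size_t) (cut_t * n_sr));
        std::fill(audio.begin(), audio.begin() + n_cut, 0.0f);
    }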
From 1d7c27ca93aa2e016ea754b6f230d1d5015d9613 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Dec 2024 21:42:53 +0200
Subject: [PATCH 29/45] tts : fixes

---
 examples/tts/tts.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index b9b2e7f9e21a6..cf04c70607be4 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -135,7 +135,7 @@ static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win,
         int64_t end = start + kernel_w;

         for (int64_t w_im = start; w_im < end; ++w_im) {
-            if (w_im >= 0 && w_im < output_height) {
+            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
                 output[w_im] += data[col_idx];
             }
             col_idx++;
@@ -291,7 +291,7 @@ int main(int argc, char ** argv) {

     params.prompt = "";

-    params.n_predict = 1024;
+    params.n_predict = 4096;
     params.n_batch   = 8192;
     params.n_ctx     = 8192;
     params.sampling.top_k = 4;
@@ -364,7 +364,7 @@ int main(int argc, char ** argv) {
             if (c == ' ') {
                 prompt_clean += "<|text_sep|>";
             } else {
-                if (isalpha(c)) {
+                if (isalpha(c) || isdigit(c)) {
                     c = tolower(c);
                 } else {
                     continue;

From 3d54be4d840b623155787d9b5fc98941a2e82bd5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 13:21:36 +0200
Subject: [PATCH 30/45] tts : update default samplers

ggml-ci
---
 examples/tts/tts.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index cf04c70607be4..82bb6d724184a 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -296,9 +296,7 @@ int main(int argc, char ** argv) {
     params.n_ctx     = 8192;

     params.sampling.top_k = 4;
-    params.sampling.penalty_repeat = 1.1;
-    params.sampling.penalty_last_n = 8;
-    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TEMPERATURE, COMMON_SAMPLER_TYPE_TOP_K, };
+    params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, };

     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) {
         return 1;
     }

From befdcd249272313d654189b124b81d54532d4083 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 16 Dec 2024 13:37:36 +0200
Subject: [PATCH 31/45] tts : text pre-processing

---
 examples/tts/tts.cpp | 165 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 146 insertions(+), 19 deletions(-)

diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 82bb6d724184a..57956e5a17075 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -7,12 +7,14 @@
 #include
-#include
 #include
-#include
-#include
 #include
+#include
+#include
+#include
 #include
+#include

 //
 // Terminal utils
@@ -267,6 +269,143 @@ static std::vector<float> embd_to_audio(
     return audio;
 }

+static const std::map<int, std::string> ones = {
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words
+static std::string convert_less_than_thousand(int num) {
+    std::string result;
+
+    if (num >= 100) {
+        result += ones.at(num / 100) + " hundred ";
+        num %= 100;
+    }
+
+    if (num >= 20) {
+        result += tens.at(num / 10);
+        if (num % 10 > 0) {
+            result += "-" + ones.at(num % 10);
+        }
+    } else if (num > 0) {
+        result += ones.at(num);
+    }
+
+    return result;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    try {
+        size_t decimal_pos = number_str.find('.');
+        std::string integer_part = number_str.substr(0, decimal_pos);
+
+        int int_number = std::stoi(integer_part);
+        std::string result;
+
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            if (int_number >= 1000000000) {
+                int billions = int_number / 1000000000;
+                result += convert_less_than_thousand(billions) + " billion ";
+                int_number %= 1000000000;
+            }
+
+            if (int_number >= 1000000) {
+                int millions = int_number / 1000000;
+                result += convert_less_than_thousand(millions) + " million ";
+                int_number %= 1000000;
+            }
+
+            if (int_number >= 1000) {
+                int thousands = int_number / 1000;
+                result += convert_less_than_thousand(thousands) + " thousand ";
+                int_number %= 1000;
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand(int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            std::string decimal_part =
number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception& e) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text) {
+
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token same as in line 365
+
+        for (auto & c : prompt_user) {
+            if (c == ' ') {
+                prompt_clean += "<|text_sep|>";
+    */
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>");
+
+    return processed_text;
+}
+
 static void prompt_add(llama_tokens & prompt, llama_token token) {
     prompt.push_back(token);
 }
@@ -353,23 +492,11 @@ int main(int argc, char ** argv) {

     prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true);

-    // TODO: not sure if this is correct
+    // convert the input text into the necessary format expected by OuteTTS
     {
-        std::string prompt_clean;
-        std::string prompt_user = params.prompt;
-
-        for (auto & c : prompt_user) {
-            if (c == ' ') {
-                prompt_clean += "<|text_sep|>";
-            } else {
-                if (isalpha(c) || isdigit(c)) {
-                    c = tolower(c);
-                } else {
-                    continue;
-                }
-                prompt_clean += c;
-            }
-        }
+        std::string prompt_clean = process_text(params.prompt);
+
+        LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());

         prompt_add(prompt_inp, model_ttc, prompt_clean, false, true);
     }
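A quick sanity check of PATCH 31's pipeline, traced by hand through the regex passes; this
driver is illustrative only and assumes process_text() from above is visible in the
translation unit:

    #include <cstdio>
    #include <string>

    int main() {
        // "3.50" -> "three point five zero"; punctuation is stripped; spaces become <|text_sep|>
        const std::string out = process_text("Hello world, it costs $3.50!");

        // expected:
        // hello<|text_sep|>world<|text_sep|>it<|text_sep|>costs<|text_sep|>three<|text_sep|>point<|text_sep|>five<|text_sep|>zero
        printf("%s\n", out.c_str());

        return 0;
    }

From e70f140c04bcd94696ab35e48df6062148d5b0ca Mon Sep 17 00:00:00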
2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 13:51:09 +0200 Subject: [PATCH 32/45] tts : outetts-voc -> wavtokenizer-dec --- convert_hf_to_gguf.py | 6 +- examples/tts/convert_pt_to_hf.py | 4 +- gguf-py/gguf/constants.py | 214 +++++++++++++++---------------- gguf-py/gguf/tensor_mapping.py | 36 +++--- src/llama.cpp | 136 ++++++++++---------- 5 files changed, 198 insertions(+), 198 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c0ab0f955a496..8fc4c4f5655f4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2032,9 +2032,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data -@Model.register("OuteTTSVocoder") -class OuteTTSVocoderModel(Model): - model_arch = gguf.MODEL_ARCH.OUTETTS_VOC +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 62dcc05efc5c2..8d68290e2beb3 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -1,5 +1,5 @@ # convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format -# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTSS vocoder +# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder # # TODO: this script is LLM-generated and probably very inefficient and should be rewritten @@ -144,7 +144,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): config = { "architectures": [ - "OuteTTSVocoder" + "WavTokenizerDec" ], "hidden_size": 1282, "vocab_size": 4096, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f1f44c7d2c77a..af2a4f4f4d15a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -209,59 +209,59 @@ class GGUFType: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() - JINA_BERT_V2 = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - QWEN2VL = auto() - PHI2 = auto() - PHI3 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - MINICPM3 = auto() - GEMMA = auto() - GEMMA2 = auto() - STARCODER2 = auto() - RWKV6 = auto() - MAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() - DBRX = auto() - OLMO = auto() - OLMO2 = auto() - OLMOE = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK = auto() - DEEPSEEK2 = auto() - CHATGLM = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - NEMOTRON = auto() - EXAONE = auto() - GRANITE = auto() - GRANITE_MOE = auto() - CHAMELEON = auto() - OUTETTS_VOC = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + JINA_BERT_V2 = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + QWEN2VL = auto() + PHI2 = auto() + PHI3 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + MINICPM3 = 
auto() + GEMMA = auto() + GEMMA2 = auto() + STARCODER2 = auto() + RWKV6 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + DBRX = auto() + OLMO = auto() + OLMO2 = auto() + OLMOE = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK = auto() + DEEPSEEK2 = auto() + CHATGLM = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + NEMOTRON = auto() + EXAONE = auto() + GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() + WAVTOKENIZER_DEC = auto() class MODEL_TENSOR(IntEnum): @@ -390,59 +390,59 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.QWEN2VL: "qwen2vl", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.MINICPM3: "minicpm3", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV6: "rwkv6", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO2: "olmo2", - MODEL_ARCH.OLMOE: "olmoe", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK: "deepseek", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.NEMOTRON: "nemotron", - MODEL_ARCH.EXAONE: "exaone", - MODEL_ARCH.GRANITE: "granite", - MODEL_ARCH.GRANITE_MOE: "granitemoe", - MODEL_ARCH.CHAMELEON: "chameleon", - MODEL_ARCH.OUTETTS_VOC: "outetts-voc", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", + MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.OPENELM: "openelm", + 
MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1406,7 +1406,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], - MODEL_ARCH.OUTETTS_VOC: [ + MODEL_ARCH.WAVTOKENIZER_DEC: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.CONV1D, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5bf1f514a04f6..296f1ca054cf2 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -42,7 +42,7 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv - "backbone.norm", # outetts + "backbone.norm", # wavtokenizer ), # Position embeddings @@ -61,7 +61,7 @@ class TensorNameMap: "lm_head.linear", # phi2 "output_layer", # chatglm "head", # rwkv - "head.out", # outetts + "head.out", # wavtokenizer ), # Output norm @@ -82,7 +82,7 @@ class TensorNameMap: "transformer.norm", # openelm "model.norm", # nemotron "rwkv.ln_out", # rwkv - "backbone.final_layer_norm", # outetts + "backbone.final_layer_norm", # wavtokenizer ), # Rope frequencies @@ -705,63 +705,63 @@ class TensorNameMap: ############################################################################# MODEL_TENSOR.CONV_NEXT_DW: ( - "backbone.convnext.{bid}.dwconv", # outetts + "backbone.convnext.{bid}.dwconv", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_NORM: ( - "backbone.convnext.{bid}.norm", # outetts + "backbone.convnext.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_PW1: ( - "backbone.convnext.{bid}.pwconv1", # outetts + "backbone.convnext.{bid}.pwconv1", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_PW2: ( - "backbone.convnext.{bid}.pwconv2", # outetts + "backbone.convnext.{bid}.pwconv2", # wavtokenizer ), MODEL_TENSOR.CONV_NEXT_GAMMA: ( - "backbone.convnext.{bid}.gamma", # outetts + "backbone.convnext.{bid}.gamma", # wavtokenizer ), MODEL_TENSOR.POS_NET_CONV1: ( - "backbone.pos_net.{bid}.conv1", # outetts + "backbone.pos_net.{bid}.conv1", # wavtokenizer ), MODEL_TENSOR.POS_NET_CONV2: ( - "backbone.pos_net.{bid}.conv2", # outetts + "backbone.pos_net.{bid}.conv2", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM: ( - "backbone.pos_net.{bid}.norm", # outetts + "backbone.pos_net.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM1: ( - "backbone.pos_net.{bid}.norm1", # outetts + "backbone.pos_net.{bid}.norm1", # wavtokenizer ), MODEL_TENSOR.POS_NET_NORM2: ( - "backbone.pos_net.{bid}.norm2", # outetts + "backbone.pos_net.{bid}.norm2", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_NORM: ( - "backbone.pos_net.{bid}.norm", # outetts + "backbone.pos_net.{bid}.norm", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_Q: ( - "backbone.pos_net.{bid}.q", # outetts + "backbone.pos_net.{bid}.q", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_K: ( - "backbone.pos_net.{bid}.k", # outetts + "backbone.pos_net.{bid}.k", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_V: ( - "backbone.pos_net.{bid}.v", # outetts + "backbone.pos_net.{bid}.v", # wavtokenizer ), MODEL_TENSOR.POS_NET_ATTN_OUT: ( - "backbone.pos_net.{bid}.proj_out", # outetts + 
"backbone.pos_net.{bid}.proj_out", # wavtokenizer ), } diff --git a/src/llama.cpp b/src/llama.cpp index 6397decd70c73..9aa5ad17a5bdb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -197,65 +197,65 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, - LLM_ARCH_OUTETTS_VOC, + LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GROK, "grok" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, - { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_QWEN2MOE, "qwen2moe" }, - { LLM_ARCH_QWEN2VL, "qwen2vl" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PHI3, "phi3" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_MINICPM3, "minicpm3" }, - { LLM_ARCH_GEMMA, "gemma" }, - { LLM_ARCH_GEMMA2, "gemma2" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, - { LLM_ARCH_COMMAND_R, "command-r" }, - { LLM_ARCH_DBRX, "dbrx" }, - { LLM_ARCH_OLMO, "olmo" }, - { LLM_ARCH_OLMO2, "olmo2" }, - { LLM_ARCH_OLMOE, "olmoe" }, - { LLM_ARCH_OPENELM, "openelm" }, - { LLM_ARCH_ARCTIC, "arctic" }, - { LLM_ARCH_DEEPSEEK, "deepseek" }, - { LLM_ARCH_DEEPSEEK2, "deepseek2" }, - { LLM_ARCH_CHATGLM, "chatglm" }, - { LLM_ARCH_BITNET, "bitnet" }, - { LLM_ARCH_T5, "t5" }, - { LLM_ARCH_T5ENCODER, "t5encoder" }, - { LLM_ARCH_JAIS, "jais" }, - { LLM_ARCH_NEMOTRON, "nemotron" }, - { LLM_ARCH_EXAONE, "exaone" }, - { LLM_ARCH_RWKV6, "rwkv6" }, - { LLM_ARCH_GRANITE, "granite" }, - { LLM_ARCH_GRANITE_MOE, "granitemoe" }, - { LLM_ARCH_CHAMELEON, "chameleon" }, - { LLM_ARCH_OUTETTS_VOC, "outetts-voc" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_QWEN2MOE, "qwen2moe" }, + { LLM_ARCH_QWEN2VL, "qwen2vl" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, + { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, + { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMO2, "olmo2" }, + { LLM_ARCH_OLMOE, "olmoe" }, + { LLM_ARCH_OPENELM, "openelm" }, + { LLM_ARCH_ARCTIC, "arctic" }, + { 
LLM_ARCH_DEEPSEEK, "deepseek" }, + { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_CHATGLM, "chatglm" }, + { LLM_ARCH_BITNET, "bitnet" }, + { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, + { LLM_ARCH_JAIS, "jais" }, + { LLM_ARCH_NEMOTRON, "nemotron" }, + { LLM_ARCH_EXAONE, "exaone" }, + { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, + { LLM_ARCH_GRANITE_MOE, "granitemoe" }, + { LLM_ARCH_CHAMELEON, "chameleon" }, + { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -1612,7 +1612,7 @@ static const std::map> LLM_TENSOR_N }, }, { - LLM_ARCH_OUTETTS_VOC, + LLM_ARCH_WAVTOKENIZER_DEC, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, @@ -3063,7 +3063,7 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - // outetts vocoder + // wavtokenizer decoder // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; @@ -6443,7 +6443,7 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); } break; @@ -9545,7 +9545,7 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); @@ -16142,7 +16142,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_encoder() { + struct ggml_cgraph * build_t5_enc() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -16274,7 +16274,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * build_t5_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -17224,7 +17224,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_outetts_voc() { + struct ggml_cgraph * build_wavtokenizer_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_tensor * cur; @@ -17692,14 +17692,14 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_T5: { if (lctx.is_encoding) { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } else { - result = llm.build_t5_decoder(); + result = llm.build_t5_dec(); } } break; case LLM_ARCH_T5ENCODER: { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } break; case LLM_ARCH_JAIS: { @@ -17721,9 +17721,9 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_chameleon(); } break; - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: { - result = llm.build_outetts_voc(); + result = llm.build_wavtokenizer_dec(); } break; default: GGML_ABORT("fatal error"); @@ -20904,7 +20904,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: - case LLM_ARCH_OUTETTS_VOC: + case LLM_ARCH_WAVTOKENIZER_DEC: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head 
values From c096bbd8ddfc778738498bd005e8ada156bd9406 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 15:07:38 +0200 Subject: [PATCH 33/45] tts : remove hardcoded constants ggml-ci --- convert_hf_to_gguf.py | 6 +- examples/tts/convert_pt_to_hf.py | 4 + gguf-py/gguf/constants.py | 3 + gguf-py/gguf/gguf_writer.py | 12 +++ src/llama.cpp | 151 ++++++++++++++++++------------- 5 files changed, 110 insertions(+), 66 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8fc4c4f5655f4..bf6da96017058 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2055,7 +2055,11 @@ def set_vocab(self): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_posnet_length (self.hparams["n_embd_posnet"]) + self.gguf_writer.add_convnext_length (self.hparams["n_embd_convnext"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) @Model.register("Qwen2MoeForCausalLM") diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 8d68290e2beb3..4938515af62ad 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -147,6 +147,10 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "WavTokenizerDec" ], "hidden_size": 1282, + "n_embd_features": 512, + "n_embd_posnet": 768, + "n_embd_convnext": 768, + "n_ff": 2304, "vocab_size": 4096, "n_head": 1, "layer_norm_epsilon": 1e-6, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index af2a4f4f4d15a..e8a31f1144434 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,9 @@ class LLM: VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" + POSNET_LENGTH = "{arch}.posnet_length" + CONVNEXT_LENGTH = "{arch}.convnext_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 65a64e10dd33f..8266ba99fd06a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -631,6 +631,18 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_features_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) + + def add_posnet_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.POSNET_LENGTH.format(arch=self.arch), length) + + def add_convnext_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.CONVNEXT_LENGTH.format(arch=self.arch), length) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/src/llama.cpp b/src/llama.cpp index 9aa5ad17a5bdb..f0c4b5e624066 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -77,6 +77,7 @@ #endif // bump if necessary +#define LLAMA_MAX_EMBD 8 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -275,6 +276,9 @@ enum llm_kv { 
LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, + LLM_KV_FEATURES_LENGTH, + LLM_KV_POSNET_LENGTH, + LLM_KV_CONVNEXT_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, @@ -392,6 +396,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, + { LLM_KV_POSNET_LENGTH, "%s.posnet_length" }, + { LLM_KV_CONVNEXT_LENGTH, "%s.convnext_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, @@ -2545,6 +2552,11 @@ struct llama_hparams { uint32_t n_vocab_type = 0; // for BERT-style token types uint32_t n_rel_attn_bkts = 0; + // for WavTokenizer + uint32_t n_embd_features = 0; + uint32_t n_embd_posnet = 0; + uint32_t n_embd_convnext = 0; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; @@ -5684,6 +5696,12 @@ static void llm_load_hparams( ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) { + ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); + ml.get_key(LLM_KV_POSNET_LENGTH, hparams.n_embd_posnet); + ml.get_key(LLM_KV_CONVNEXT_LENGTH, hparams.n_embd_convnext); + } + GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); if (hparams.n_expert > 0) { @@ -5692,7 +5710,7 @@ static void llm_load_hparams( GGML_ASSERT(hparams.n_expert_used == 0); } - // zero-out the per-layer hparams + // zero-out the array hparams std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); @@ -7577,7 +7595,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_IM2COL: { - int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; @@ -9547,104 +9565,107 @@ static bool llm_load_tensors( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); + const int64_t n_embd_features = hparams.n_embd_features; + const int64_t n_embd_posnet = hparams.n_embd_posnet; + const int64_t n_embd_convnext = hparams.n_embd_convnext; + + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd_features, n_vocab}, 0); - model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0); - model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd_posnet}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {n_embd_posnet}, 0); - 
model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {768}, 0); + model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {n_embd_posnet}, 0); + model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, 768, 768}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {768}, 0); + model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {768}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {768}, 0); + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {n_embd_posnet}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {n_embd_posnet}, 0); - model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, 768, 768}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {768}, 0); + model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {n_embd_posnet}, 0); - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {768}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {768}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {n_embd_posnet}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, 768, 768}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {768}, 0); + model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {768}, 0); - model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {768}, 0); + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {n_embd_posnet}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {n_embd_posnet}, 0); - model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, 768, 768}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {768}, 0); + model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {n_embd_posnet}, 0); - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {768}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {768}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {n_embd_posnet}, 0); + 
model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {768}, 0); + model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {768}, 0); + model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {768}, 0); + model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, 768, 768}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {768}, 0); + model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {n_embd_posnet}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {768}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {768}, 0); + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {n_embd_posnet}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, 768, 768}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {768}, 0); + model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {768}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {768}, 0); + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {n_embd_posnet}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {n_embd_posnet}, 0); - model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, 768, 768}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {768}, 0); + model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {n_embd_posnet}, 0); - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {768}, 0); - 
model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {768}, 0); + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {n_embd_posnet}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, 768, 768}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {768}, 0); + model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {768}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {768}, 0); + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {n_embd_posnet}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {n_embd_posnet}, 0); - model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, 768, 768}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {768}, 0); + model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {n_embd_posnet}, 0); - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {768}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {768}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {n_embd_posnet}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {n_embd_posnet}, 0); for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; - layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, 768}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {768}, 0); + layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {n_embd_convnext}, 0); - layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {768}, 0); - layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {768}, 0); + layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); + layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); - // TODO: n_ff - layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {768, 2304}, 0); - layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {2304}, 0); + layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {n_embd_convnext, n_ff}, 0); + layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {n_ff}, 0); - layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {2304, 768}, 0); - layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {768}, 0); + layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {n_ff, n_embd_convnext}, 0); + layer.convnext_pw2_b = 
create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {n_embd_convnext}, 0);

-                    layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {768}, 0);
+                    layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {n_embd_convnext}, 0);
                 }

                 // output
-                model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
-                model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {768}, 0);
+                model.output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_convnext}, 0);
+                model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd_convnext}, 0);

-                model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0);
+                model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd_convnext, n_embd}, 0);
                 model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
             } break;
         default:
@@ -17317,7 +17338,7 @@ struct llm_build_context {

                 struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);

-                kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(768)), 0.0f);
+                kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(model.hparams.n_embd_posnet)), 0.0f);

                 cur = ggml_mul_mat(ctx0, kq, v);
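The soft-max change above keeps the standard scaled dot-product attention factor 1/sqrt(d)
but derives d from the loaded hyperparameters instead of the literal 768. For the
WavTokenizer checkpoint converted earlier (n_embd_posnet = 768) the two are numerically
identical: 1/sqrt(768) is approximately 0.03608. A self-contained check (names are
illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_embd_posnet = 768; // PosNet width written by the converter for this model
        const float scale = 1.0f / sqrtf((float) n_embd_posnet);
        printf("attn scale = %.5f\n", scale); // prints: attn scale = 0.03608
        return 0;
    }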
"n_head": 1, "layer_norm_epsilon": 1e-6, + "group_norm_epsilon": 1e-6, + "group_norm_groups": 32, "max_position_embeddings": 8192, # ? "num_hidden_layers": 12 } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e8a31f1144434..139d0d15f7e06 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -125,6 +125,8 @@ class Attention: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8266ba99fd06a..5bb2a8af87e25 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -631,9 +631,6 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_embedding_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_features_length(self, length: int) -> None: self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) @@ -739,6 +736,12 @@ def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_group_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value) + + def add_group_norm_groups(self, value: int) -> None: + self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py index 762067814224e..f04d5acce2793 100755 --- a/gguf-py/tests/test_quants.py +++ b/gguf-py/tests/test_quants.py @@ -136,7 +136,7 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") sum_diff_bits = np.sum(diff_bits) - logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)") + logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)") return False diff --git a/src/llama.cpp b/src/llama.cpp index f0c4b5e624066..4c298d4062cc3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -310,6 +310,8 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + LLM_KV_ATTENTION_GROUPNORM_EPS, + LLM_KV_ATTENTION_GROUPNORM_GROUPS, LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK, @@ -430,6 +432,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { 
LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, @@ -2571,6 +2575,9 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float f_norm_group_eps; + + uint32_t n_norm_groups; float f_attn_logit_softcapping = 50.0f; float f_final_logit_softcapping = 30.0f; @@ -6463,7 +6470,9 @@ static void llm_load_hparams( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); } break; default: (void)0; } @@ -9575,79 +9584,79 @@ static bool llm_load_tensors( model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {n_embd_posnet}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, n_embd_posnet}, 0); - model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {n_embd_posnet}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {1, n_embd_posnet}, 0); + model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {1, n_embd_posnet}, 0); model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {n_embd_posnet}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {1, n_embd_posnet}, 0); + model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {1, n_embd_posnet}, 0); model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {n_embd_posnet}, 0); + model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {1, n_embd_posnet}, 0); - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {n_embd_posnet}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {1, n_embd_posnet}, 0); + model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {1, n_embd_posnet}, 0); model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {n_embd_posnet}, 0); - model.posnet_1_norm2_b = 
create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {1, n_embd_posnet}, 0); + model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {1, n_embd_posnet}, 0); model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {n_embd_posnet}, 0); + model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {1, n_embd_posnet}, 0); - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {n_embd_posnet}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {1, n_embd_posnet}, 0); + model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {1, n_embd_posnet}, 0); model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {n_embd_posnet}, 0); + model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {1, n_embd_posnet}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {n_embd_posnet}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {1, n_embd_posnet}, 0); + model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {1, n_embd_posnet}, 0); model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {n_embd_posnet}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {1, n_embd_posnet}, 0); + model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {1, 
n_embd_posnet}, 0); model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {n_embd_posnet}, 0); + model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {1, n_embd_posnet}, 0); - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {n_embd_posnet}, 0); - model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {1, n_embd_posnet}, 0); + model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {1, n_embd_posnet}, 0); model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {n_embd_posnet}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {1, n_embd_posnet}, 0); + model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {1, n_embd_posnet}, 0); model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {n_embd_posnet}, 0); + model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {1, n_embd_posnet}, 0); - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {n_embd_posnet}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {n_embd_posnet}, 0); + model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {1, n_embd_posnet}, 0); + model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {1, n_embd_posnet}, 0); for (int i = 0; i < n_layer; ++i) { auto & layer = model.layers[i]; layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {n_embd_convnext}, 0); + layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {1, n_embd_convnext}, 0); layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); @@ -10033,9 +10042,8 @@ static struct ggml_tensor * llm_build_norm( case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; case LLM_NORM_GROUP: { - // TODO: these reshapes should be removed, fix ggml_group_norm cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); - cur = ggml_group_norm(ctx, cur, 32, 1e-6); // TODO: add groups, eps params + cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); } break; } @@ -17256,31 +17264,31 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); - cur = 
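// [editor's note] The llm_build_norm() hunk above wires the two new hparams
// (n_norm_groups, f_norm_group_eps) into the group-norm path instead of the
// hard-coded 32 / 1e-6. A minimal sketch of that path, assuming a valid
// ggml_context and a 2D activation tensor (the helper name is hypothetical;
// the body mirrors the LLM_NORM_GROUP case in this patch):

static struct ggml_tensor * build_group_norm_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,      // 2D activations
        uint32_t              n_groups, // hparams.n_norm_groups
        float                 eps) {    // hparams.f_norm_group_eps
    // ggml_group_norm expects a 3D tensor, hence the reshapes around it
    cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
    cur = ggml_group_norm(ctx, cur, n_groups, eps);
    cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
    return cur;
}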
ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.conv_1d_b, 1, model.conv_1d_b->ne[0])); + cur = ggml_add(ctx0, cur, model.conv_1d_b); inpL = cur; // resnet block 0 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm1, 1, model.posnet_0_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm1_b, 1, model.posnet_0_norm1_b->ne[0]), + model.posnet_0_norm1, + model.posnet_0_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv1_b, 1, model.posnet_0_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_0_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_0_norm2, 1, model.posnet_0_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_0_norm2_b, 1, model.posnet_0_norm2_b->ne[0]), + model.posnet_0_norm2, + model.posnet_0_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_0_conv2_b, 1, model.posnet_0_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_0_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17290,24 +17298,24 @@ struct llm_build_context { // resnet block 1 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_1_norm1, 1, model.posnet_1_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_1_norm1_b, 1, model.posnet_1_norm1_b->ne[0]), + model.posnet_1_norm1, + model.posnet_1_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv1_b, 1, model.posnet_1_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_1_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_1_norm2, 1, model.posnet_1_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_1_norm2_b, 1, model.posnet_1_norm2_b->ne[0]), + model.posnet_1_norm2, + model.posnet_1_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_1_conv2_b, 1, model.posnet_1_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_1_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17317,8 +17325,8 @@ struct llm_build_context { // attention block { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_2_attn_norm, 1, model.posnet_2_attn_norm->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_2_attn_norm_b, 1, model.posnet_2_attn_norm_b->ne[0]), + model.posnet_2_attn_norm, + model.posnet_2_attn_norm_b, LLM_NORM_GROUP, cb, 0); struct ggml_tensor * q; @@ -17329,9 +17337,9 @@ struct llm_build_context { k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); - q = ggml_add(ctx0, q, ggml_reshape_2d(ctx0, model.posnet_2_attn_q_b, 1, model.posnet_2_attn_q_b->ne[0])); - k = ggml_add(ctx0, k, ggml_reshape_2d(ctx0, model.posnet_2_attn_k_b, 1, model.posnet_2_attn_k_b->ne[0])); - v = ggml_add(ctx0, v, ggml_reshape_2d(ctx0, model.posnet_2_attn_v_b, 1, model.posnet_2_attn_v_b->ne[0])); + q = ggml_add(ctx0, q, model.posnet_2_attn_q_b); + k = ggml_add(ctx0, k, model.posnet_2_attn_k_b); + v = ggml_add(ctx0, v, 
model.posnet_2_attn_v_b); q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); @@ -17343,7 +17351,7 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, kq, v); cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_2_attn_o_b, 1, model.posnet_2_attn_o_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_2_attn_o_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17353,24 +17361,24 @@ struct llm_build_context { // resnet block 3 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_3_norm1, 1, model.posnet_3_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_3_norm1_b, 1, model.posnet_3_norm1_b->ne[0]), + model.posnet_3_norm1, + model.posnet_3_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv1_b, 1, model.posnet_3_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_3_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_3_norm2, 1, model.posnet_3_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_3_norm2_b, 1, model.posnet_3_norm2_b->ne[0]), + model.posnet_3_norm2, + model.posnet_3_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_3_conv2_b, 1, model.posnet_3_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_3_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17380,24 +17388,24 @@ struct llm_build_context { // resnet block 4 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_4_norm1, 1, model.posnet_4_norm1->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_4_norm1_b, 1, model.posnet_4_norm1_b->ne[0]), + model.posnet_4_norm1, + model.posnet_4_norm1_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv1_b, 1, model.posnet_4_conv1_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_4_conv1_b); cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_4_norm2, 1, model.posnet_4_norm2->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_4_norm2_b, 1, model.posnet_4_norm2_b->ne[0]), + model.posnet_4_norm2, + model.posnet_4_norm2_b, LLM_NORM_GROUP, cb, 0); cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.posnet_4_conv2_b, 1, model.posnet_4_conv2_b->ne[0])); + cur = ggml_add(ctx0, cur, model.posnet_4_conv2_b); cur = ggml_add(ctx0, cur, inpL); } @@ -17405,8 +17413,8 @@ struct llm_build_context { // normalize block 5 { cur = llm_build_norm(ctx0, cur, hparams, - ggml_reshape_2d(ctx0, model.posnet_5_norm, 1, model.posnet_5_norm->ne[0]), - ggml_reshape_2d(ctx0, model.posnet_5_norm_b, 1, model.posnet_5_norm_b->ne[0]), + model.posnet_5_norm, + model.posnet_5_norm_b, LLM_NORM_GROUP, cb, 0); } @@ -17425,7 +17433,7 @@ struct llm_build_context { cur = inpL; cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, model.layers[il].convnext_dw_b, 1, model.layers[il].convnext_dw_b->ne[0])); + cur = ggml_add(ctx0, cur, 
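// [editor's note] In the attention block above, the q/k/v/output projections
// are ggml_conv_1d_ph() calls with 1-wide kernels ({1, n_embd, n_embd}
// weights), i.e. per-position linear maps over the channel dimension, and
// the block computes single-head scaled dot-product attention over the time
// axis with scale 1/sqrt(n_embd_posnet). The recurring pattern
// ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur) in the resnet blocks is the
// SiLU/Swish activation, x * sigmoid(x).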
model.layers[il].convnext_dw_b); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); From 980d6310322859cc957d77dbfcdee92b62260302 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 19:21:50 +0200 Subject: [PATCH 35/45] llama : refactor wavtokenizer tensors ggml-ci --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 12 +- examples/tts/convert_pt_to_hf.py | 27 +- examples/tts/tts.cpp | 4 + gguf-py/gguf/constants.py | 100 ++--- gguf-py/gguf/gguf_writer.py | 14 +- gguf-py/gguf/tensor_mapping.py | 50 +-- src/llama.cpp | 694 ++++++++++++------------------- 8 files changed, 394 insertions(+), 509 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 3d03c676c2b65..93c15ecdcb4d2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -842,7 +842,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--sampling-seq"}, "SEQUENCE", + {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ad423d0b2625b..7bf67a268183c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -326,8 +326,8 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, - gguf.MODEL_TENSOR.POS_NET_NORM1, - gguf.MODEL_TENSOR.POS_NET_NORM2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -2059,12 +2059,16 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) - self.gguf_writer.add_posnet_length (self.hparams["n_embd_posnet"]) - self.gguf_writer.add_convnext_length (self.hparams["n_embd_convnext"]) self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index 99fef33d27815..adba21a3661a8 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -74,12 +74,13 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): new_key = key new_key = new_key.replace('state_dict.', '') + new_key = new_key.replace('pos_net', 'posnet') - # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight" - if new_key.startswith("backbone.pos_net."): - match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key) + # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight" + if new_key.startswith("backbone.posnet."): + match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key) if match: - new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}" + new_key = 
f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}" # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight" if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": @@ -99,7 +100,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): new_key = new_key.replace("gamma", "gamma.weight") # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias - if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.pos_net") or new_key.startswith("backbone.embed.bias")): + if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")): value = value.unsqueeze(1) if new_key.endswith("dwconv.bias"): @@ -155,8 +156,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): ], "hidden_size": 1282, "n_embd_features": 512, - "n_embd_posnet": 768, - "n_embd_convnext": 768, "n_ff": 2304, "vocab_size": 4096, "n_head": 1, @@ -164,7 +163,19 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "group_norm_epsilon": 1e-6, "group_norm_groups": 32, "max_position_embeddings": 8192, # ? - "num_hidden_layers": 12 + "n_layer": 12, + "posnet": { + "n_embd": 768, + "n_layer": 6 + }, + "convnext": { + "n_embd": 768, + "n_layer": 12 + }, + #"n_embd_posnet": 768, + #"n_embd_convnext": 768, + #"n_layer_posnet": 6, + #"n_layer_convnext": 12 } with open(path_dst + '/config.json', 'w') as f: diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 57956e5a17075..aa5508be382d4 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -476,6 +476,10 @@ int main(int argc, char ** argv) { smpl[i] = common_sampler_init(model_ttc, params.sampling); } + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0])); + LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str()); + LOG_INF("%s: loading done\n", __func__); const auto t_main_start = ggml_time_us(); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 139d0d15f7e06..a40df974d1fca 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -91,8 +91,6 @@ class LLM: CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" FEATURES_LENGTH = "{arch}.features_length" - POSNET_LENGTH = "{arch}.posnet_length" - CONVNEXT_LENGTH = "{arch}.convnext_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -160,6 +158,14 @@ class SSM: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -377,21 +383,21 @@ class MODEL_TENSOR(IntEnum): CLS = auto() # classifier CLS_OUT = auto() # classifier output projection CONV1D = auto() - CONV_NEXT_DW = auto() - CONV_NEXT_NORM = auto() - CONV_NEXT_PW1 = auto() - CONV_NEXT_PW2 = auto() - CONV_NEXT_GAMMA = auto() - POS_NET_CONV1 = auto() - POS_NET_CONV2 = auto() - POS_NET_NORM = auto() - POS_NET_NORM1 = 
auto() - POS_NET_NORM2 = auto() - POS_NET_ATTN_NORM = auto() - POS_NET_ATTN_Q = auto() - POS_NET_ATTN_K = auto() - POS_NET_ATTN_V = auto() - POS_NET_ATTN_OUT = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -558,21 +564,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CLS: "cls", MODEL_TENSOR.CLS_OUT: "cls.output", MODEL_TENSOR.CONV1D: "conv1d", - MODEL_TENSOR.CONV_NEXT_DW: "conv_next.{bid}.dw", - MODEL_TENSOR.CONV_NEXT_NORM: "conv_next.{bid}.norm", - MODEL_TENSOR.CONV_NEXT_PW1: "conv_next.{bid}.pw1", - MODEL_TENSOR.CONV_NEXT_PW2: "conv_next.{bid}.pw2", - MODEL_TENSOR.CONV_NEXT_GAMMA: "conv_next.{bid}.gamma", - MODEL_TENSOR.POS_NET_CONV1: "pos_net.{bid}.conv1", - MODEL_TENSOR.POS_NET_CONV2: "pos_net.{bid}.conv2", - MODEL_TENSOR.POS_NET_NORM: "pos_net.{bid}.norm", - MODEL_TENSOR.POS_NET_NORM1: "pos_net.{bid}.norm1", - MODEL_TENSOR.POS_NET_NORM2: "pos_net.{bid}.norm2", - MODEL_TENSOR.POS_NET_ATTN_NORM: "pos_net.{bid}.attn_norm", - MODEL_TENSOR.POS_NET_ATTN_Q: "pos_net.{bid}.attn_q", - MODEL_TENSOR.POS_NET_ATTN_K: "pos_net.{bid}.attn_k", - MODEL_TENSOR.POS_NET_ATTN_V: "pos_net.{bid}.attn_v", - MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1415,23 +1421,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.CONV1D, - MODEL_TENSOR.CONV_NEXT_DW, - MODEL_TENSOR.CONV_NEXT_NORM, - MODEL_TENSOR.CONV_NEXT_PW1, - MODEL_TENSOR.CONV_NEXT_PW2, - MODEL_TENSOR.CONV_NEXT_GAMMA, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.CONVNEXT_GAMMA, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.POS_NET_CONV1, - MODEL_TENSOR.POS_NET_CONV2, - MODEL_TENSOR.POS_NET_NORM, - MODEL_TENSOR.POS_NET_NORM1, - MODEL_TENSOR.POS_NET_NORM2, - MODEL_TENSOR.POS_NET_ATTN_NORM, - MODEL_TENSOR.POS_NET_ATTN_Q, - MODEL_TENSOR.POS_NET_ATTN_K, - MODEL_TENSOR.POS_NET_ATTN_V, - MODEL_TENSOR.POS_NET_ATTN_OUT, + MODEL_TENSOR.POSNET_CONV1, + MODEL_TENSOR.POSNET_CONV2, + MODEL_TENSOR.POSNET_NORM, + MODEL_TENSOR.POSNET_NORM1, + MODEL_TENSOR.POSNET_NORM2, + MODEL_TENSOR.POSNET_ATTN_NORM, + MODEL_TENSOR.POSNET_ATTN_Q, + MODEL_TENSOR.POSNET_ATTN_K, + MODEL_TENSOR.POSNET_ATTN_V, + MODEL_TENSOR.POSNET_ATTN_OUT, ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py 
b/gguf-py/gguf/gguf_writer.py index 5bb2a8af87e25..3023b539ae82b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -634,11 +634,17 @@ def add_embedding_length(self, length: int) -> None: def add_features_length(self, length: int) -> None: self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) - def add_posnet_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.POSNET_LENGTH.format(arch=self.arch), length) + def add_posnet_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length) - def add_convnext_length(self, length: int) -> None: - self.add_uint32(Keys.LLM.CONVNEXT_LENGTH.format(arch=self.arch), length) + def add_posnet_block_count(self, length: int) -> None: + self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length) + + def add_convnext_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_convnext_block_count(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 296f1ca054cf2..82cdb121a1f26 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -704,64 +704,64 @@ class TensorNameMap: ), ############################################################################# - MODEL_TENSOR.CONV_NEXT_DW: ( + MODEL_TENSOR.CONVNEXT_DW: ( "backbone.convnext.{bid}.dwconv", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_NORM: ( + MODEL_TENSOR.CONVNEXT_NORM: ( "backbone.convnext.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_PW1: ( + MODEL_TENSOR.CONVNEXT_PW1: ( "backbone.convnext.{bid}.pwconv1", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_PW2: ( + MODEL_TENSOR.CONVNEXT_PW2: ( "backbone.convnext.{bid}.pwconv2", # wavtokenizer ), - MODEL_TENSOR.CONV_NEXT_GAMMA: ( + MODEL_TENSOR.CONVNEXT_GAMMA: ( "backbone.convnext.{bid}.gamma", # wavtokenizer ), - MODEL_TENSOR.POS_NET_CONV1: ( - "backbone.pos_net.{bid}.conv1", # wavtokenizer + MODEL_TENSOR.POSNET_CONV1: ( + "backbone.posnet.{bid}.conv1", # wavtokenizer ), - MODEL_TENSOR.POS_NET_CONV2: ( - "backbone.pos_net.{bid}.conv2", # wavtokenizer + MODEL_TENSOR.POSNET_CONV2: ( + "backbone.posnet.{bid}.conv2", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM: ( - "backbone.pos_net.{bid}.norm", # wavtokenizer + MODEL_TENSOR.POSNET_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM1: ( - "backbone.pos_net.{bid}.norm1", # wavtokenizer + MODEL_TENSOR.POSNET_NORM1: ( + "backbone.posnet.{bid}.norm1", # wavtokenizer ), - MODEL_TENSOR.POS_NET_NORM2: ( - "backbone.pos_net.{bid}.norm2", # wavtokenizer + MODEL_TENSOR.POSNET_NORM2: ( + "backbone.posnet.{bid}.norm2", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_NORM: ( - "backbone.pos_net.{bid}.norm", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_Q: ( - "backbone.pos_net.{bid}.q", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_Q: ( + "backbone.posnet.{bid}.q", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_K: ( - "backbone.pos_net.{bid}.k", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_K: ( + "backbone.posnet.{bid}.k", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_V: ( - "backbone.pos_net.{bid}.v", # wavtokenizer + 
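# [editor's note] A hedged usage sketch for the new sub-section setters added
# to GGUFWriter above, using the values from this patch's config.json (the
# writer instance is assumed to target the wavtokenizer-dec architecture):
#
#   writer.add_posnet_embedding_length(768)    # "%s.posnet.embedding_length"
#   writer.add_posnet_block_count(6)           # "%s.posnet.block_count"
#   writer.add_convnext_embedding_length(768)  # "%s.convnext.embedding_length"
#   writer.add_convnext_block_count(12)        # "%s.convnext.block_count"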
MODEL_TENSOR.POSNET_ATTN_V: ( + "backbone.posnet.{bid}.v", # wavtokenizer ), - MODEL_TENSOR.POS_NET_ATTN_OUT: ( - "backbone.pos_net.{bid}.proj_out", # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_OUT: ( + "backbone.posnet.{bid}.proj_out", # wavtokenizer ), } diff --git a/src/llama.cpp b/src/llama.cpp index 4c298d4062cc3..2d16ce72ba4b4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -277,8 +277,6 @@ enum llm_kv { LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, LLM_KV_FEATURES_LENGTH, - LLM_KV_POSNET_LENGTH, - LLM_KV_CONVNEXT_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, @@ -375,6 +373,12 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + LLM_KV_POSNET_EMBEDDING_LENGTH, + LLM_KV_POSNET_BLOCK_COUNT, + + LLM_KV_CONVNEXT_EMBEDDING_LENGTH, + LLM_KV_CONVNEXT_BLOCK_COUNT, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -399,8 +403,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, - { LLM_KV_POSNET_LENGTH, "%s.posnet_length" }, - { LLM_KV_CONVNEXT_LENGTH, "%s.convnext_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, @@ -464,6 +466,12 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, + { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" }, + + { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" }, + { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -623,11 +631,11 @@ enum llm_tensor { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, LLM_TENSOR_CONV1D, - LLM_TENSOR_CONV_NEXT_DW, - LLM_TENSOR_CONV_NEXT_NORM, - LLM_TENSOR_CONV_NEXT_PW1, - LLM_TENSOR_CONV_NEXT_PW2, - LLM_TENSOR_CONV_NEXT_GAMMA, + LLM_TENSOR_CONVNEXT_DW, + LLM_TENSOR_CONVNEXT_NORM, + LLM_TENSOR_CONVNEXT_PW1, + LLM_TENSOR_CONVNEXT_PW2, + LLM_TENSOR_CONVNEXT_GAMMA, LLM_TENSOR_POS_NET_CONV1, LLM_TENSOR_POS_NET_CONV2, LLM_TENSOR_POS_NET_NORM, @@ -1628,23 +1636,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_CONV1D, "conv1d" }, - { LLM_TENSOR_CONV_NEXT_DW, "conv_next.%d.dw" }, - { LLM_TENSOR_CONV_NEXT_NORM, "conv_next.%d.norm" }, - { LLM_TENSOR_CONV_NEXT_PW1, "conv_next.%d.pw1" }, - { LLM_TENSOR_CONV_NEXT_PW2, "conv_next.%d.pw2" }, - { LLM_TENSOR_CONV_NEXT_GAMMA, "conv_next.%d.gamma" }, + { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" }, + { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" }, + { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" }, + { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" }, + { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_POS_NET_CONV1, "pos_net.%d.conv1" }, - { LLM_TENSOR_POS_NET_CONV2, "pos_net.%d.conv2" }, - { LLM_TENSOR_POS_NET_NORM, "pos_net.%d.norm" }, - { LLM_TENSOR_POS_NET_NORM1, "pos_net.%d.norm1" }, - { LLM_TENSOR_POS_NET_NORM2, "pos_net.%d.norm2" }, - { LLM_TENSOR_POS_NET_ATTN_NORM, "pos_net.%d.attn_norm" }, - { LLM_TENSOR_POS_NET_ATTN_Q, "pos_net.%d.attn_q" }, - { LLM_TENSOR_POS_NET_ATTN_K, 
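// [editor's note] Note the asymmetry in the llama.cpp hunks below: the C++
// enum identifiers keep the old LLM_TENSOR_POS_NET_* spelling, while their
// serialized name strings move to "posnet.%d.*" in lockstep with the Python
// MODEL_TENSOR.POSNET_* rename, so both sides keep producing and consuming
// the same GGUF tensor names.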
"pos_net.%d.attn_k" }, - { LLM_TENSOR_POS_NET_ATTN_V, "pos_net.%d.attn_v" }, - { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.%d.attn_output" }, + { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, }, }, { @@ -2537,6 +2545,16 @@ static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; +struct llama_hparams_posnet { + uint32_t n_embd; + uint32_t n_layer; +}; + +struct llama_hparams_convnext { + uint32_t n_embd; + uint32_t n_layer; +}; + struct llama_hparams { bool vocab_only; bool rope_finetuned; @@ -2546,6 +2564,7 @@ struct llama_hparams { uint32_t n_vocab = 0; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; + uint32_t n_embd_features = 0; uint32_t n_layer; uint32_t n_rot; uint32_t n_swa = 0; // sliding window attention (SWA) @@ -2557,9 +2576,8 @@ struct llama_hparams { uint32_t n_rel_attn_bkts = 0; // for WavTokenizer - uint32_t n_embd_features = 0; - uint32_t n_embd_posnet = 0; - uint32_t n_embd_convnext = 0; + struct llama_hparams_posnet posnet; + struct llama_hparams_convnext convnext; std::array n_head_arr; std::array n_head_kv_arr; @@ -2623,66 +2641,6 @@ struct llama_hparams { enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; - bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_swa != other.n_swa) return true; - if (this->n_embd_head_k != other.n_embd_head_k) return true; - if (this->n_embd_head_v != other.n_embd_head_v) return true; - if (this->n_expert != other.n_expert) return true; - if (this->n_expert_used != other.n_expert_used) return true; - - if (this->n_head_arr != other.n_head_arr) return true; - if (this->n_head_kv_arr != other.n_head_kv_arr) return true; - if (this->n_ff_arr != other.n_ff_arr) return true; - - if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true; - if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; - if (this->n_lora_q != other.n_lora_q) return true; - if (this->n_lora_kv != other.n_lora_kv) return true; - if (this->n_ff_exp != other.n_ff_exp) return true; - if (this->n_ff_shexp != other.n_ff_shexp) return true; - if (this->n_expert_shared != other.n_expert_shared) return true; - - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true; - if (std::equal(std::begin(this->rope_sections), - std::end(this->rope_sections), - std::begin(other.rope_sections))) return true; - - if (this->ssm_d_conv != other.ssm_d_conv) return true; - if (this->ssm_d_inner != other.ssm_d_inner) return true; - if (this->ssm_d_state != other.ssm_d_state) return true; - if (this->ssm_dt_rank != other.ssm_dt_rank) return true; - 
if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; - - if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true; - if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true; - if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true; - if (this->wkv_head_size != other.wkv_head_size) return true; - - if (this->dec_start_token_id != other.dec_start_token_id) return true; - - const float EPSILON = 1e-9f; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; - if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; - if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; - if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; - if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; - - return false; - } - uint32_t n_head(uint32_t il = 0) const { if (il < n_layer) { return n_head_arr[il]; @@ -2735,21 +2693,21 @@ struct llama_hparams { if (wkv_head_size != 0) { // for RWKV models return 2 * n_embd; - } else { - // TODO: maybe support other convolution strides than 1 - // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; } + + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? 
ssm_d_conv - 1 : 0) * ssm_d_inner; } uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings if (wkv_head_size != 0) { // corresponds to RWKV's wkv_states size return n_embd * wkv_head_size; - } else { - // corresponds to Mamba's ssm_states size - return ssm_d_state * ssm_d_inner; } + + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; } }; @@ -2787,6 +2745,57 @@ struct llama_cparams { void * cb_eval_user_data; }; +struct llama_layer_posnet { + // resnet + struct ggml_tensor * norm1 = nullptr; + struct ggml_tensor * norm1_b = nullptr; + + struct ggml_tensor * conv1 = nullptr; + struct ggml_tensor * conv1_b = nullptr; + + struct ggml_tensor * norm2 = nullptr; + struct ggml_tensor * norm2_b = nullptr; + + struct ggml_tensor * conv2 = nullptr; + struct ggml_tensor * conv2_b = nullptr; + + // attention + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + + struct ggml_tensor * attn_q = nullptr; + struct ggml_tensor * attn_q_b = nullptr; + + struct ggml_tensor * attn_k = nullptr; + struct ggml_tensor * attn_k_b = nullptr; + + struct ggml_tensor * attn_v = nullptr; + struct ggml_tensor * attn_v_b = nullptr; + + struct ggml_tensor * attn_o = nullptr; + struct ggml_tensor * attn_o_b = nullptr; + + // normalize + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; +}; + +struct llama_layer_convnext { + struct ggml_tensor * dw; + struct ggml_tensor * dw_b; + + struct ggml_tensor * norm; + struct ggml_tensor * norm_b; + + struct ggml_tensor * pw1; + struct ggml_tensor * pw1_b; + + struct ggml_tensor * pw2; + struct ggml_tensor * pw2_b; + + struct ggml_tensor * gamma; +}; + // TODO: separate into "llama_layer_enc" and "llama_layer_dec" struct llama_layer { llama_layer() { @@ -2938,20 +2947,9 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale; struct ggml_tensor * ffn_down_scale; - // convnext - struct ggml_tensor * convnext_dw; - struct ggml_tensor * convnext_dw_b; - - struct ggml_tensor * convnext_norm; - struct ggml_tensor * convnext_norm_b; - - struct ggml_tensor * convnext_pw1; - struct ggml_tensor * convnext_pw1_b; + struct llama_layer_posnet posnet; - struct ggml_tensor * convnext_pw2; - struct ggml_tensor * convnext_pw2_b; - - struct ggml_tensor * convnext_gamma; + struct llama_layer_convnext convnext; }; // very similar to llama_batch, @@ -3082,85 +3080,9 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - // wavtokenizer decoder - // TODO: dedup struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d_b = nullptr; - struct ggml_tensor * hann_window = nullptr; - - // resnet 0 - struct ggml_tensor * posnet_0_norm1 = nullptr; - struct ggml_tensor * posnet_0_norm1_b = nullptr; - - struct ggml_tensor * posnet_0_conv1 = nullptr; - struct ggml_tensor * posnet_0_conv1_b = nullptr; - - struct ggml_tensor * posnet_0_norm2 = nullptr; - struct ggml_tensor * posnet_0_norm2_b = nullptr; - - struct ggml_tensor * posnet_0_conv2 = nullptr; - struct ggml_tensor * posnet_0_conv2_b = nullptr; - - // resnet 1 - struct ggml_tensor * posnet_1_norm1 = nullptr; - struct ggml_tensor * posnet_1_norm1_b = nullptr; - - struct ggml_tensor * posnet_1_conv1 = nullptr; - struct ggml_tensor * posnet_1_conv1_b = nullptr; - - struct ggml_tensor * posnet_1_norm2 = nullptr; - struct ggml_tensor * posnet_1_norm2_b = nullptr; - - struct ggml_tensor * posnet_1_conv2 = nullptr; - struct ggml_tensor * posnet_1_conv2_b = nullptr; - - // attn 2 - 
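// [editor's note] The llama_layer_posnet and llama_layer_convnext structs
// introduced above replace the long list of flat per-index members
// (posnet_0_norm1, posnet_1_conv2_b, ...) that llama_model carried for the
// wavtokenizer decoder. Tensors now live in model.layers[i].posnet and
// model.layers[i].convnext, which is what lets the tensor loading and the
// graph-build code later in this patch collapse into loops over the layer
// index.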
struct ggml_tensor * posnet_2_attn_norm = nullptr; - struct ggml_tensor * posnet_2_attn_norm_b = nullptr; - - struct ggml_tensor * posnet_2_attn_q = nullptr; - struct ggml_tensor * posnet_2_attn_q_b = nullptr; - - struct ggml_tensor * posnet_2_attn_k = nullptr; - struct ggml_tensor * posnet_2_attn_k_b = nullptr; - - struct ggml_tensor * posnet_2_attn_v = nullptr; - struct ggml_tensor * posnet_2_attn_v_b = nullptr; - - struct ggml_tensor * posnet_2_attn_o = nullptr; - struct ggml_tensor * posnet_2_attn_o_b = nullptr; - - // resnet 3 - struct ggml_tensor * posnet_3_norm1 = nullptr; - struct ggml_tensor * posnet_3_norm1_b = nullptr; - - struct ggml_tensor * posnet_3_conv1 = nullptr; - struct ggml_tensor * posnet_3_conv1_b = nullptr; - - struct ggml_tensor * posnet_3_norm2 = nullptr; - struct ggml_tensor * posnet_3_norm2_b = nullptr; - - struct ggml_tensor * posnet_3_conv2 = nullptr; - struct ggml_tensor * posnet_3_conv2_b = nullptr; - - // resnet 4 - struct ggml_tensor * posnet_4_norm1 = nullptr; - struct ggml_tensor * posnet_4_norm1_b = nullptr; - - struct ggml_tensor * posnet_4_conv1 = nullptr; - struct ggml_tensor * posnet_4_conv1_b = nullptr; - - struct ggml_tensor * posnet_4_norm2 = nullptr; - struct ggml_tensor * posnet_4_norm2_b = nullptr; - - struct ggml_tensor * posnet_4_conv2 = nullptr; - struct ggml_tensor * posnet_4_conv2_b = nullptr; - - // resnet 5 - struct ggml_tensor * posnet_5_norm = nullptr; - struct ggml_tensor * posnet_5_norm_b = nullptr; - std::vector layers; // gguf metadata @@ -5705,8 +5627,12 @@ static void llm_load_hparams( if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); - ml.get_key(LLM_KV_POSNET_LENGTH, hparams.n_embd_posnet); - ml.get_key(LLM_KV_CONVNEXT_LENGTH, hparams.n_embd_convnext); + + ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); + ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); + + ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); + ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); } GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); @@ -7493,11 +7419,11 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, - {LLM_TENSOR_CONV_NEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CONV_NEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -7738,7 +7664,8 @@ static bool llm_load_tensors( model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer; + bool use_mmap_buffer = true; // build a list of buffer types for the CPU and GPU 
devices @@ -9574,107 +9501,105 @@ static bool llm_load_tensors( } break; case LLM_ARCH_WAVTOKENIZER_DEC: { - const int64_t n_embd_features = hparams.n_embd_features; - const int64_t n_embd_posnet = hparams.n_embd_posnet; - const int64_t n_embd_convnext = hparams.n_embd_convnext; - - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd_features, n_vocab}, 0); - - model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd_posnet}, 0); - model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd_posnet}, 0); - - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd_features, n_embd_posnet}, 0); - model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, n_embd_posnet}, 0); - - model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 0), {1, n_embd_posnet}, 0); - model.posnet_0_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_0_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 0), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_0_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 0), {1, n_embd_posnet}, 0); - - model.posnet_1_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 1), {1, n_embd_posnet}, 0); - model.posnet_1_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_1_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 1), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_1_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 1), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 2), {1, n_embd_posnet}, 0); - model.posnet_2_attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", 2), {1, n_embd_posnet}, 0); - - model.posnet_2_attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", 2), {1, n_embd_posnet}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), 
{hparams.n_embd_features, n_vocab}, 0); - model.posnet_2_attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", 2), {1, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_2_attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", 2), {1, n_embd_posnet}, 0); + model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); - model.posnet_3_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 3), {1, n_embd_posnet}, 0); - model.posnet_3_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_3_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 3), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_3_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 3), {1, n_embd_posnet}, 0); - - model.posnet_4_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", 4), {1, n_embd_posnet}, 0); + // posnet + { + const int64_t n_embd = hparams.posnet.n_embd; + + for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { + auto & layer = model.layers[i].posnet; + + // posnet: + // + // - resnet + // - resnet + // - attn + // - resnet + // - resnet + // - norm + // + switch (i) { + case 0: + case 1: + case 3: + case 4: + { + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + } break; + case 2: + { + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, 
n_embd, n_embd}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + } break; + case 5: + { + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + } - model.posnet_4_conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", 4), {1, n_embd_posnet}, 0); + GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd); - model.posnet_4_norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", 4), {1, n_embd_posnet}, 0); - model.posnet_4_norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", 4), {1, n_embd_posnet}, 0); + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0); - model.posnet_4_conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", 4), {3, n_embd_posnet, n_embd_posnet}, 0); - model.posnet_4_conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", 4), {1, n_embd_posnet}, 0); + // convnext + { + const int64_t n_embd = hparams.convnext.n_embd; - model.posnet_5_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", 5), {1, n_embd_posnet}, 0); - model.posnet_5_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", 5), {1, n_embd_posnet}, 0); + for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { + auto & layer = model.layers[i].convnext; - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); - layer.convnext_dw = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "weight", i), {7, 1, n_embd_convnext}, 0); - layer.convnext_dw_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_DW, "bias", i), {1, n_embd_convnext}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); - layer.convnext_norm = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "weight", i), {n_embd_convnext}, 0); - layer.convnext_norm_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_NORM, "bias", i), {n_embd_convnext}, 0); + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); - layer.convnext_pw1 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "weight", i), {n_embd_convnext, n_ff}, 0); - layer.convnext_pw1_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW1, "bias", i), {n_ff}, 0); + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); - layer.convnext_pw2 = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "weight", i), {n_ff, n_embd_convnext}, 0); - layer.convnext_pw2_b = create_tensor(tn(LLM_TENSOR_CONV_NEXT_PW2, "bias", i), {n_embd_convnext}, 0); + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + } - layer.convnext_gamma = create_tensor(tn(LLM_TENSOR_CONV_NEXT_GAMMA, "weight", i), {n_embd_convnext}, 0); + // output + model.output_norm = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); } - // output - model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_convnext}, 0); - model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_convnext}, 0); - - model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd_convnext, n_embd}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); } break; default: @@ -17266,156 +17191,82 @@ struct llm_build_context { cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); cur = ggml_add(ctx0, cur, model.conv_1d_b); - inpL = cur; - - // resnet block 0 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_0_norm1, - model.posnet_0_norm1_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_0_conv1_b); - - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_0_norm2, - model.posnet_0_norm2_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_0_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_0_conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } - - inpL = cur; - - // resnet block 1 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_1_norm1, - model.posnet_1_norm1_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_1_conv1_b); - - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_1_norm2, - model.posnet_1_norm2_b, - LLM_NORM_GROUP, cb, 0); - - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_1_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_1_conv2_b); - - cur = ggml_add(ctx0, cur, inpL); - } + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; - inpL = cur; - - // attention block - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_2_attn_norm, - model.posnet_2_attn_norm_b, - LLM_NORM_GROUP, cb, 0); - - struct ggml_tensor * q; - struct ggml_tensor * k; - struct ggml_tensor * v; - - q = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_q, cur, 1, 1); - k = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_k, cur, 1, 1); - v = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_v, cur, 1, 1); - - q = ggml_add(ctx0, q, model.posnet_2_attn_q_b); - k = ggml_add(ctx0, k, model.posnet_2_attn_k_b); - v = ggml_add(ctx0, v, model.posnet_2_attn_v_b); - - q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); - k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - - struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - - kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(model.hparams.n_embd_posnet)), 0.0f); - - cur = ggml_mul_mat(ctx0, kq, v); - - cur = ggml_conv_1d_ph(ctx0, model.posnet_2_attn_o, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_2_attn_o_b); - - cur = ggml_add(ctx0, cur, inpL); - } - - inpL = cur; + inpL = cur; - // resnet block 3 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_3_norm1, - model.posnet_3_norm1_b, - LLM_NORM_GROUP, cb, 0); + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + 
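// [editor's note] The switch below mirrors the fixed posnet schedule that the
// tensor-creation code above documents (layers 0, 1, 3, 4: resnet; layer 2:
// attention; layer 5: final group norm). Each resnet case is: group norm ->
// SiLU -> conv1 -> group norm -> SiLU -> conv2 -> residual add with the
// block input held in inpL.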
cur = llm_build_norm(ctx0, cur, hparams, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, cb, 0); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_3_conv1_b); + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_3_norm2, - model.posnet_3_norm2_b, - LLM_NORM_GROUP, cb, 0); + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, cb, 0); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); - cur = ggml_conv_1d_ph(ctx0, model.posnet_3_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_3_conv2_b); + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); - cur = ggml_add(ctx0, cur, inpL); - } + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, cb, 0); - inpL = cur; + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; - // resnet block 4 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_4_norm1, - model.posnet_4_norm1_b, - LLM_NORM_GROUP, cb, 0); + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); - cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv1, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_4_conv1_b); + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_4_norm2, - model.posnet_4_norm2_b, - LLM_NORM_GROUP, cb, 0); + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); - cur = ggml_conv_1d_ph(ctx0, model.posnet_4_conv2, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.posnet_4_conv2_b); + cur = ggml_mul_mat(ctx0, kq, v); - cur = ggml_add(ctx0, cur, inpL); - } + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); - // normalize block 5 - { - cur = llm_build_norm(ctx0, cur, hparams, - model.posnet_5_norm, - model.posnet_5_norm_b, - LLM_NORM_GROUP, cb, 0); + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, cb, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; } cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); @@ -17429,27 +17280,30 @@ struct llm_build_context { inpL = cur; - for (int il = 0; il < n_layer; ++il) { + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + cur = inpL; - cur = ggml_conv_1d_dw_ph(ctx0, model.layers[il].convnext_dw, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.layers[il].convnext_dw_b); + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cur = llm_build_norm(ctx0, cur, 
hparams, - model.layers[il].convnext_norm, - model.layers[il].convnext_norm_b, + layer.norm, + layer.norm_b, LLM_NORM, cb, -1); cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].convnext_pw1, model.layers[il].convnext_pw1_b, NULL, - NULL, NULL, NULL, - model.layers[il].convnext_pw2, model.layers[il].convnext_pw2_b, NULL, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); - cur = ggml_mul(ctx0, cur, model.layers[il].convnext_gamma); + cur = ggml_mul(ctx0, cur, layer.gamma); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); From 35259e53353fc2b59384ae51342c96e07cb81192 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 19:33:35 +0200 Subject: [PATCH 36/45] cont ggml-ci --- examples/tts/convert_pt_to_hf.py | 4 - src/llama.cpp | 267 +++++++++++++++---------------- 2 files changed, 131 insertions(+), 140 deletions(-) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py index adba21a3661a8..8909a65fd1e13 100644 --- a/examples/tts/convert_pt_to_hf.py +++ b/examples/tts/convert_pt_to_hf.py @@ -172,10 +172,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'): "n_embd": 768, "n_layer": 12 }, - #"n_embd_posnet": 768, - #"n_embd_convnext": 768, - #"n_layer_posnet": 6, - #"n_layer_convnext": 12 } with open(path_dst + '/config.json', 'w') as f: diff --git a/src/llama.cpp b/src/llama.cpp index 2d16ce72ba4b4..019def4a8ac3e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2747,191 +2747,185 @@ struct llama_cparams { struct llama_layer_posnet { // resnet - struct ggml_tensor * norm1 = nullptr; + struct ggml_tensor * norm1 = nullptr; struct ggml_tensor * norm1_b = nullptr; - struct ggml_tensor * conv1 = nullptr; + struct ggml_tensor * conv1 = nullptr; struct ggml_tensor * conv1_b = nullptr; - struct ggml_tensor * norm2 = nullptr; + struct ggml_tensor * norm2 = nullptr; struct ggml_tensor * norm2_b = nullptr; - struct ggml_tensor * conv2 = nullptr; + struct ggml_tensor * conv2 = nullptr; struct ggml_tensor * conv2_b = nullptr; // attention - struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm = nullptr; struct ggml_tensor * attn_norm_b = nullptr; - struct ggml_tensor * attn_q = nullptr; + struct ggml_tensor * attn_q = nullptr; struct ggml_tensor * attn_q_b = nullptr; - struct ggml_tensor * attn_k = nullptr; + struct ggml_tensor * attn_k = nullptr; struct ggml_tensor * attn_k_b = nullptr; - struct ggml_tensor * attn_v = nullptr; + struct ggml_tensor * attn_v = nullptr; struct ggml_tensor * attn_v_b = nullptr; - struct ggml_tensor * attn_o = nullptr; + struct ggml_tensor * attn_o = nullptr; struct ggml_tensor * attn_o_b = nullptr; // normalize - struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm = nullptr; struct ggml_tensor * norm_b = nullptr; }; struct llama_layer_convnext { - struct ggml_tensor * dw; - struct ggml_tensor * dw_b; + struct ggml_tensor * dw = nullptr; + struct ggml_tensor * dw_b = nullptr; - struct ggml_tensor * norm; - struct ggml_tensor * norm_b; + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; - struct ggml_tensor * pw1; - struct ggml_tensor * pw1_b; + struct ggml_tensor * pw1 = nullptr; + struct ggml_tensor * pw1_b = nullptr; - struct ggml_tensor * pw2; - struct ggml_tensor * pw2_b; + struct ggml_tensor * pw2 = nullptr; + struct ggml_tensor * pw2_b = nullptr; - struct ggml_tensor * gamma; + struct ggml_tensor * gamma = nullptr; }; -// TODO: separate into "llama_layer_enc" and 
"llama_layer_dec" struct llama_layer { - llama_layer() { - // initialize all pointers to NULL - std::memset(this, 0, sizeof(*this)); - } - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - struct ggml_tensor * attn_out_norm; - struct ggml_tensor * attn_out_norm_b; - struct ggml_tensor * attn_q_a_norm; - struct ggml_tensor * attn_kv_a_norm; - struct ggml_tensor * attn_sub_norm; - struct ggml_tensor * attn_post_norm; - struct ggml_tensor * ffn_sub_norm; - struct ggml_tensor * attn_norm_cross; - struct ggml_tensor * attn_norm_enc; + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + struct ggml_tensor * attn_norm_2 = nullptr; + struct ggml_tensor * attn_norm_2_b = nullptr; + struct ggml_tensor * attn_q_norm = nullptr; + struct ggml_tensor * attn_q_norm_b = nullptr; + struct ggml_tensor * attn_k_norm = nullptr; + struct ggml_tensor * attn_k_norm_b = nullptr; + struct ggml_tensor * attn_out_norm = nullptr; + struct ggml_tensor * attn_out_norm_b = nullptr; + struct ggml_tensor * attn_q_a_norm = nullptr; + struct ggml_tensor * attn_kv_a_norm = nullptr; + struct ggml_tensor * attn_sub_norm = nullptr; + struct ggml_tensor * attn_post_norm = nullptr; + struct ggml_tensor * ffn_sub_norm = nullptr; + struct ggml_tensor * attn_norm_cross = nullptr; + struct ggml_tensor * attn_norm_enc = nullptr; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wkv_a_mqa; - struct ggml_tensor * wkv_b; - struct ggml_tensor * wq_cross; - struct ggml_tensor * wk_cross; - struct ggml_tensor * wv_cross; - struct ggml_tensor * wo_cross; - struct ggml_tensor * wq_enc; - struct ggml_tensor * wk_enc; - struct ggml_tensor * wv_enc; - struct ggml_tensor * wo_enc; + struct ggml_tensor * wq = nullptr; + struct ggml_tensor * wk = nullptr; + struct ggml_tensor * wv = nullptr; + struct ggml_tensor * wo = nullptr; + struct ggml_tensor * wqkv = nullptr; + struct ggml_tensor * wq_a = nullptr; + struct ggml_tensor * wq_b = nullptr; + struct ggml_tensor * wkv_a_mqa = nullptr; + struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wq_cross = nullptr; + struct ggml_tensor * wk_cross = nullptr; + struct ggml_tensor * wv_cross = nullptr; + struct ggml_tensor * wo_cross = nullptr; + struct ggml_tensor * wq_enc = nullptr; + struct ggml_tensor * wk_enc = nullptr; + struct ggml_tensor * wv_enc = nullptr; + struct ggml_tensor * wo_enc = nullptr; // attention bias - struct ggml_tensor * bq; - struct ggml_tensor * bk; - struct ggml_tensor * bv; - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; + struct ggml_tensor * bq = nullptr; + struct ggml_tensor * bk = nullptr; + struct ggml_tensor * bv = nullptr; + struct ggml_tensor * bo = nullptr; + struct ggml_tensor * bqkv = nullptr; // relative position bias - struct ggml_tensor * attn_rel_b; - struct ggml_tensor * attn_rel_b_enc; - struct ggml_tensor * attn_rel_b_cross; + struct ggml_tensor * attn_rel_b = nullptr; + struct ggml_tensor * attn_rel_b_enc = nullptr; + struct ggml_tensor * attn_rel_b_cross = nullptr; // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - struct ggml_tensor * 
ffn_post_norm; - struct ggml_tensor * layer_out_norm; - struct ggml_tensor * layer_out_norm_b; - struct ggml_tensor * ffn_norm_exps; - struct ggml_tensor * ffn_norm_enc; + struct ggml_tensor * ffn_norm = nullptr; + struct ggml_tensor * ffn_norm_b = nullptr; + struct ggml_tensor * ffn_post_norm = nullptr; + struct ggml_tensor * layer_out_norm = nullptr; + struct ggml_tensor * layer_out_norm_b = nullptr; + struct ggml_tensor * ffn_norm_exps = nullptr; + struct ggml_tensor * ffn_norm_enc = nullptr; // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - struct ggml_tensor * ffn_gate_enc; - struct ggml_tensor * ffn_down_enc; - struct ggml_tensor * ffn_up_enc; + struct ggml_tensor * ffn_gate = nullptr; // w1 + struct ggml_tensor * ffn_down = nullptr; // w2 + struct ggml_tensor * ffn_up = nullptr; // w3 + struct ggml_tensor * ffn_gate_enc = nullptr; + struct ggml_tensor * ffn_down_enc = nullptr; + struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp; - struct ggml_tensor * ffn_gate_exps; - struct ggml_tensor * ffn_down_exps; - struct ggml_tensor * ffn_up_exps ; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; // ff shared expert (shexp) - struct ggml_tensor * ffn_gate_inp_shexp; - struct ggml_tensor * ffn_gate_shexp; - struct ggml_tensor * ffn_down_shexp; - struct ggml_tensor * ffn_up_shexp; + struct ggml_tensor * ffn_gate_inp_shexp = nullptr; + struct ggml_tensor * ffn_gate_shexp = nullptr; + struct ggml_tensor * ffn_down_shexp = nullptr; + struct ggml_tensor * ffn_up_shexp = nullptr; // ff bias - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 - struct ggml_tensor * ffn_act; + struct ggml_tensor * ffn_gate_b = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; // b2 + struct ggml_tensor * ffn_up_b = nullptr; // b3 + struct ggml_tensor * ffn_act = nullptr; // mamba proj - struct ggml_tensor * ssm_in; - struct ggml_tensor * ssm_x; - struct ggml_tensor * ssm_dt; - struct ggml_tensor * ssm_out; + struct ggml_tensor * ssm_in = nullptr; + struct ggml_tensor * ssm_x = nullptr; + struct ggml_tensor * ssm_dt = nullptr; + struct ggml_tensor * ssm_out = nullptr; // mamba - struct ggml_tensor * ssm_conv1d; - struct ggml_tensor * ssm_a; - struct ggml_tensor * ssm_d; + struct ggml_tensor * ssm_conv1d = nullptr; + struct ggml_tensor * ssm_a = nullptr; + struct ggml_tensor * ssm_d = nullptr; // mamba bias - struct ggml_tensor * ssm_conv1d_b; - struct ggml_tensor * ssm_dt_b; + struct ggml_tensor * ssm_conv1d_b = nullptr; + struct ggml_tensor * ssm_dt_b = nullptr; // rwkv - struct ggml_tensor * time_mix_w1; - struct ggml_tensor * time_mix_w2; - struct ggml_tensor * time_mix_lerp_x; - struct ggml_tensor * time_mix_lerp_w; - struct ggml_tensor * time_mix_lerp_k; - struct ggml_tensor * time_mix_lerp_v; - struct ggml_tensor * time_mix_lerp_r; - struct ggml_tensor * time_mix_lerp_g; - - struct ggml_tensor * time_mix_first; - struct ggml_tensor * time_mix_decay; - struct ggml_tensor * time_mix_decay_w1; - struct ggml_tensor * time_mix_decay_w2; - struct ggml_tensor * time_mix_key; - struct ggml_tensor * time_mix_value; - struct ggml_tensor * time_mix_receptance; - struct ggml_tensor * time_mix_gate; - - struct ggml_tensor * time_mix_ln; - struct ggml_tensor * time_mix_ln_b; - struct ggml_tensor * 
time_mix_output; - - struct ggml_tensor * channel_mix_lerp_k; - struct ggml_tensor * channel_mix_lerp_r; - - struct ggml_tensor * channel_mix_key; - struct ggml_tensor * channel_mix_receptance; - struct ggml_tensor * channel_mix_value; + struct ggml_tensor * time_mix_w1 = nullptr; + struct ggml_tensor * time_mix_w2 = nullptr; + struct ggml_tensor * time_mix_lerp_x = nullptr; + struct ggml_tensor * time_mix_lerp_w = nullptr; + struct ggml_tensor * time_mix_lerp_k = nullptr; + struct ggml_tensor * time_mix_lerp_v = nullptr; + struct ggml_tensor * time_mix_lerp_r = nullptr; + struct ggml_tensor * time_mix_lerp_g = nullptr; + + struct ggml_tensor * time_mix_first = nullptr; + struct ggml_tensor * time_mix_decay = nullptr; + struct ggml_tensor * time_mix_decay_w1 = nullptr; + struct ggml_tensor * time_mix_decay_w2 = nullptr; + struct ggml_tensor * time_mix_key = nullptr; + struct ggml_tensor * time_mix_value = nullptr; + struct ggml_tensor * time_mix_receptance = nullptr; + struct ggml_tensor * time_mix_gate = nullptr; + + struct ggml_tensor * time_mix_ln = nullptr; + struct ggml_tensor * time_mix_ln_b = nullptr; + struct ggml_tensor * time_mix_output = nullptr; + + struct ggml_tensor * channel_mix_lerp_k = nullptr; + struct ggml_tensor * channel_mix_lerp_r = nullptr; + + struct ggml_tensor * channel_mix_key = nullptr; + struct ggml_tensor * channel_mix_receptance = nullptr; + struct ggml_tensor * channel_mix_value = nullptr; // long rope factors struct ggml_tensor * rope_long = nullptr; @@ -2939,13 +2933,13 @@ struct llama_layer { struct ggml_tensor * rope_freqs = nullptr; // bitnet scale - struct ggml_tensor * wq_scale; - struct ggml_tensor * wk_scale; - struct ggml_tensor * wv_scale; - struct ggml_tensor * wo_scale; - struct ggml_tensor * ffn_gate_scale; - struct ggml_tensor * ffn_up_scale; - struct ggml_tensor * ffn_down_scale; + struct ggml_tensor * wq_scale = nullptr; + struct ggml_tensor * wk_scale = nullptr; + struct ggml_tensor * wv_scale = nullptr; + struct ggml_tensor * wo_scale = nullptr; + struct ggml_tensor * ffn_gate_scale = nullptr; + struct ggml_tensor * ffn_up_scale = nullptr; + struct ggml_tensor * ffn_down_scale = nullptr; struct llama_layer_posnet posnet; @@ -3167,6 +3161,7 @@ struct llama_sbatch { // batch indices of the output std::vector out_ids; std::vector seq; + const llama_batch * batch = nullptr; // buffers for the ubatch From 2033fb7eefa6919404642900d8e4f10a7b235aef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 20:39:46 +0200 Subject: [PATCH 37/45] cont [no ci] --- src/llama.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 019def4a8ac3e..8f16d2d4fc853 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -77,7 +77,6 @@ #endif // bump if necessary -#define LLAMA_MAX_EMBD 8 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -3074,8 +3073,8 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; - struct ggml_tensor * conv_1d = nullptr; - struct ggml_tensor * conv_1d_b = nullptr; + struct ggml_tensor * conv1d = nullptr; + struct ggml_tensor * conv1d_b = nullptr; std::vector layers; @@ -9498,8 +9497,8 @@ static bool llm_load_tensors( { model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0); - model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); - model.conv_1d_b = 
create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); + model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); // posnet { @@ -17183,8 +17182,8 @@ struct llm_build_context { cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); - cur = ggml_conv_1d_ph(ctx0, model.conv_1d, cur, 1, 1); - cur = ggml_add(ctx0, cur, model.conv_1d_b); + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); // posnet for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { From 824fa750d4b441c4266a0581efbc32194f1bfe42 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 10:25:17 +0200 Subject: [PATCH 38/45] llama : update WavTokenizer to non-causal attn --- convert_hf_to_gguf.py | 2 ++ src/llama.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7bf67a268183c..4a0b00f69c699 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2069,6 +2069,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + self.gguf_writer.add_causal_attention(False) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/src/llama.cpp b/src/llama.cpp index 8f16d2d4fc853..94160d534435f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6393,6 +6393,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); } break; default: (void)0; } From d291c742534fdb6f4626283b9b4c2ad105d5a803 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Dec 2024 21:45:25 +0200 Subject: [PATCH 39/45] llama : handle no-vocab detokenization --- src/llama-vocab.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index e38e598532345..7f2725f94be13 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1867,6 +1867,10 @@ int32_t llama_detokenize_impl( int32_t text_len_max, bool remove_special, bool unparse_special) { + if (vocab.type == LLAMA_VOCAB_TYPE_NONE) { + return 0; + } + GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. 
Call llama_vocab::init_tokenizer() first."); int32_t avail = text_len_max; From 5038abe1ee2be71473975d70cd317b33707b2a59 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 10:27:52 +0200 Subject: [PATCH 40/45] tts : add Python example for OuteTTS (wip) --- examples/tts/tts-outetts.py | 168 ++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 examples/tts/tts-outetts.py diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py new file mode 100644 index 0000000000000..86a7806573a96 --- /dev/null +++ b/examples/tts/tts-outetts.py @@ -0,0 +1,168 @@ +import sys +import json +import struct +import requests +import re + +def process_text(text: str): + text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed + text = re.sub(r'[-_/,\.\\]', ' ', text) + text = re.sub(r'[^a-z\s]', '', text) + text = re.sub(r'\s+', ' ', text).strip() + return text.split() + +# usage: +# python tts-outetts.py http://server-llm:port http://server-dec:port "text" + +if len(sys.argv) <= 3: + print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"") + exit(1) + +host_llm = sys.argv[1] +host_dec = sys.argv[2] +text = sys.argv[3] + +prefix = """<|im_start|> +<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>""" + +words = process_text(text) +words = "<|text_sep|>".join([i.strip() for i in words]) +words += "<|text_end|>\n" + +# voice data +# TODO: load from json +#suffix = """<|audio_start|> +#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> 
+#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> 
+#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>""" + +# TODO: tokenization is slow for some reason - here is pre-tokenized input +suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455, + 155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413, + 152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400, + 152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163, + 153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321, + 153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751, + 152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791, + 151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325, + 153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198, + 19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810, + 152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946, + 151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317, + 152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325, + 151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680, + 152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144, + 153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720, + 153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958, + 152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 
152162, 152186, 153071, + 152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380, + 153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808, + 151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034, + 152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718, + 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198, + 19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733, + 151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333, + 151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801, + 152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992, + 153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428, + 153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122, + 152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594, + 153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673, + 151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779, + 151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241, + 152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434, + 153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223, + 152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748, + 152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988, + 152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390, + 152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558, + 152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341, + 153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295, + 152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385, + 152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593, + 152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938, + 152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941, + 152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583, + 152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442, + 152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 
151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198, + 275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799, + 151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257, + 152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669, + 152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467, + 152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759, + 153318, 153165, 153349, 151670, ] + +response = requests.post( + host_llm + "/completion", + json={ + "prompt": [prefix + words, *suffix], + "n_predict": 1024, + "cache_prompt": True, + "samplers": ["top_k"], + "top_k": 16, + "seed": 1003, + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) +#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n")) +#print(json.dumps(response_json["timings"], indent=4)) +print(json.dumps(response_json["tokens"], indent=4)) + +codes = response_json["tokens"] + +codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772] + +response = requests.post( + host_dec + "/embeddings", + json={ + "input": [*codes], + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) + +embd = response_json["data"][0]["embedding"] + +print(len(embd)) + + + From edb7896b495fbeddebe5d33601b6a844c4833acb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 17 Dec 2024 14:21:06 +0200 Subject: [PATCH 41/45] tts : extend python example to generate spectrogram ggml-ci --- examples/tts/tts-outetts.py | 12 ++- examples/tts/tts.cpp | 188 +++++++++++++++++++++--------------- 2 files changed, 118 insertions(+), 82 deletions(-) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index 86a7806573a96..3c085dd5437de 100644 --- a/examples/tts/tts-outetts.py +++ b/examples/tts/tts-outetts.py @@ -143,7 +143,7 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) #print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n")) #print(json.dumps(response_json["timings"], indent=4)) -print(json.dumps(response_json["tokens"], indent=4)) +#print(json.dumps(response_json["tokens"], indent=4)) codes = response_json["tokens"] @@ -160,9 +160,15 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) +# spectrogram embd = response_json["data"][0]["embedding"] -print(len(embd)) - +n_codes = len(embd) +n_embd = len(embd[0]) +print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) +# post-process the spectrogram to convert to audio +# TODO: see the tts.cpp:embd_to_audio() and implement it in Python +print('converting to audio ...') +print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index aa5508be382d4..9c3d58f891703 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -63,7 +63,47 @@ static void print_usage(int, char ** argv) { LOG("\n"); } -static void fill_hann_window(int length, bool periodic, double * output) { +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + 
uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); + return; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + file.close(); +} + +static void fill_hann_window(int length, bool periodic, float * output) { int offset = -1; if (periodic) { offset = 0; @@ -74,31 +114,31 @@ static void fill_hann_window(int length, bool periodic, double * output) { } // very poor-man fft -static void twiddle(double * real, double * imag, int k, int N) { - double angle = 2 * M_PI * k / N; +static void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; *real = cos(angle); *imag = sin(angle); } -static void irfft(int n, const double * inp_cplx, double * out_real) { +static void irfft(int n, const float * inp_cplx, float * out_real) { int N = n / 2 + 1; - std::vector real_input(N); - std::vector imag_input(N); + std::vector real_input(N); + std::vector imag_input(N); for (int i = 0; i < N; ++i) { real_input[i] = inp_cplx[2 * i]; imag_input[i] = inp_cplx[2 * i + 1]; } - std::vector real_output(n); - std::vector imag_output(n); + std::vector real_output(n); + std::vector imag_output(n); for (int k = 0; k < n; ++k) { real_output[k] = 0.0f; imag_output[k] = 0.0f; for (int m = 0; m < N; ++m) { - double twiddle_real; - double twiddle_imag; + float twiddle_real; + float twiddle_imag; twiddle(&twiddle_real, &twiddle_imag, k * m, n); @@ -123,7 +163,7 @@ static void irfft(int n, const double * inp_cplx, double * out_real) { // hop_length = 320 // pad = 480 // -static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { int64_t output_height = n_out; int64_t kernel_w = n_win; int64_t stride_w = n_hop; @@ -147,103 +187,63 @@ static void fold(const std::vector & data, int64_t n_out, int64_t n_win, output.resize(n_out - 2 * n_pad); } -struct wav_header { - char riff[4] = {'R', 'I', 'F', 'F'}; - uint32_t chunk_size; - char wave[4] = {'W', 'A', 'V', 'E'}; - char fmt[4] = {'f', 'm', 't', ' '}; - uint32_t fmt_chunk_size = 16; - uint16_t audio_format = 1; // PCM - uint16_t num_channels = 1; // Mono - uint32_t sample_rate; - uint32_t byte_rate; - uint16_t block_align; - uint16_t bits_per_sample = 16; - char data[4] = {'d', 'a', 't', 'a'}; - uint32_t data_size; -}; - -static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { - std::ofstream file(fname, std::ios::binary); - if (!file) { 
- LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); - return; - } - - wav_header header; - header.sample_rate = sample_rate; - header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); - header.block_align = header.num_channels * (header.bits_per_sample / 8); - header.data_size = data.size() * (header.bits_per_sample / 8); - header.chunk_size = 36 + header.data_size; - - file.write(reinterpret_cast(&header), sizeof(header)); - - for (const auto & sample : data) { - int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); - file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); - } - - file.close(); -} - -static std::vector embd_to_audio( +// TODO: not optimized at all +static std::vector embd_to_audio( const float * embd, - const std::vector & codes, + const int n_codes, const int n_embd, const int n_thread) { - const int n = codes.size(); const int n_fft = 1280; const int n_hop = 320; const int n_win = 1280; const int n_pad = (n_win - n_hop)/2; - const int n_out = (n - 1)*n_hop + n_win; + const int n_out = (n_codes - 1)*n_hop + n_win; - std::vector hann(n_fft); + std::vector hann(n_fft); fill_hann_window(hann.size(), true, hann.data()); - int n_spec = n_embd*n; + int n_spec = n_embd*n_codes; - std::vector E (n_spec); - std::vector S (n_spec); - std::vector ST(n_spec); + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); - for (int l = 0; l < n; ++l) { + for (int l = 0; l < n_codes; ++l) { for (int k = 0; k < n_embd; ++k) { - E[k*n + l] = embd[l*n_embd + k]; + E[k*n_codes + l] = embd[l*n_embd + k]; } } for (int k = 0; k < n_embd/2; ++k) { - for (int l = 0; l < n; ++l) { - double mag = E[(k )*n + l]; - double phi = E[(k + n_embd/2)*n + l]; + for (int l = 0; l < n_codes; ++l) { + float mag = E[(k )*n_codes + l]; + float phi = E[(k + n_embd/2)*n_codes + l]; mag = exp(mag); if (mag > 1e2) { mag = 1e2; } - S[2*(k*n + l) + 0] = mag*cosf(phi); - S[2*(k*n + l) + 1] = mag*sinf(phi); + S[2*(k*n_codes + l) + 0] = mag*cosf(phi); + S[2*(k*n_codes + l) + 1] = mag*sinf(phi); } } - for (int l = 0; l < n; ++l) { + for (int l = 0; l < n_codes; ++l) { for (int k = 0; k < n_embd/2; ++k) { - ST[l*n_embd + 2*k + 0] = S[2*(k*n + l) + 0]; - ST[l*n_embd + 2*k + 1] = S[2*(k*n + l) + 1]; + ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1]; } } - std::vector res (n*n_fft); - std::vector hann2(n*n_fft); + std::vector res (n_codes*n_fft); + std::vector hann2(n_codes*n_fft); std::vector workers(n_thread); for (int i = 0; i < n_thread; ++i) { workers[i] = std::thread([&, i]() { - for (int l = i; l < n; l += n_thread) { + for (int l = i; l < n_codes; l += n_thread) { irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); for (int j = 0; j < n_fft; ++j) { res [l*n_fft + j] *= hann[j]; @@ -256,8 +256,8 @@ static std::vector embd_to_audio( workers[i].join(); } - std::vector audio; - std::vector env; + std::vector audio; + std::vector env; fold(res, n_out, n_win, n_hop, n_pad, audio); fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once @@ -844,12 +844,14 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 const auto t_voc_start = ggml_time_us(); - llama_batch batch = llama_batch_init(codes.size(), 0, 1); + const int n_codes = codes.size(); + + llama_batch batch = llama_batch_init(n_codes, 0, 1); for (size_t i = 0; i < codes.size(); ++i) { common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all 
logits? } - GGML_ASSERT(batch.n_tokens == (int) codes.size()); + GGML_ASSERT(batch.n_tokens == n_codes); if (llama_decode(ctx_cts, batch) != 0) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -862,12 +864,40 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 const auto t_spec_start = ggml_time_us(); +#if 1 // spectral operations - // TODO: not optimized at all const int n_embd = llama_n_embd(model_cts); const float * embd = llama_get_embeddings(ctx_cts); - auto audio = embd_to_audio(embd, codes, n_embd, params.cpuparams.n_threads); + auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads); + +#else + // read the spectrogram from a file for debugging purposes + std::vector audio; + { + std::ifstream fin("out.bin", std::ios::binary); + if (!fin) { + LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin"); + return 1; + } + + std::vector embd; + + int n_codes; + int n_embd; + + fin.read(reinterpret_cast(&n_codes), sizeof(int)); + fin.read(reinterpret_cast(&n_embd), sizeof(int)); + + embd.resize(n_codes * n_embd); + fin.read(reinterpret_cast(embd.data()), n_codes * n_embd * sizeof(float)); + fin.close(); + + LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd); + + audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads); + } +#endif const std::string fname = "output.wav"; From 2a1a6f6326403032602897d9f48552f182c08be6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 14:10:59 +0200 Subject: [PATCH 42/45] server : fix rebase artifacts --- examples/server/server.cpp | 4 ++-- examples/server/tests/unit/test_embedding.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 57db582d72754..5ed4e8d274428 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -687,6 +687,8 @@ struct server_task_result_cmpl_partial : server_task_result { json second_ret = json{ {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, + {"delta", json { + {"content", content}}} }})}, {"created", t}, {"id", oaicompat_cmpl_id}, @@ -702,7 +704,6 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json { {"content", content}, - {"tokens", tokens} }}, }}); } @@ -1016,7 +1017,6 @@ struct server_slot { n_prompt_tokens = 0; last_nl_pos = 0; generated_text = ""; - generated_tokens = {}; has_new_line = false; truncated = false; stop = STOP_TYPE_NONE; diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index af6d148537452..e32d745829605 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -92,10 +92,6 @@ def test_embedding_pooling_none(): for x in res.body[0]['embedding']: assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON - # make sure embedding vector is not normalized - for x in res.body[0]['embedding']: - assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON - def test_embedding_pooling_none_oai(): global server From 29df666d44efd6a53f15b3a7c4875b5d9483a9b2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 14:13:09 +0200 Subject: [PATCH 43/45] tts : enable "return_tokens" in Python example ggml-ci --- examples/tts/tts-outetts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index 3c085dd5437de..bebd38ef4a5fe 100644 --- a/examples/tts/tts-outetts.py +++ 
b/examples/tts/tts-outetts.py @@ -132,6 +132,7 @@ def process_text(text: str): "prompt": [prefix + words, *suffix], "n_predict": 1024, "cache_prompt": True, + "return_tokens": True, "samplers": ["top_k"], "top_k": 16, "seed": 1003, From a95191c468f5fa81ca4ed080e2dd63bf6be0f744 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 17:50:57 +0200 Subject: [PATCH 44/45] tts : minor fixes --- examples/tts/tts-outetts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py index bebd38ef4a5fe..0f81192fca52d 100644 --- a/examples/tts/tts-outetts.py +++ b/examples/tts/tts-outetts.py @@ -1,6 +1,6 @@ import sys -import json -import struct +#import json +#import struct import requests import re @@ -162,7 +162,7 @@ def process_text(text: str): #print(json.dumps(response_json, indent=4)) # spectrogram -embd = response_json["data"][0]["embedding"] +embd = response_json[0]["embedding"] n_codes = len(embd) n_embd = len(embd[0]) From c0df192838f51507e06b7293030b43232cd2670f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 18 Dec 2024 19:22:56 +0200 Subject: [PATCH 45/45] common : support HF download for vocoder --- common/arg.cpp | 50 +++++++++++++++++++++++++++++++------------- common/common.cpp | 7 ++++--- common/common.h | 6 +++++- examples/tts/tts.cpp | 7 ++++++- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 93c15ecdcb4d2..e5ddd8318f787 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -119,29 +119,33 @@ std::string common_arg::to_string() { // utils // -static void common_params_handle_model_default(common_params & params) { - if (!params.hf_repo.empty()) { +static void common_params_handle_model_default( + std::string & model, + std::string & model_url, + std::string & hf_repo, + std::string & hf_file) { + if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { + if (hf_file.empty()) { + if (model.empty()) { throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - params.hf_file = params.model; - } else if (params.model.empty()) { + hf_file = model; + } else if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = params.hf_repo + "_" + params.hf_file; + std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename string_replace_all(filename, "/", "_"); - params.model = fs_get_cache_file(filename); + model = fs_get_cache_file(filename); } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); + } else if (!model_url.empty()) { + if (model.empty()) { + auto f = string_split(model_url, '#').front(); f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); + model = fs_get_cache_file(string_split(f, '/').back()); } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; + } else if (model.empty()) { + model = DEFAULT_MODEL_PATH; } } @@ -276,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model_default(params); + // TODO: refactor model params in a common struct + 
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); if (params.escape) { string_process_escapes(params.prompt); @@ -1581,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(common_arg( + {"-hfrv", "--hf-repo-v"}, "REPO", + "Hugging Face model repository for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO_V")); + add_opt(common_arg( + {"-hffv", "--hf-file-v"}, "FILE", + "Hugging Face model file for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", diff --git a/common/common.cpp b/common/common.cpp index 05d3ba766e38b..20be9291161ca 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1095,7 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 -static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; while (remaining_attempts > 0) { @@ -1119,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { @@ -1192,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa std::string etag; std::string last_modified; }; + common_load_model_from_url_headers headers; + { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); diff --git a/common/common.h b/common/common.h index c09c4eb0d3628..1d2bd932c211d 100644 --- a/common/common.h +++ b/common/common.h @@ -175,7 +175,11 @@ struct common_params_speculative { }; struct common_params_vocoder { - std::string model = ""; // vocoder model for producing audio // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + + std::string model = ""; // model path // NOLINT + std::string model_url = ""; // model url to download // NOLINT }; struct common_params { diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 9c3d58f891703..7f36b80f0dee1 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -461,7 +461,12 @@ int main(int argc, char ** argv) { model_ttc = llama_init_ttc.model; ctx_ttc = 
llama_init_ttc.context; - params.model = params.vocoder.model; + // TODO: refactor in a common struct + params.model = params.vocoder.model; + params.model_url = params.vocoder.model_url; + params.hf_repo = params.vocoder.hf_repo; + params.hf_file = params.vocoder.hf_file; + params.embedding = true; common_init_result llama_init_cts = common_init_from_params(params);
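
Note on the remaining TODO in tts-outetts.py: patch 41 prints the spectrogram dimensions but leaves the spectrogram-to-audio conversion unimplemented on the Python side ("TODO: see the tts.cpp:embd_to_audio() and implement it in Python"). Below is a minimal NumPy sketch of what such a port of tts.cpp:embd_to_audio() and save_wav16() could look like. The constants (n_fft = 1280, n_hop = 320, n_win = 1280) and the log-magnitude/phase layout are taken from the C++ code in patch 41; the final 1/N normalization of the naive inverse transform, the overlap-add epsilon, and the 24 kHz sample rate are assumptions, since those parts are truncated or absent in this excerpt. This is an untested illustration, not part of the patch series.

import wave
import numpy as np

def embd_to_audio(embd, n_codes, n_embd):
    # expects n_embd == n_fft + 2 (1282), so that n_embd/2 == n_fft/2 + 1 bins
    E = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)

    n_fft, n_hop, n_win = 1280, 320, 1280
    n_pad = (n_win - n_hop) // 2            # 480
    n_out = (n_codes - 1) * n_hop + n_win

    hann = np.hanning(n_fft + 1)[:-1]       # periodic Hann, as in fill_hann_window()

    # first half of each embedding row is log-magnitude, second half is phase
    mag = np.minimum(np.exp(E[:, :n_embd // 2]), 1e2)
    phi = E[:, n_embd // 2:]
    spec = mag * (np.cos(phi) + 1j * np.sin(phi))       # (n_codes, n_fft/2 + 1)

    # naive inverse transform mirroring tts.cpp:irfft(): sum over the stored
    # bins only; the trailing division by N is assumed from the upstream code
    N = n_fft // 2 + 1
    tw = np.exp(2j * np.pi * np.outer(np.arange(n_fft), np.arange(N)) / n_fft)
    frames = (spec @ tw.T).real / N                     # (n_codes, n_fft)

    res   = frames * hann                               # windowed frames
    hann2 = np.tile(hann * hann, (n_codes, 1))          # window envelope

    def fold(rows):
        # overlap-add with hop n_hop, then trim n_pad samples on both sides,
        # matching the net effect of tts.cpp:fold()
        out = np.zeros(n_out)
        for l in range(n_codes):
            out[l * n_hop : l * n_hop + n_win] += rows[l]
        return out[n_pad : n_out - n_pad]

    audio = fold(res)
    env   = fold(hann2)
    return audio / np.maximum(env, 1e-8)                # epsilon is an assumption

def save_wav16(path, audio, sample_rate=24000):         # 24 kHz is an assumption
    # 16-bit mono PCM, analogous to tts.cpp:save_wav16()
    pcm = np.clip(np.asarray(audio) * 32767.0, -32768, 32767).astype('<i2')
    with wave.open(path, 'wb') as f:
        f.setnchannels(1)   # mono
        f.setsampwidth(2)   # 16-bit samples
        f.setframerate(sample_rate)
        f.writeframes(pcm.tobytes())

Usage, continuing tts-outetts.py after the embedding response is parsed:

audio = embd_to_audio(embd, n_codes, n_embd)
save_wav16('output.wav', audio)

Separately, with patch 45 applied the vocoder model no longer has to be a local file: the new -hfrv/--hf-repo-v and -hffv/--hf-file-v arguments (and the LLAMA_ARG_HF_REPO_V / LLAMA_ARG_HF_FILE_V environment variables) download it from Hugging Face, mirroring the existing -hfr/-hff handling for the main model.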