Replies: 97 comments 840 replies
-
|
Hello @SWivid, do you think it is possible to fine-tune the pretrained model on a new language? I'm planning to add another language, Italian + English (to avoid catastrophic forgetting). |
Beta Was this translation helpful? Give feedback.
-
Audio files of maybe 3-12 s duration (I'm not sure what's good) and their transcripts:
/your_dataset
|-- metadata.csv
|-- wavs/
|   |-- audio_0001.wav
|   |-- audio_0002.wav
|   `-- ...
metadata.csv contents:
python scripts/prepare_csv_wavs.py <path_to_your_dataset> <F5-TTS_repo_data_path>/<dataset_name>_pinyin
example: python scripts/prepare_csv_wavs.py /my_pc/your_dataset /my_pc/F5-TTS/data/your_dataset_pinyin
Set the dataset name to the name of your dataset in the F5-TTS data folder:
dataset_name = "your_dataset"
Play around with these parameters and see what gives the best results. Set max samples to 2, or whatever you see fit:
max_samples = 2
Also play around with the learning rate; I don't know which one is best:
learning_rate = 5e-06
Change epochs and warmup to whatever you see fit for your dataset:
epochs = 10 # use linear decay, thus epochs control the slope
num_warmup_updates = 20 # warmup steps
Adjust this to your dataset size, e.g. for 100 audio files and 2 max samples, maybe 500:
last_per_steps = 500 # save last checkpoint per steps
hopefully we find good hyperparams for good finetuning results could put it doesn't handle other tokenizers, always assumes english dataset and
import sys, os
sys.path.append(os.getcwd())
from pathlib import Path
import json
import shutil
import argparse
from tqdm import tqdm
from datasets.arrow_writer import ArrowWriter
from model.utils import (
convert_char_to_pinyin,
)
# Location of the pretrained vocab.txt, resolved relative to the directory two
# levels above this script file. When preparing a fine-tune,
# save_prepped_dataset copies this file over the output vocab.txt.
PRETRAINED_VOCAB_PATH = Path(__file__).parent.parent / "data/Emilia_ZH_EN_pinyin/vocab.txt"
def is_csv_wavs_format(input_dataset_dir):
    """Tell whether a directory follows the csv_wavs dataset layout.

    The layout requires a regular file ``metadata.csv`` and a directory
    ``wavs`` directly inside *input_dataset_dir*.
    """
    root = Path(input_dataset_dir)
    has_metadata = (root / "metadata.csv").is_file()
    has_wavs = (root / "wavs").is_dir()
    return has_metadata and has_wavs
def prepare_csv_wavs_dir(input_dir):
    """Collect training samples from a csv_wavs dataset directory.

    Reads ``metadata.csv`` inside *input_dir*, skips (with a printed notice)
    every entry whose audio file does not exist, measures each remaining clip
    with ``get_audio_duration`` and converts its transcript with
    ``convert_char_to_pinyin``.

    Returns:
        A tuple ``(samples, durations, vocab)`` where ``samples`` is a list of
        dicts with keys ``audio_path``, ``text`` and ``duration``,
        ``durations`` is the list of clip durations in the same order, and
        ``vocab`` is the set of elements found in the converted transcripts.

    Raises:
        AssertionError: if *input_dir* is not in csv_wavs format.
    """
    assert is_csv_wavs_format(input_dir), f"not csv_wavs format: {input_dir}"
    metadata_file = Path(input_dir) / "metadata.csv"

    samples = []
    clip_lengths = []
    seen_tokens = set()
    use_polyphone = True

    for wav_path, transcript in read_audio_text_pairs(metadata_file.as_posix()):
        if Path(wav_path).exists():
            seconds = get_audio_duration(wav_path)
            # assume tokenizer = "pinyin" ("pinyin" | "char")
            converted = convert_char_to_pinyin([transcript], polyphone=use_polyphone)[0]
            samples.append({"audio_path": wav_path, "text": converted, "duration": seconds})
            clip_lengths.append(seconds)
            seen_tokens.update(converted)
        else:
            print(f"audio {wav_path} not found, skipping")

    return samples, clip_lengths, seen_tokens
def get_audio_duration(audio_path):
    """Return the playback length of an audio file in seconds.

    ``torchaudio.load`` yields a ``(channels, frames)`` tensor together with
    the sample rate, so the duration is the frame count divided by the sample
    rate. The channel count must not enter the calculation: dividing by it
    as well under-reports the duration of multi-channel files (a stereo clip
    would come out at half its real length).

    Args:
        audio_path: path of the audio file to measure.

    Returns:
        Duration in seconds as a float.
    """
    import torchaudio

    audio, sample_rate = torchaudio.load(audio_path)
    return audio.shape[1] / sample_rate
def read_audio_text_pairs(csv_file_path):
    """Read ``(audio_path, text)`` pairs from a pipe-delimited metadata file.

    The first row is treated as a header and skipped. Every following row
    with at least two columns contributes one pair: column 0 is the audio
    path, taken relative to the directory containing the CSV file, and
    column 1 is the transcript. Both are stripped of surrounding whitespace.
    Rows with fewer than two columns are ignored; extra columns are unused.

    Args:
        csv_file_path: path of the metadata file (UTF-8, ``|`` delimited).

    Returns:
        A list of ``(posix_audio_path, text)`` tuples. An empty or
        header-only file yields an empty list.
    """
    import csv

    parent = Path(csv_file_path).parent
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter="|")
        # Skip the header row; the None default keeps a completely empty
        # file from raising a bare StopIteration.
        next(reader, None)
        return [
            ((parent / row[0].strip()).as_posix(), row[1].strip())
            for row in reader
            if len(row) >= 2
        ]
def save_prepped_dataset(out_dir, result, duration_list, text_vocab_set, is_finetune):
    """Write the prepared dataset files into *out_dir*.

    Creates the directory (including parents) if needed and writes:

    * ``raw.arrow``     - one record per entry of *result*
    * ``duration.json`` - ``{"duration": duration_list}``, duplicated
      separately for DynamicBatchSampler ease
    * ``vocab.txt``     - a copy of the pretrained vocab when *is_finetune*
      is true, otherwise the sorted *text_vocab_set*, one entry per line

    Finally prints the sample count, the size of *text_vocab_set* and the
    total duration in hours.

    Args:
        out_dir: output directory path.
        result: iterable of sample records to write to ``raw.arrow``.
        duration_list: list of clip durations in seconds.
        text_vocab_set: set of vocabulary entries collected from the texts.
        is_finetune: reuse the pretrained vocab instead of writing a new one.
    """
    out_dir = Path(out_dir)

    # save preprocessed dataset to disk
    out_dir.mkdir(exist_ok=True, parents=True)
    print(f"\nSaving to {out_dir} ...")

    # Records are streamed through ArrowWriter; the original author notes
    # that the Dataset.from_dict(...).save_to_disk(...) alternative went OOM.
    raw_arrow_path = out_dir / "raw.arrow"
    with ArrowWriter(path=raw_arrow_path.as_posix(), writer_batch_size=1) as writer:
        for line in tqdm(result, desc="Writing to raw.arrow ..."):
            writer.write(line)

    # dup a json separately saving duration in case for DynamicBatchSampler ease
    dur_json_path = out_dir / "duration.json"
    with open(dur_json_path.as_posix(), "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    # vocab map, i.e. tokenizer
    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    # if tokenizer == "pinyin":
    #     text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])
    voca_out_path = out_dir / "vocab.txt"
    if is_finetune:
        # Fine-tuning must keep the pretrained token ids, so reuse that vocab.
        shutil.copy2(PRETRAINED_VOCAB_PATH.as_posix(), voca_out_path)
    else:
        # Explicit UTF-8: vocab entries may be non-ASCII and the platform
        # default encoding is not guaranteed to be able to encode them.
        with open(voca_out_path, "w", encoding="utf-8") as f:
            for vocab in sorted(text_vocab_set):
                f.write(vocab + "\n")

    # .name rather than .stem: a directory name containing a dot must not be
    # truncated at the last dot.
    dataset_name = out_dir.name
    print(f"\nFor {dataset_name}, sample count: {len(result)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
def prepare_and_save_set(inp_dir, out_dir, is_finetune: bool = True):
    """Prepare the csv_wavs dataset in *inp_dir* and save it to *out_dir*.

    When *is_finetune* is true the pretrained vocab file must exist, since
    it is what gets saved as the dataset's vocab.

    Raises:
        AssertionError: if fine-tuning is requested and the pretrained
            vocab.txt is missing.
    """
    if is_finetune:
        assert PRETRAINED_VOCAB_PATH.exists(), f"pretrained vocab.txt not found: {PRETRAINED_VOCAB_PATH}"
    prepared = prepare_csv_wavs_dir(inp_dir)
    samples, clip_lengths, vocabulary = prepared
    save_prepped_dataset(out_dir, samples, clip_lengths, vocabulary, is_finetune)
def cli():
    """Parse command-line arguments and run the dataset preparation.

    Usage:
        finetune: python script.py /path/to/input_dir /path/to/output_dir
        pretrain: python script.py /path/to/input_dir /path/to/output_dir --pretrain
    """
    parser = argparse.ArgumentParser(description="Prepare and save dataset.")
    positional_help = {
        "inp_dir": "Input directory containing the data.",
        "out_dir": "Output directory to save the prepared data.",
    }
    for arg_name, help_text in positional_help.items():
        parser.add_argument(arg_name, type=str, help=help_text)
    parser.add_argument(
        "--pretrain",
        action="store_true",
        help="Enable for new pretrain, otherwise is a fine-tune",
    )

    options = parser.parse_args()
    # Fine-tuning is the default mode; --pretrain switches it off.
    fine_tuning = not options.pretrain
    prepare_and_save_set(options.inp_dir, options.out_dir, is_finetune=fine_tuning)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    cli()
Beta Was this translation helpful? Give feedback.
-
|
@kunibald413 Thank you for the script. I've already created something similar here: #62 (comment). Can you update this part? I think it would be nice to have it like this (before / after). |
Beta Was this translation helpful? Give feedback.
-
|
The code @kunibald413 has provided works. However, when training it seems to initialize from a model with random weights. Can we initialize from the trained model weights instead? |
Beta Was this translation helpful? Give feedback.
-
|
Just started my Spanish finetune from the Facebook LibriSpeech dataset. Single 4090, so it will take a while. |
Beta Was this translation helpful? Give feedback.
-
|
I am using a Chinese dataset (about 33hr) to fine-tune my model. The loss is continuously decreasing, and the generated voice tone is getting closer to the target. However, as the training steps increase, the pronunciation of words is becoming increasingly unclear.
parm |
Beta Was this translation helpful? Give feedback.
-
|
Hi, I just created a Gradio interface that is user-friendly and accessible for beginners; you can see it here. Features |
Beta Was this translation helpful? Give feedback.
-
|
I can confirm that training also works. I'm training 3 languages: Indonesian, Italian, and English. English: https://vocaroo.com/1mGEFlRNgouY Italian: https://voca.ro/1l6SYplhnSxz (Quattro imperdibili appuntamenti con l’Orchestra da Camera di Caserta e solisti internazionali.) Indonesian: It can even do code-switching (English-Indonesian): Using the same config as for training
|
Beta Was this translation helpful? Give feedback.
-
|
Hi, I was just wondering why you don't try training on a small dataset first instead of starting with a large one. For me, I trained on only 40 hours of Greek plus 20 hours (LibriTTS-R) focused on English, and it's working fine and speaks very well. In about half a day on the 4090, after about 100k to 150k steps, the model can speak Greek and English at the same time very well and has great zero-shot performance. Try it and see if this works for you. I hope this helps. |
Beta Was this translation helpful? Give feedback.
-
|
here the setting i use |
Beta Was this translation helpful? Give feedback.
-
|
Hi all, this is very important and might be confusing for some. You need to copy the original model. If you start training without copying this model, it will train from scratch! I’ve created a script called finetune-cli.py that can automate this process. However, before running the script, you need to update all the settings accordingly. Please make sure to do this before you start. Or you can simply run it and change only the dataset name (my_speak). On a 3090 with a dataset of about 60-80 hours it works well; on a 4090, as @JarodMica says, it works very well, also with a very big dataset. About the vocab: I don't replace anything, because it supports all the symbols in the language I train. Make sure it supports all the symbols in the language you want to train; if symbols are missing, it will not work correctly. Another idea, in case symbols are missing: you can simply convert all symbols to English-language ones. Here is how to check the vocab in finetune_gradio.py: make sure that in data/project_name/ you have a metadata.csv containing all the text. That's why I made gradio_finetune.py, so as not to confuse beginner users. I hope this helps. |
Beta Was this translation helpful? Give feedback.
-
|
@jpgallegoar I’m trying to train in Spanish as an experiment , let see this take some hours. I just hope the dataset I’m using is okay since I don’t speak Spanish. I’ll let you know soon. |
Beta Was this translation helpful? Give feedback.
-
|
Given a large dataset, how important is it that the transcription is 1-1 with the source audio? The reason I ask, most of my datasets are built using a Whisper model, and they often do some text compression and correct misspoken words or stutter. Is this TTS-architecture forgiving for those kinds of variations or inconsistencies in transcription, or should I consider using a more verbose Whisper model for creating this dataset? |
Beta Was this translation helpful? Give feedback.
-
|
Has anyone here tried finetuning the base model on a single speaker dataset? I tried finetuning with a 6 hr English dataset, but I don’t hear any difference after the training. |
Beta Was this translation helpful? Give feedback.
-
|
After much testing, I'm gonna have to give up on the spanish finetune for now. Anyway, if anyone wants it, here is the model: Link |
Beta Was this translation helpful? Give feedback.
-
|
Has anyone tested training on fp32 vs fp16 vs bf16? Is there a noticeable quality dropoff? Which is the best? |
Beta Was this translation helpful? Give feedback.
-
|
I'm finetuning models with F5-TTS via Pinokio but i'm struggling to identify how to use the models i've trained Would a kind person possibly update the Gradio UI for Pinokio and add an ability to automatically find and be able to select any of the finetuned models that have been trained / created to make it easy please? 😊 |
Beta Was this translation helpful? Give feedback.
-
|
I'm training 200hrs for pt-br reaching 1M steps, using google colab, half with A100 and half with T4, but it still not perfect, it is actually doing a little inference, but have some misspellings, and for numbers, just does not work. Is it possible to finetune it with a new dataset with only numbers and misspells? will it destroy the previous trainings? |
Beta Was this translation helpful? Give feedback.
-
Beta Was this translation helpful? Give feedback.
-
|
Has anyone been able to train this on a multi-GPU setup with 4090s (2 or more)? I am getting this - #728 (comment) |
Beta Was this translation helpful? Give feedback.
-
|
Has anyone tried parallelized training with multi GPUs? I mean getting parallel performance, not only more VRAM. Is it even possible? |
Beta Was this translation helpful? Give feedback.
-
|
quick note that might be relevant for anyone trying to finetune with low VRAM: if you're using frame batching, then batch size acts as a maximum frame length for any samples in your dataset. that is, any audio samples with frame length greater than batch size are dropped altogether and are not used during the finetuning. this doesn't matter with like batch size >= 3200 since training samples shouldn't be over 30s anyway, but if you're running, say, 1600 or 2000 batch size (hence probably most relevant to people with low VRAM), you're probably dropping some stuff from your dataset. i suppose this should be obvious in retrospect, but as someone who isn't knowledgeable about what some of these parameters actually mean, i think it's an easy oversight to make (i didn't even notice until i was experimenting with batch size <1000 and suddenly got a division by zero exception somewhere as a consequence of my entire dataset being greater than the batch size...) |
Beta Was this translation helpful? Give feedback.
-
|
Is it possible to finetune this model from the F5TTS_Base checkpoint with around 166 samples of training data? Each clip is around 10 sec long, normalized, re-sampled to 24000, and noise-reduced. At update 1100 the generation still sounds like white noise, and the loss is not decreasing. Samples uploaded. The YAML file looks like: hydra: datasets: optim: model: ckpts: Custom training samples and metadata CSV file were generated with prepare_csv_wavs.py. Are there any improvements I can make in the YAML file so that this finetuned model can sound as good as the zero-shot examples? Thanks a lot! |
Beta Was this translation helpful? Give feedback.
-
|
Hi @jpgallegoar I tried downloading your .safetensors and vocab spanish version and overwrite them in the 1.08 release. (I changed the file name from model_1200000 to model_1250000 ) It loaded and ran an spanish test but I only got gibberish as audio result. Do you know what could be the reason? Thanks in advance. |
Beta Was this translation helpful? Give feedback.
-
|
I tried finetuning the model for Persian data. I changed vocab.txt to include persian characters and used a 'custom' tokenizer with this new vocab.txt. My results weren't promising and it could be because of the lack of data (only 105 hours) or insufficient training (1.2 million steps). batch_size = 4200 Am I missing something else? |
Beta Was this translation helpful? Give feedback.
-
|
How do you structure the dataset folder or metadata.csv when you have multiple speakers and want to train a new language? There is no documentation regarding that. Should I just put all the training data into the wavs folder with all the speakers and proceed with the training? Like mentioned in #57 (comment) There are a lot of training data for many different languages on huggingface, they all show a speaker_ID in their database (sql browser), but when it comes to training with those data, there is no information about how to structure the folder or the metadata.csv |
Beta Was this translation helpful? Give feedback.
-
|
I think this model trains pretty well on multi-speaker datasets - even with only 100 hours of data. I have recorded 10s of hours of my own speech, quality microphone, no errors in transcripts/audio, and lengths are from 5-20 seconds. I also have some other speakers that have a few hours of audio also, and then I have 400 speakers that have around 20-50 minutes each. In theory, would training on my own speech skew the model away from performing well on the other speakers (my voice would then be maybe 30% of the overall audio)? Or would it just improve more if I add my 10s of hours to the mix? And does shuffling the dataset give any better results when training, when my own recordings are shuffled in with the other 500 speakers? I don't care as much about the voice-cloning part of the tts - I just really want a clear pronunciation in my language, so I don't mind the model being skewed towards my voice, if only it has good pronunciation. |
Beta Was this translation helpful? Give feedback.
-
|
So I am trying to finetune on my dataset which contains the Indian English accent. I have around 60 hours of audio data. Although the inference audio is clear, no noise, the audio that is generated does not resemble English at all. Any ideas so as to why this is happening? I made minimal changes to the hyperparameters while finetuning. And followed the procedure exactly as provided in the documentation. |
Beta Was this translation helpful? Give feedback.
-
|
Hi, I was trying to finetune from the pretrained weights of the Hindi model. I changed the path in train.py (checkpoint_paths) to the pretrained directory, and my config file is: hydra: datasets: optim: model: ckpts: The training finishes instantly, and I am not able to understand what mistake I am making. Below is the output: (f5tts) root@t1-le-45-gra7:/home/ubuntu/F5-TTS/src/f5_tts/configs# cd ../../.. |
Beta Was this translation helpful? Give feedback.
-
|
Hello! Super awesome model and repo! |
Beta Was this translation helpful? Give feedback.










Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
Full finetune is currently supported, lora or adapter not yet.
Point checkpoint_path to the pretrained model dir in test_train.py; model/trainer.py will load from there to resume. Reuse the vocab.txt under data/Emilia_ZH_EN_pinyin (Emilia_ZH_EN_pinyin <- tokenizer = "pinyin"; dataset_name = "Emilia_ZH_EN" in test_train.py setting). model/dataset.py: just need e.g. the audio path, text (tokenized; leverage the convert_char_to_pinyin func in model/utils.py, see script/prepare_xxxx.py), and the duration of the audio in seconds. grad_accumulation_steps could be used to simulate a large batch size. Also other settings, e.g. few warmup steps, 1e-4 lr, etc. We didn't specifically experiment with finetuning, so if you get positive results, welcome to share :)
Some helpful issues, #16 #27
Welcome to share your successful results with finetuning, maybe also start a new tutorial doc helping others to get start with it.
Many Thanks !
Beta Was this translation helpful? Give feedback.
All reactions