|
2 | 2 |
|
3 | 3 | import librosa |
4 | 4 | import soundfile as sf |
5 | | -import torch |
6 | 5 | from tqdm import tqdm |
7 | 6 |
|
8 | 7 | from InferenceInterfaces.ToucanTTSInterface import ToucanTTSInterface |
9 | 8 | from Utility.utils import float2pcm |
10 | 9 |
|
| 10 | +PATH_TO_MLS_ENGLISH_TRAIN = "/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_english/train" |
11 | 11 | PATH_TO_GENERATION_FILE = "p1_ttsvc_surrogate.tsv" |
12 | 12 | PATH_TO_OUTPUT_DIR = "asv_spoof_outputs_no_pros" |
13 | 13 | DEVICE = "cuda" |
14 | 14 |
|
| 15 | + |
def build_path_to_transcript_dict_mls_english(root=None):
    """
    Build a lookup from audio file path to transcript for an MLS-style corpus directory.

    The file ``transcripts.txt`` directly inside the corpus root is read. Every
    non-blank line is expected to be tab-separated, with the utterance id in the
    first field and the transcript in the second. The first two
    underscore-separated parts of the utterance id name the two nested folders
    below ``audio/`` in which the ``.flac`` file is expected.

    Args:
        root: Corpus directory that contains ``transcripts.txt`` and ``audio/``.
            Defaults to the module-level ``PATH_TO_MLS_ENGLISH_TRAIN``.

    Returns:
        A dict mapping ``<root>/audio/<part0>/<part1>/<utterance_id>.flac`` to the
        transcript text. The paths are always joined with forward slashes and are
        not checked for existence.

    Raises:
        OSError: If ``transcripts.txt`` cannot be opened.
        IndexError: If a non-blank line has no tab, or its utterance id contains
            no underscore.
    """
    if root is None:
        root = PATH_TO_MLS_ENGLISH_TRAIN
    path_to_transcript = dict()
    with open(os.path.join(root, "transcripts.txt"), "r", encoding="utf8") as file:
        # Stream the file line by line instead of read() + split(), which would
        # hold the complete text and a second, split copy in memory at once.
        for line in file:
            # Iterating a file keeps the line terminator; drop it so the
            # transcript text is identical to what a split on "\n" would give.
            line = line.rstrip("\n")
            if line.strip() == "":
                continue
            fields = line.split("\t")
            utterance_id = fields[0]
            wav_folders = utterance_id.split("_")
            # Forward slashes are used on purpose: callers recover the file
            # name by splitting the key on "/".
            wav_path = f"{root}/audio/{wav_folders[0]}/{wav_folders[1]}/{utterance_id}.flac"
            path_to_transcript[wav_path] = fields[1]
    return path_to_transcript
| 27 | + |
| 28 | + |
15 | 29 | if __name__ == '__main__': |
| 30 | + print("loading model...") |
16 | 31 | tts = ToucanTTSInterface(device=DEVICE, tts_model_path="ASVSpoof") |
17 | | - path_to_transcript_dict = torch.load("mls_transcript_cache.pt") |
| 32 | + print("prepare path to transcript lookup...") |
| 33 | + path_to_transcript_dict = build_path_to_transcript_dict_mls_english() |
18 | 34 | filename_to_path = dict() |
19 | 35 | for p in path_to_transcript_dict: |
20 | 36 | filename_to_path[p.split("/")[-1].rstrip(".flac")] = p |
21 | 37 | with open(PATH_TO_GENERATION_FILE, "r") as file: |
22 | 38 | generation_list = file.read().split("\n") |
23 | 39 | os.makedirs(PATH_TO_OUTPUT_DIR, exist_ok=True) |
24 | | - |
| 40 | + print("generating audios...") |
25 | 41 | for generation_item in tqdm(generation_list): |
26 | 42 | if generation_item == "": |
27 | 43 | continue |
|
34 | 50 | tts.set_utterance_embedding(path_to_reference_audio=source_list) |
35 | 51 | cloned_utterance = tts(transcript) |
36 | 52 | resampled_utt = librosa.resample(cloned_utterance, orig_sr=24000, target_sr=16000) |
37 | | - sf.write(file=f"{PATH_TO_OUTPUT_DIR}/" + output_name + ".flac", |
38 | | - data=float2pcm(resampled_utt), |
39 | | - samplerate=16000, |
40 | | - subtype="PCM_16") |
| 53 | + sf.write(file=f"{PATH_TO_OUTPUT_DIR}/" + output_name + ".flac", data=float2pcm(resampled_utt), samplerate=16000, subtype="PCM_16") |
0 commit comments