add prosody cloning and bigvgan vocoder for increased variance

Flux9665 · Flux9665 · commit 5db335c72100 · 2023-12-13T14:23:46.000+01:00
diff --git a/Architectures/Vocoder/BigVGAN.py b/Architectures/Vocoder/BigVGAN.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+
+import torch
+from alias_free_torch import Activation1d
+from torch.nn import Conv1d
+from torch.nn import ConvTranspose1d
+from torch.nn import ModuleList
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import weight_norm
+
+from Architectures.Vocoder.AMP import AMPBlock1
+from Architectures.Vocoder.Snake import SnakeBeta
+
+
+class BigVGAN(torch.nn.Module):
+    """BigVGAN generator: transposed-conv upsampling stages, each followed by averaged AMP resblocks."""
+
+    def __init__(self,
+                 num_mels=128,
+                 upsample_initial_channel=512,
+                 upsample_rates=(8, 6, 4, 2),  # CAREFUL: Avocodo discriminator assumes that there are always 4 upsample scales, because it takes intermediate results.
+                 upsample_kernel_sizes=(16, 12, 8, 4),
+                 resblock_kernel_sizes=(3, 7, 11),
+                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+                 weights=None
+                 ):
+        super().__init__()
+        # zip() truncates silently; a too-short partner list would make forward() index modules that were never built
+        if len(upsample_kernel_sizes) < len(upsample_rates):
+            raise ValueError("upsample_kernel_sizes needs one entry per element of upsample_rates")
+        if len(resblock_dilation_sizes) < len(resblock_kernel_sizes):
+            raise ValueError("resblock_dilation_sizes needs one entry per element of resblock_kernel_sizes")
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+
+        self.conv_pre = weight_norm(Conv1d(num_mels, upsample_initial_channel, 7, 1, padding=3))
+
+        # transposed conv-based upsamplers. does not apply anti-aliasing
+        self.ups = ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(ModuleList([
+                weight_norm(ConvTranspose1d(upsample_initial_channel // (2 ** i),
+                                            upsample_initial_channel // (2 ** (i + 1)),
+                                            k, u, padding=(k - u) // 2))
+            ]))
+
+        # residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        self.resblocks = ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
+                self.resblocks.append(AMPBlock1(ch, k, d))
+
+        # post conv (ch still holds the channel count of the last upsampling stage)
+        self.activation_post = Activation1d(activation=SnakeBeta(ch, alpha_logscale=True))
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+
+        # weight initialization
+        for up in self.ups:
+            up.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+        # for Avocodo discriminator: channel counts match the outputs of stage index 1 and 2
+        self.out_proj_x1 = torch.nn.Conv1d(upsample_initial_channel // 4, 1, 7, 1, padding=3)
+        self.out_proj_x2 = torch.nn.Conv1d(upsample_initial_channel // 8, 1, 7, 1, padding=3)
+
+        if weights is not None:
+            self.load_state_dict(weights)
+
+    def forward(self, x):
+        """Return (waveform, x2, x1); x1 / x2 project the features after stage index 1 / 2 to one channel."""
+        x = self.conv_pre(x)
+        # pre-bind the taps: with fewer than 3 stages they are returned as None instead of raising UnboundLocalError
+        x1 = x2 = None
+        for i in range(self.num_upsamples):
+            for up in self.ups[i]:
+                x = up(x)
+            # AMP blocks: average the outputs of the num_kernels resblocks that belong to stage i
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+            if i == 1:
+                x1 = self.out_proj_x1(x)
+            elif i == 2:
+                x2 = self.out_proj_x2(x)
+        # post conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x, x2, x1
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from the upsamplers, the resblocks and the pre/post convs."""
+        print('Removing weight norm...')
+        for stage in self.ups:
+            for up in stage:
+                remove_weight_norm(up)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Re-draw m.weight from a normal distribution if the class name of m contains "Conv"."""
+    if "Conv" in m.__class__.__name__:
+        m.weight.data.normal_(mean, std)
+
+
+def apply_weight_norm(m):
+    """Apply weight_norm to m if the class name of m contains "Conv"; other modules are left alone."""
+    if "Conv" in m.__class__.__name__:
+        weight_norm(m)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int(dilation * (kernel_size - 1) / 2)  # half of the dilated kernel extent, truncated to int
+
+
+if __name__ == '__main__':
+    print(BigVGAN()(torch.randn(1, 128, 100))[0].shape)  # smoke test: shape of the first output for a random (1, 128, 100) input
diff --git a/InferenceInterfaces/ToucanTTSInterface.py b/InferenceInterfaces/ToucanTTSInterface.py
@@ -11,6 +11,7 @@
 
 from Architectures.EmbeddingModel.StyleEmbedding import StyleEmbedding
 from Architectures.ToucanTTS.InferenceToucanTTS import ToucanTTS
+from Architectures.Vocoder.BigVGAN import BigVGAN
 from Architectures.Vocoder.HiFiGAN_Generator import HiFiGAN
 from Preprocessing.AudioPreprocessor import AudioPreprocessor
 from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
@@ -28,7 +29,7 @@ def __init__(self,
                  vocoder_model_path=os.path.join(MODELS_DIR, f"Vocoder", "best.pt"),  # path to the Vocoder checkpoint
                  embedding_model_path=None,
                  language="eng",  # initial language of the model, can be changed later with the setter methods
-                 ):
+                 use_bigvgan=False):
         super().__init__()
         self.device = device
         if not tts_model_path.endswith(".pt"):
@@ -67,8 +68,13 @@ def __init__(self,
         ################################
         #  load mel to wave model      #
         ################################
-        vocoder_checkpoint = torch.load(vocoder_model_path, map_location="cpu")
-        self.vocoder = HiFiGAN()
+        if use_bigvgan:
+            vocoder_checkpoint = torch.load(os.path.join(MODELS_DIR, f"Vocoder", "bigvgan.pt"), map_location="cpu")
+            self.vocoder = BigVGAN()
+        else:
+            vocoder_checkpoint = torch.load(vocoder_model_path, map_location="cpu")
+            self.vocoder = HiFiGAN()
+
         self.vocoder.load_state_dict(vocoder_checkpoint)
         self.vocoder = self.vocoder.to(device).eval()
         self.vocoder.remove_weight_norm()
diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py
@@ -23,8 +23,8 @@ class UtteranceCloner:
     Useful for Privacy Applications
     """
 
-    def __init__(self, model_id, device, language="eng"):
-        self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id)
+    def __init__(self, model_id, device, language="eng", use_bigvgan=False):
+        self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id, use_bigvgan=use_bigvgan)
         self.ap = AudioPreprocessor(input_sr=100, output_sr=16000, cut_silence=False)
         self.tf = ArticulatoryCombinedTextFrontend(language=language)
         self.device = device
@@ -41,7 +41,7 @@ def __init__(self, model_id, device, language="eng"):
         torch.set_grad_enabled(True)  # finding this issue was very infuriating: silero sets
         # this to false globally during model loading rather than using inference_mode or no_grad
 
-    def extract_prosody(self, transcript, ref_audio_path, lang="eng", on_line_fine_tune=True):
+    def extract_prosody(self, transcript, ref_audio_path, lang="eng", on_line_fine_tune=False):
         acoustic_model = Aligner()
         acoustic_model.load_state_dict(self.aligner_weights)
         acoustic_model = acoustic_model.to(self.device)
diff --git a/run_asvspoof_generation.py b/run_asvspoof_generation.py
@@ -11,6 +11,7 @@
 PATH_TO_GENERATION_FILE = "p1_ttsvc_surrogate.tsv"
 PATH_TO_OUTPUT_DIR = "asv_spoof_outputs_no_pros"
 DEVICE = "cuda"
+USE_BIGVGAN = False
 
 
 def build_path_to_transcript_dict_mls_english():
@@ -28,7 +29,7 @@ def build_path_to_transcript_dict_mls_english():
 
 if __name__ == '__main__':
     print("loading model...")
-    tts = ToucanTTSInterface(device=DEVICE, tts_model_path="ASVSpoof")
+    tts = ToucanTTSInterface(device=DEVICE, tts_model_path="ASVSpoof", use_bigvgan=USE_BIGVGAN)
     print("prepare path to transcript lookup...")
     path_to_transcript_dict = build_path_to_transcript_dict_mls_english()
     filename_to_path = dict()
diff --git a/run_asvspoof_generation_with_prosody_cloning.py b/run_asvspoof_generation_with_prosody_cloning.py
@@ -0,0 +1,57 @@
+import os
+
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+
+from InferenceInterfaces.UtteranceCloner import UtteranceCloner
+from Utility.utils import float2pcm
+
+PATH_TO_MLS_ENGLISH_TRAIN = "/mount/resources/speech/corpora/MultiLingLibriSpeech/mls_english/train"
+PATH_TO_GENERATION_FILE = "p1_ttsvc_surrogate.tsv"
+PATH_TO_OUTPUT_DIR = "asv_spoof_outputs_with_pros"
+DEVICE = "cuda"
+USE_BIGVGAN = False
+
+
+def build_path_to_transcript_dict_mls_english():
+    """Map the .flac path derived from each utterance ID in transcripts.txt to its transcript."""
+    with open(os.path.join(PATH_TO_MLS_ENGLISH_TRAIN, "transcripts.txt"), "r", encoding="utf8") as file:
+        rows = [row.split("\t") for row in file.read().split("\n") if row.strip() != ""]
+    path_to_transcript = dict()
+    for columns in rows:
+        # the first two "_"-separated parts of the utterance ID name the nested audio folders
+        id_parts = columns[0].split("_")
+        audio_path = f"{PATH_TO_MLS_ENGLISH_TRAIN}/audio/{id_parts[0]}/{id_parts[1]}/{columns[0]}.flac"
+        path_to_transcript[audio_path] = columns[1]
+    return path_to_transcript
+
+
+if __name__ == '__main__':
+    # Batch script: clone voice and prosody for every row of the generation file and write 16 kHz FLAC files.
+    print("loading model...")
+    uc = UtteranceCloner(model_id="ASVSpoof", device=DEVICE, language="eng", use_bigvgan=USE_BIGVGAN)
+    print("prepare path to transcript lookup...")
+    path_to_transcript_dict = build_path_to_transcript_dict_mls_english()
+    # str.rstrip(".flac") strips a character set, not a suffix, and would also eat trailing
+    # 'f', 'l', 'a', 'c' or '.' characters of the utterance ID; splitext removes only the extension.
+    filename_to_path = {os.path.splitext(p.split("/")[-1])[0]: p for p in path_to_transcript_dict}
+    with open(PATH_TO_GENERATION_FILE, "r") as file:
+        generation_list = file.read().split("\n")
+    os.makedirs(PATH_TO_OUTPUT_DIR, exist_ok=True)
+    print("generating audios...")
+    for generation_item in tqdm(generation_list):
+        if generation_item.strip() == "":
+            continue  # skip blank and whitespace-only lines instead of failing on the unpack below
+        # columns: speaker ID, comma-separated voice references, ignored field, prosody reference, output name
+        speaker_id, voice_sources, _, prosody_source, output_name = generation_item.split()
+        prosody_path = filename_to_path[prosody_source]
+        source_list = [filename_to_path[source] for source in voice_sources.split(",")]
+
+        cloned_utterance = uc.clone_utterance(path_to_reference_audio_for_voice=source_list,
+                                              path_to_reference_audio_for_intonation=prosody_path,
+                                              transcription_of_intonation_reference=path_to_transcript_dict[prosody_path])
+
+        # NOTE(review): assumes the cloner returns 24 kHz audio, as the original resample call did — confirm
+        resampled_utt = librosa.resample(cloned_utterance, orig_sr=24000, target_sr=16000)
+        sf.write(file=f"{PATH_TO_OUTPUT_DIR}/" + output_name + ".flac", data=float2pcm(resampled_utt), samplerate=16000, subtype="PCM_16")
diff --git a/run_model_downloader.py b/run_model_downloader.py
@@ -32,13 +32,21 @@ def download_models():
         reporthook=report)
 
     #############
-    print("Downloading Vocoder")
+    print("Downloading HiFiGAN Vocoder")
     os.makedirs(os.path.join(MODELS_DIR, "Vocoder"), exist_ok=True)
     filename, headers = urllib.request.urlretrieve(
         url="https://github.com/DigitalPhonetics/IMS-Toucan/releases/download/v2.asvspoof/hifigan.pt",
         filename=os.path.abspath(os.path.join(MODELS_DIR, "Vocoder", "best.pt")),
         reporthook=report)
 
+    #############
+    print("Downloading BigVGAN Vocoder")
+    os.makedirs(os.path.join(MODELS_DIR, "Vocoder"), exist_ok=True)
+    filename, headers = urllib.request.urlretrieve(
+        url="https://github.com/DigitalPhonetics/IMS-Toucan/releases/download/v2.asvspoof/bigvgan.pt",
+        filename=os.path.abspath(os.path.join(MODELS_DIR, "Vocoder", "bigvgan.pt")),
+        reporthook=report)
+
     #############
     print("Downloading Embedding Model")
     os.makedirs(os.path.join(MODELS_DIR, "Embedding"), exist_ok=True)