Skip to content

Commit 5f1dce3

Browse files
committed
final preparations for release
1 parent 5144760 commit 5f1dce3

File tree

6 files changed

+24
-17
lines changed

6 files changed

+24
-17
lines changed

InferenceInterfaces/ControllableInterface.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,6 @@ def read(self,
119119
energy_variance_scale=energy_variance_scale,
120120
pause_duration_scaling_factor=pause_duration_scaling_factor,
121121
return_plot_as_filepath=True)
122-
return 24000, wav, fig
122+
wav = wav.cpu().numpy()
123+
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
124+
return 48000, wav, fig

InferenceInterfaces/ToucanTTSInterface.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def read_to_file(self,
286286
pitch_variance_scale=pitch_variance_scale,
287287
energy_variance_scale=energy_variance_scale).cpu()), 0)
288288
wav = torch.cat((wav, silence), 0)
289-
wav = [val for val in wav for _ in (0, 1)]
289+
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
290290
soundfile.write(file=file_location, data=wav, samplerate=48000)
291291

292292
def read_aloud(self,
@@ -304,7 +304,7 @@ def read_aloud(self,
304304
pitch_variance_scale=pitch_variance_scale,
305305
energy_variance_scale=energy_variance_scale).cpu()
306306
wav = torch.cat((wav, torch.zeros([12000])), 0)
307-
wav = [val for val in wav for _ in (0, 1)]
307+
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
308308
sounddevice.play(wav, samplerate=48000)
309309
if blocking:
310310
sounddevice.wait()

InferenceInterfaces/UtteranceCloner.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,18 @@
1717

1818

1919
class UtteranceCloner:
20+
"""
21+
Clone the prosody of an utterance, but exchange the speaker (or don't)
2022
21-
def __init__(self, model_id, device):
22-
self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id)
23+
Useful for Privacy Applications
24+
"""
25+
26+
def __init__(self, model_id, device, language="en", speed_over_quality=False):
27+
if (device == torch.device("cpu") or device == "cpu") and not speed_over_quality:
28+
print("Warning: You are running BigVGAN on CPU. Consider either switching to GPU or setting the speed_over_quality option to True.")
29+
self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id, faster_vocoder=speed_over_quality)
2330
self.ap = AudioPreprocessor(input_sr=16000, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
24-
self.tf = ArticulatoryCombinedTextFrontend(language="en")
31+
self.tf = ArticulatoryCombinedTextFrontend(language=language)
2532
self.device = device
2633
acoustic_checkpoint_path = os.path.join(MODELS_DIR, "Aligner", "aligner.pt")
2734
self.aligner_weights = torch.load(acoustic_checkpoint_path, map_location='cpu')["asr_model"]
@@ -153,12 +160,12 @@ def clone_utterance(self,
153160
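start_sil = torch.zeros([silence_frames_start * 3]).to(self.device) # timestamps are from 16kHz; the factor 3 converts them to 48kHz sample counts. NOTE(review): the result is written with samplerate=24000 further down, which would need a factor of 1.5 - confirm the intended output rate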
154161
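end_sil = torch.zeros([silence_frames_end * 3]).to(self.device) # timestamps are from 16kHz; the factor 3 converts them to 48kHz sample counts. NOTE(review): the result is written with samplerate=24000 further down, which would need a factor of 1.5 - confirm the intended output rate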
155162
cloned_speech = self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy)
156-
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
163+
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
157164
if filename_of_result is not None:
158-
sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
165+
sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
159166
if clone_speaker_identity:
160167
self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal
161-
return cloned_utt.cpu().numpy()
168+
return cloned_utt
162169

163170
def biblical_accurate_angel_mode(self,
164171
path_to_reference_audio,
@@ -178,8 +185,8 @@ def biblical_accurate_angel_mode(self,
178185
self.tts.set_utterance_embedding(path_to_reference_audio=path)
179186
list_of_cloned_speeches.append(self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy))
180187
cloned_speech = torch.stack(list_of_cloned_speeches).mean(dim=0)
181-
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
188+
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
182189
if filename_of_result is not None:
183-
sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
190+
sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
184191
self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal
185-
return cloned_utt.cpu().numpy()
192+
return cloned_utt

run_controllable_GUI.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def read(self,
120120
0.0, # slider 4 did not have a meaningful interpretation, too many properties mixed
121121
emb5,
122122
emb6)
123-
return (sr, float2pcm(wav.cpu().numpy())), fig
123+
return (sr, float2pcm(wav)), fig
124124

125125

126126
if __name__ == '__main__':

run_interactive_demo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
if __name__ == '__main__':
1111
warnings.filterwarnings("ignore", category=UserWarning)
1212

13-
PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Nancy", "best.pt")
13+
PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")
1414
PATH_TO_VOCODER_MODEL = os.path.join(MODELS_DIR, "BigVGAN", "best.pt")
1515
PATH_TO_REFERENCE_SPEAKER = "" # audios/speaker_references_for_testing/female_high_voice.wav
1616
LANGUAGE = "en"

run_prosody_override.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,5 @@
1616
filename_of_result="audios/test_cloned_angelic.wav",
1717
list_of_speaker_references_for_ensemble=["audios/speaker_references_for_testing/female_high_voice.wav",
1818
"audios/speaker_references_for_testing/female_mid_voice.wav",
19-
"audios/speaker_references_for_testing/male_low_voice.wav",
20-
"audios/LibriTTS/174/168635/174_168635_000019_000001.wav",
21-
"audios/test.wav"],
19+
"audios/speaker_references_for_testing/male_low_voice.wav"],
2220
lang="en")

0 commit comments

Comments
 (0)