1717
1818
1919class UtteranceCloner :
20+ """
21+ Clone the prosody of an utterance, optionally exchanging the speaker.
2022
21- def __init__ (self , model_id , device ):
22- self .tts = ToucanTTSInterface (device = device , tts_model_path = model_id )
23+ Useful for privacy applications.
24+ """
25+
26+ def __init__ (self , model_id , device , language = "en" , speed_over_quality = False ):
27+ if (device == torch .device ("cpu" ) or device == "cpu" ) and not speed_over_quality :
28+ print ("Warning: You are running BigVGAN on CPU. Consider either switching to GPU or setting the speed_over_quality option to True." )
29+ self .tts = ToucanTTSInterface (device = device , tts_model_path = model_id , faster_vocoder = speed_over_quality )
2330 self .ap = AudioPreprocessor (input_sr = 16000 , output_sr = 16000 , melspec_buckets = 80 , hop_length = 256 , n_fft = 1024 , cut_silence = False )
24- self .tf = ArticulatoryCombinedTextFrontend (language = "en" )
31+ self .tf = ArticulatoryCombinedTextFrontend (language = language )
2532 self .device = device
2633 acoustic_checkpoint_path = os .path .join (MODELS_DIR , "Aligner" , "aligner.pt" )
2734 self .aligner_weights = torch .load (acoustic_checkpoint_path , map_location = 'cpu' )["asr_model" ]
@@ -153,12 +160,12 @@ def clone_utterance(self,
153160 start_sil = torch .zeros ([silence_frames_start * 3 ]).to (self .device ) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
154161 end_sil = torch .zeros ([silence_frames_end * 3 ]).to (self .device ) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
155162 cloned_speech = self .tts (reference_transcription , view = False , durations = duration , pitch = pitch , energy = energy )
156- cloned_utt = torch .cat ((start_sil , cloned_speech , end_sil ), dim = 0 )
163+ cloned_utt = torch .cat ((start_sil , cloned_speech , end_sil ), dim = 0 ). cpu (). numpy ()
157164 if filename_of_result is not None :
158- sf .write (file = filename_of_result , data = cloned_utt . cpu (). numpy () , samplerate = 24000 )
165+ sf .write (file = filename_of_result , data = cloned_utt , samplerate = 24000 )
159166 if clone_speaker_identity :
160167 self .tts .default_utterance_embedding = prev_embedding .to (self .device ) # return to normal
161- return cloned_utt . cpu (). numpy ()
168+ return cloned_utt
162169
163170 def biblical_accurate_angel_mode (self ,
164171 path_to_reference_audio ,
@@ -178,8 +185,8 @@ def biblical_accurate_angel_mode(self,
178185 self .tts .set_utterance_embedding (path_to_reference_audio = path )
179186 list_of_cloned_speeches .append (self .tts (reference_transcription , view = False , durations = duration , pitch = pitch , energy = energy ))
180187 cloned_speech = torch .stack (list_of_cloned_speeches ).mean (dim = 0 )
181- cloned_utt = torch .cat ((start_sil , cloned_speech , end_sil ), dim = 0 )
188+ cloned_utt = torch .cat ((start_sil , cloned_speech , end_sil ), dim = 0 ). cpu (). numpy ()
182189 if filename_of_result is not None :
183- sf .write (file = filename_of_result , data = cloned_utt . cpu (). numpy () , samplerate = 24000 )
190+ sf .write (file = filename_of_result , data = cloned_utt , samplerate = 24000 )
184191 self .tts .default_utterance_embedding = prev_embedding .to (self .device ) # return to normal
185- return cloned_utt . cpu (). numpy ()
192+ return cloned_utt
0 commit comments