@@ -218,6 +218,22 @@ def load_model(
218218 return model
219219
220220
def remove_silence_edges(audio, silence_threshold=-42):
    """Trim silence from both ends of an audio segment.

    The leading edge is trimmed with ``silence.detect_leading_silence``. The
    trailing edge is trimmed by walking backwards one millisecond at a time
    until a slice louder than ``silence_threshold`` is found.

    Args:
        audio: The segment to trim. It must support ``len()`` and integer /
            slice indexing in milliseconds, with ``.dBFS`` on each slice
            (presumably a pydub ``AudioSegment`` -- confirm against callers).
        silence_threshold: Loudness in dBFS. A 1 ms slice at or below this
            level is treated as trailing silence.

    Returns:
        The segment with leading and trailing silence removed. The result is
        empty if everything after the leading trim is at or below the
        threshold.
    """
    # Remove silence from the start
    non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[non_silent_start_idx:]

    # Remove silence from the end. The cut point is tracked as an integer
    # millisecond index rather than a float duration decremented by 0.001:
    # repeated float subtraction drifts, and truncating the result with int()
    # could shave an extra millisecond of non-silent audio off the tail.
    non_silent_end_ms = len(audio)
    while non_silent_end_ms > 0 and audio[non_silent_end_ms - 1].dBFS <= silence_threshold:
        non_silent_end_ms -= 1

    return audio[:non_silent_end_ms]
235+
236+
221237# preprocess reference audio and text
222238
223239
@@ -229,7 +245,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
229245 if clip_short :
230246 # 1. try to find long silence for clipping
231247 non_silent_segs = silence .split_on_silence (
232- aseg , min_silence_len = 1000 , silence_thresh = - 50 , keep_silence = 1000
248+ aseg , min_silence_len = 1000 , silence_thresh = - 50 , keep_silence = 1000 , seek_step = 10
233249 )
234250 non_silent_wave = AudioSegment .silent (duration = 0 )
235251 for non_silent_seg in non_silent_segs :
@@ -241,7 +257,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
241257 # 2. try to find short silence for clipping if 1. failed
242258 if len (non_silent_wave ) > 15000 :
243259 non_silent_segs = silence .split_on_silence (
244- aseg , min_silence_len = 100 , silence_thresh = - 40 , keep_silence = 1000
260+ aseg , min_silence_len = 100 , silence_thresh = - 40 , keep_silence = 1000 , seek_step = 10
245261 )
246262 non_silent_wave = AudioSegment .silent (duration = 0 )
247263 for non_silent_seg in non_silent_segs :
@@ -256,7 +272,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
256272 if len (aseg ) > 15000 :
257273 aseg = aseg [:15000 ]
258274 show_info ("Audio is over 15s, clipping short. (3)" )
259-
275+ aseg = remove_silence_edges ( aseg ) + AudioSegment . silent ( duration = 50 )
260276 aseg .export (f .name , format = "wav" )
261277 ref_audio = f .name
262278
@@ -473,7 +489,9 @@ def infer_batch_process(
473489
474490def remove_silence_for_generated_wav (filename ):
475491 aseg = AudioSegment .from_file (filename )
476- non_silent_segs = silence .split_on_silence (aseg , min_silence_len = 1000 , silence_thresh = - 50 , keep_silence = 500 )
492+ non_silent_segs = silence .split_on_silence (
493+ aseg , min_silence_len = 1000 , silence_thresh = - 50 , keep_silence = 500 , seek_step = 10
494+ )
477495 non_silent_wave = AudioSegment .silent (duration = 0 )
478496 for non_silent_seg in non_silent_segs :
479497 non_silent_wave += non_silent_seg
0 commit comments