Skip to content

Commit 8718b1d

Browse files
authored
Merge pull request #394 from lpscr/main
Fix the glitch effect at the beginning audio
2 parents b03e9b2 + 2f91414 commit 8718b1d

File tree

1 file changed

+22
-4
lines changed

1 file changed

+22
-4
lines changed

src/f5_tts/infer/utils_infer.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,22 @@ def load_model(
218218
return model
219219

220220

221+
def remove_silence_edges(audio, silence_threshold=-42):
    """Trim silence from the start and the end of an audio segment.

    Args:
        audio: Segment to trim. It must support slicing, ``len()`` and
            reverse iteration over slices that expose ``dBFS``
            (presumably a pydub ``AudioSegment``, where all of these work
            in milliseconds -- confirm against the caller).
        silence_threshold: Level in dBFS. It is forwarded to
            ``silence.detect_leading_silence`` for the leading edge, and
            trailing slices whose ``dBFS`` is not above it are dropped.

    Returns:
        The segment with its silent leading and trailing edges removed.
    """
    # Remove silence from the start
    non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[non_silent_start_idx:]

    # Remove silence from the end.
    # Track the end point as an integer number of milliseconds: repeatedly
    # subtracting 0.001 from a float duration accumulates rounding error,
    # and truncating it back with int() could cut one millisecond too early.
    non_silent_end_ms = len(audio)
    for ms in reversed(audio):
        if ms.dBFS > silence_threshold:
            break
        non_silent_end_ms -= 1

    return audio[:non_silent_end_ms]
235+
236+
221237
# preprocess reference audio and text
222238

223239

@@ -229,7 +245,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
229245
if clip_short:
230246
# 1. try to find long silence for clipping
231247
non_silent_segs = silence.split_on_silence(
232-
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000
248+
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
233249
)
234250
non_silent_wave = AudioSegment.silent(duration=0)
235251
for non_silent_seg in non_silent_segs:
@@ -241,7 +257,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
241257
# 2. try to find short silence for clipping if 1. failed
242258
if len(non_silent_wave) > 15000:
243259
non_silent_segs = silence.split_on_silence(
244-
aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000
260+
aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
245261
)
246262
non_silent_wave = AudioSegment.silent(duration=0)
247263
for non_silent_seg in non_silent_segs:
@@ -256,7 +272,7 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
256272
if len(aseg) > 15000:
257273
aseg = aseg[:15000]
258274
show_info("Audio is over 15s, clipping short. (3)")
259-
275+
aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
260276
aseg.export(f.name, format="wav")
261277
ref_audio = f.name
262278

@@ -473,7 +489,9 @@ def infer_batch_process(
473489

474490
def remove_silence_for_generated_wav(filename):
475491
aseg = AudioSegment.from_file(filename)
476-
non_silent_segs = silence.split_on_silence(aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500)
492+
non_silent_segs = silence.split_on_silence(
493+
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
494+
)
477495
non_silent_wave = AudioSegment.silent(duration=0)
478496
for non_silent_seg in non_silent_segs:
479497
non_silent_wave += non_silent_seg

0 commit comments

Comments
 (0)