Skip to content

Commit 9f903c7

Browse files
committed
clearer implementation of remove_non_speech with respect to audio without speech
1 parent bdee5d3 commit 9f903c7

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

whisper_timestamped/transcribe.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
__author__ = "Jérôme Louradour"
44
__credits__ = ["Jérôme Louradour"]
55
__license__ = "GPLv3"
6-
__version__ = "1.15.1"
6+
__version__ = "1.15.2"
77

88
# Set some environment variables
99
import os
@@ -277,9 +277,9 @@ def transcribe_timestamped(
277277
compression_ratio_threshold=compression_ratio_threshold,
278278
)
279279

280-
if vad:
280+
if vad is not None:
281281
audio = get_audio_tensor(audio)
282-
audio, vad_segments, convert_timestamps = remove_non_speech(audio, method=vad, sample_rate=SAMPLE_RATE, plot=plot_word_alignment)
282+
audio, vad_segments, convert_timestamps = remove_non_speech(audio, method=vad, sample_rate=SAMPLE_RATE, plot=plot_word_alignment, avoid_empty_speech=True)
283283
else:
284284
vad_segments = None
285285

@@ -1856,8 +1856,8 @@ def check_vad_method(method, with_version=False):
18561856
"""
18571857
if method in [True, "True", "true"]:
18581858
return check_vad_method("silero") # default method
1859-
elif method in [False, "False", "false"]:
1860-
return False
1859+
elif method in [None, False, "False", "false", "None", "none"]:
1860+
return None
18611861
elif not isinstance(method, str) and hasattr(method, '__iter__'):
18621862
# list of explicit timestamps
18631863
checked_pairs = []
@@ -2063,6 +2063,7 @@ def remove_non_speech(audio,
20632063
dilatation=0.5,
20642064
sample_rate=SAMPLE_RATE,
20652065
method="silero",
2066+
avoid_empty_speech=False,
20662067
plot=False,
20672068
):
20682069
"""
@@ -2083,6 +2084,8 @@ def remove_non_speech(audio,
20832084
how much (in sec) to enlarge each speech segment detected by the VAD
20842085
method: str
20852086
method to use to remove non-speech segments
2087+
avoid_empty_speech: bool
2088+
if True, avoid returning an empty speech segment (when no speech is detected, the whole audio is returned as a single segment)
20862089
plot: bool or str
20872090
if True, plot the result.
20882091
If a string, save the plot to the given file
@@ -2100,7 +2103,10 @@ def remove_non_speech(audio,
21002103

21012104
segments = [(seg["start"], seg["end"]) for seg in segments]
21022105
if len(segments) == 0:
2103-
segments = [(0, audio.shape[-1])]
2106+
if avoid_empty_speech:
2107+
segments = [(0, audio.shape[-1])]
2108+
else:
2109+
return torch.Tensor([]), [], lambda t, t2 = None: do_convert_timestamps(segments, t, t2)
21042110

21052111
audio_speech = torch.cat([audio[..., s:e] for s,e in segments], dim=-1)
21062112

@@ -2121,7 +2127,7 @@ def remove_non_speech(audio,
21212127
if not use_sample:
21222128
segments = [(float(s)/sample_rate, float(e)/sample_rate) for s,e in segments]
21232129

2124-
return audio_speech, segments, lambda t, t2 = None: do_convert_timestamps(segments, t, t2)
2130+
return audio_speech, segments, lambda t, t2 = None: t if t2 is None else [t, t2]
21252131

21262132
def do_convert_timestamps(segments, t, t2 = None):
21272133
"""

0 commit comments

Comments
 (0)