33__author__ = "Jérôme Louradour"
44__credits__ = ["Jérôme Louradour" ]
55__license__ = "GPLv3"
6- __version__ = "1.15.1 "
6+ __version__ = "1.15.2 "
77
88# Set some environment variables
99import os
@@ -277,9 +277,9 @@ def transcribe_timestamped(
277277 compression_ratio_threshold = compression_ratio_threshold ,
278278 )
279279
280- if vad :
280+ if vad is not None :
281281 audio = get_audio_tensor (audio )
282- audio , vad_segments , convert_timestamps = remove_non_speech (audio , method = vad , sample_rate = SAMPLE_RATE , plot = plot_word_alignment )
282+ audio , vad_segments , convert_timestamps = remove_non_speech (audio , method = vad , sample_rate = SAMPLE_RATE , plot = plot_word_alignment , avoid_empty_speech = True )
283283 else :
284284 vad_segments = None
285285
@@ -1856,8 +1856,8 @@ def check_vad_method(method, with_version=False):
18561856 """
18571857 if method in [True , "True" , "true" ]:
18581858 return check_vad_method ("silero" ) # default method
1859- elif method in [False , "False" , "false" ]:
1860- return False
1859+ elif method in [None , False , "False" , "false" , "None" , "none " ]:
1860+ return None
18611861 elif not isinstance (method , str ) and hasattr (method , '__iter__' ):
18621862 # list of explicit timestamps
18631863 checked_pairs = []
@@ -2063,6 +2063,7 @@ def remove_non_speech(audio,
20632063 dilatation = 0.5 ,
20642064 sample_rate = SAMPLE_RATE ,
20652065 method = "silero" ,
2066+ avoid_empty_speech = False ,
20662067 plot = False ,
20672068 ):
20682069 """
@@ -2083,6 +2084,8 @@ def remove_non_speech(audio,
20832084 how much (in sec) to enlarge each speech segment detected by the VAD
20842085 method: str
20852086 method to use to remove non-speech segments
2087+ avoid_empty_speech: bool
2088 if True, avoid returning an empty speech segment: when no speech is detected, the whole audio is kept as a single segment
20862089 plot: bool or str
20872090 if True, plot the result.
20882091 If a string, save the plot to the given file
@@ -2100,7 +2103,10 @@ def remove_non_speech(audio,
21002103
21012104 segments = [(seg ["start" ], seg ["end" ]) for seg in segments ]
21022105 if len (segments ) == 0 :
2103- segments = [(0 , audio .shape [- 1 ])]
2106+ if avoid_empty_speech :
2107+ segments = [(0 , audio .shape [- 1 ])]
2108+ else :
2109+ return torch .Tensor ([]), [], lambda t , t2 = None : do_convert_timestamps (segments , t , t2 )
21042110
21052111 audio_speech = torch .cat ([audio [..., s :e ] for s ,e in segments ], dim = - 1 )
21062112
@@ -2121,7 +2127,7 @@ def remove_non_speech(audio,
21212127 if not use_sample :
21222128 segments = [(float (s )/ sample_rate , float (e )/ sample_rate ) for s ,e in segments ]
21232129
2124- return audio_speech , segments , lambda t , t2 = None : do_convert_timestamps ( segments , t , t2 )
2130+ return audio_speech , segments , lambda t , t2 = None : t if t2 is None else [ t , t2 ]
21252131
21262132def do_convert_timestamps (segments , t , t2 = None ):
21272133 """
0 commit comments