Skip to content

Commit fba061d

Browse files
authored
Merge pull request #677 from snakers4/adamnsandle
get rid of hop_size_ratio
2 parents (34dea51 + 1163135), commit fba061d

File tree

1 file changed

+8
-7
lines changed

1 file changed

+8
-7
lines changed

src/silero_vad/utils_vad.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -202,7 +202,6 @@ def get_speech_timestamps(audio: torch.Tensor,
202202
progress_tracking_callback: Callable[[float], None] = None,
203203
neg_threshold: float = None,
204204
window_size_samples: int = 512,
205-
hop_size_ratio: float = 1,
206205
min_silence_at_max_speech: float = 98,
207206
use_max_poss_sil_at_max_speech: bool = True):
208207

@@ -252,12 +251,14 @@ def get_speech_timestamps(audio: torch.Tensor,
252251
neg_threshold: float (default = threshold - 0.15)
253252
Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.
254253
254+
min_silence_at_max_speech: float (default - 98ms)
255+
Minimum silence duration in ms which is used to avoid abrupt cuts when max_speech_duration_s is reached
256+
257+
use_max_poss_sil_at_max_speech: bool (default - True)
258+
Whether to use the maximum possible silence at max_speech_duration_s or not. If not, the last silence is used.
259+
255260
window_size_samples: int (default - 512 samples)
256261
!!! DEPRECATED, DOES NOTHING !!!
257-
258-
hop_size_ratio: float (default - 1), number of samples by which the window is shifted, 1 means hop_size_samples = window_size_samples
259-
min_silence_at_max_speech: float (default - 25ms), minimum silence duration in ms which is used to avoid abrupt cuts when max_speech_duration_s is reached
260-
use_max_poss_sil_at_max_speech: bool (default - True), whether to use the maximum possible silence at max_speech_duration_s or not. If not, the last silence is used.
261262
262263
Returns
263264
----------
@@ -288,7 +289,7 @@ def get_speech_timestamps(audio: torch.Tensor,
288289
raise ValueError("Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates")
289290

290291
window_size_samples = 512 if sampling_rate == 16000 else 256
291-
hop_size_samples = int(window_size_samples * hop_size_ratio)
292+
hop_size_samples = int(window_size_samples)
292293

293294
model.reset_states()
294295
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
@@ -326,7 +327,7 @@ def get_speech_timestamps(audio: torch.Tensor,
326327
temp_end = 0 # to save potential segment end (and tolerate some silence)
327328
prev_end = next_start = 0 # to save potential segment limits in case of maximum segment size reached
328329
possible_ends = []
329-
330+
330331
for i, speech_prob in enumerate(speech_probs):
331332
if (speech_prob >= threshold) and temp_end:
332333
if temp_end != 0:

0 commit comments

Comments
 (0)