Skip to content

Commit 9623ce7

Browse files
authored
Merge pull request #710 from Purfview/patch-3
Fix and refine the use_max_poss_sil_at_max_speech argument
2 parents b6dd059 + b15a216 commit 9623ce7

File tree

1 file changed

+44
-27
lines changed

1 file changed

+44
-27
lines changed

src/silero_vad/utils_vad.py

Lines changed: 44 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -346,53 +346,70 @@ def get_speech_timestamps(audio: torch.Tensor,
346346
possible_ends = []
347347

348348
for i, speech_prob in enumerate(speech_probs):
349+
cur_sample = window_size_samples * i
350+
351+
# If speech returns after a temp_end, record candidate silence if long enough and clear temp_end
349352
if (speech_prob >= threshold) and temp_end:
350-
if temp_end != 0:
351-
sil_dur = (window_size_samples * i) - temp_end
352-
if sil_dur > min_silence_samples_at_max_speech:
353-
possible_ends.append((temp_end, sil_dur))
354-
temp_end = 0
353+
sil_dur = cur_sample - temp_end
354+
if sil_dur > min_silence_samples_at_max_speech:
355+
possible_ends.append((temp_end, sil_dur))
356+
temp_end = 0
355357
if next_start < prev_end:
356-
next_start = window_size_samples * i
358+
next_start = cur_sample
357359

360+
# Start of speech
358361
if (speech_prob >= threshold) and not triggered:
359362
triggered = True
360-
current_speech['start'] = window_size_samples * i
363+
current_speech['start'] = cur_sample
361364
continue
362365

363-
if triggered and (window_size_samples * i) - current_speech['start'] > max_speech_samples:
364-
if possible_ends:
365-
if use_max_poss_sil_at_max_speech:
366-
prev_end, dur = max(possible_ends, key=lambda x: x[1]) # use the longest possible silence segment in the current speech chunk
367-
else:
368-
prev_end, dur = possible_ends[-1] # use the last possible silence segment
366+
# Max speech length reached: decide where to cut
367+
if triggered and (cur_sample - current_speech['start'] > max_speech_samples):
368+
if use_max_poss_sil_at_max_speech and possible_ends:
369+
prev_end, dur = max(possible_ends, key=lambda x: x[1]) # use the longest possible silence segment in the current speech chunk
369370
current_speech['end'] = prev_end
370371
speeches.append(current_speech)
371372
current_speech = {}
372373
next_start = prev_end + dur
373-
if next_start < prev_end + window_size_samples * i: # previously reached silence (< neg_thres) and is still not speech (< thres)
374-
#triggered = False
374+
375+
if next_start < prev_end + cur_sample: # previously reached silence (< neg_thres) and is still not speech (< thres)
375376
current_speech['start'] = next_start
376377
else:
377378
triggered = False
378-
#current_speech['start'] = next_start
379379
prev_end = next_start = temp_end = 0
380380
possible_ends = []
381381
else:
382-
current_speech['end'] = window_size_samples * i
383-
speeches.append(current_speech)
384-
current_speech = {}
385-
prev_end = next_start = temp_end = 0
386-
triggered = False
387-
possible_ends = []
388-
continue
382+
# Legacy max-speech cut (use_max_poss_sil_at_max_speech=False, or no candidate silences were recorded): prefer last valid silence (prev_end) if available
383+
if prev_end:
384+
current_speech['end'] = prev_end
385+
speeches.append(current_speech)
386+
current_speech = {}
387+
if next_start < prev_end:
388+
triggered = False
389+
else:
390+
current_speech['start'] = next_start
391+
prev_end = next_start = temp_end = 0
392+
possible_ends = []
393+
else:
394+
# No prev_end -> fallback to cutting at current sample
395+
current_speech['end'] = cur_sample
396+
speeches.append(current_speech)
397+
current_speech = {}
398+
prev_end = next_start = temp_end = 0
399+
triggered = False
400+
possible_ends = []
401+
continue
389402

403+
# Silence detection while in speech
390404
if (speech_prob < neg_threshold) and triggered:
391405
if not temp_end:
392-
temp_end = window_size_samples * i
393-
# if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech: # condition to avoid cutting in very short silence
394-
# prev_end = temp_end
395-
if (window_size_samples * i) - temp_end < min_silence_samples:
406+
temp_end = cur_sample
407+
sil_dur_now = cur_sample - temp_end
408+
409+
if not use_max_poss_sil_at_max_speech and sil_dur_now > min_silence_samples_at_max_speech: # condition to avoid cutting in very short silence
410+
prev_end = temp_end
411+
412+
if sil_dur_now < min_silence_samples:
396413
continue
397414
else:
398415
current_speech['end'] = temp_end

0 commit comments

Comments
 (0)