@@ -346,53 +346,70 @@ def get_speech_timestamps(audio: torch.Tensor,
346346 possible_ends = []
347347
348348 for i , speech_prob in enumerate (speech_probs ):
349+ cur_sample = window_size_samples * i
350+
351+ # If speech returns after a temp_end, record candidate silence if long enough and clear temp_end
349352 if (speech_prob >= threshold ) and temp_end :
350- if temp_end != 0 :
351- sil_dur = (window_size_samples * i ) - temp_end
352- if sil_dur > min_silence_samples_at_max_speech :
353- possible_ends .append ((temp_end , sil_dur ))
354- temp_end = 0
353+ sil_dur = cur_sample - temp_end
354+ if sil_dur > min_silence_samples_at_max_speech :
355+ possible_ends .append ((temp_end , sil_dur ))
356+ temp_end = 0
355357 if next_start < prev_end :
356- next_start = window_size_samples * i
358+ next_start = cur_sample
357359
360+ # Start of speech
358361 if (speech_prob >= threshold ) and not triggered :
359362 triggered = True
360- current_speech ['start' ] = window_size_samples * i
363+ current_speech ['start' ] = cur_sample
361364 continue
362365
363- if triggered and (window_size_samples * i ) - current_speech ['start' ] > max_speech_samples :
364- if possible_ends :
365- if use_max_poss_sil_at_max_speech :
366- prev_end , dur = max (possible_ends , key = lambda x : x [1 ]) # use the longest possible silence segment in the current speech chunk
367- else :
368- prev_end , dur = possible_ends [- 1 ] # use the last possible silence segment
366+ # Max speech length reached: decide where to cut
367+ if triggered and (cur_sample - current_speech ['start' ] > max_speech_samples ):
368+ if use_max_poss_sil_at_max_speech and possible_ends :
369+ prev_end , dur = max (possible_ends , key = lambda x : x [1 ]) # use the longest possible silence segment in the current speech chunk
369370 current_speech ['end' ] = prev_end
370371 speeches .append (current_speech )
371372 current_speech = {}
372373 next_start = prev_end + dur
373- if next_start < prev_end + window_size_samples * i : # previously reached silence (< neg_thres) and is still not speech (< thres)
374- #triggered = False
374+
375+ if next_start < prev_end + cur_sample : # previously reached silence (< neg_thres) and is still not speech (< thres)
375376 current_speech ['start' ] = next_start
376377 else :
377378 triggered = False
378- #current_speech['start'] = next_start
379379 prev_end = next_start = temp_end = 0
380380 possible_ends = []
381381 else :
382- current_speech ['end' ] = window_size_samples * i
383- speeches .append (current_speech )
384- current_speech = {}
385- prev_end = next_start = temp_end = 0
386- triggered = False
387- possible_ends = []
388- continue
382+ # Legacy max-speech cut (use_max_poss_sil_at_max_speech=False): prefer last valid silence (prev_end) if available
383+ if prev_end :
384+ current_speech ['end' ] = prev_end
385+ speeches .append (current_speech )
386+ current_speech = {}
387+ if next_start < prev_end :
388+ triggered = False
389+ else :
390+ current_speech ['start' ] = next_start
391+ prev_end = next_start = temp_end = 0
392+ possible_ends = []
393+ else :
394+ # No prev_end -> fallback to cutting at current sample
395+ current_speech ['end' ] = cur_sample
396+ speeches .append (current_speech )
397+ current_speech = {}
398+ prev_end = next_start = temp_end = 0
399+ triggered = False
400+ possible_ends = []
401+ continue
389402
403+ # Silence detection while in speech
390404 if (speech_prob < neg_threshold ) and triggered :
391405 if not temp_end :
392- temp_end = window_size_samples * i
393- # if ((window_size_samples * i) - temp_end) > min_silence_samples_at_max_speech: # condition to avoid cutting in very short silence
394- # prev_end = temp_end
395- if (window_size_samples * i ) - temp_end < min_silence_samples :
406+ temp_end = cur_sample
407+ sil_dur_now = cur_sample - temp_end
408+
409+ if not use_max_poss_sil_at_max_speech and sil_dur_now > min_silence_samples_at_max_speech : # condition to avoid cutting in very short silence
410+ prev_end = temp_end
411+
412+ if sil_dur_now < min_silence_samples :
396413 continue
397414 else :
398415 current_speech ['end' ] = temp_end
0 commit comments