Skip to content

Commit 409a691

Browse files
Prevent timestamp restoration when clip timestamps are provided in batched inference (#1376)
1 parent 00a5b26 commit 409a691

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

faster_whisper/transcribe.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,23 +418,34 @@ def transcribe(
418418
"Set 'vad_filter' to True or provide 'clip_timestamps'."
419419
)
420420

421+
clip_timestamps_provided = False
421422
audio_chunks, chunks_metadata = collect_chunks(
422423
audio, clip_timestamps, max_duration=chunk_length
423424
)
424425

425426
else:
427+
clip_timestamps_provided = True
426428
clip_timestamps = [
427429
{k: int(v * sampling_rate) for k, v in segment.items()}
428430
for segment in clip_timestamps
429431
]
430432

431433
audio_chunks, chunks_metadata = [], []
432-
for clip in clip_timestamps:
434+
for i, clip in enumerate(clip_timestamps):
433435
audio_chunks.append(audio[clip["start"] : clip["end"]])
436+
437+
clip_duration = (clip["end"] - clip["start"]) / sampling_rate
438+
if clip_duration > 30:
439+
self.model.logger.warning(
440+
"Segment %d is longer than 30 seconds, "
441+
"only the first 30 seconds will be transcribed",
442+
i,
443+
)
444+
434445
chunks_metadata.append(
435446
{
436447
"offset": clip["start"] / sampling_rate,
437-
"duration": (clip["end"] - clip["start"]) / sampling_rate,
448+
"duration": clip_duration,
438449
"segments": [clip],
439450
}
440451
)
@@ -559,7 +570,10 @@ def transcribe(
559570
options,
560571
log_progress,
561572
)
562-
segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
573+
if not clip_timestamps_provided:
574+
segments = restore_speech_timestamps(
575+
segments, clip_timestamps, sampling_rate
576+
)
563577

564578
return segments, info
565579

tests/test_transcribe.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,26 @@ def test_cliptimestamps_segments(jfk_path):
290290
" And so my fellow Americans ask not what your country can do for you, "
291291
"ask what you can do for your country."
292292
)
293+
294+
295+
def test_cliptimestamps_timings(physcisworks_path):
    """Check that batched inference keeps user-supplied clip boundaries.

    When explicit ``clip_timestamps`` are passed, each returned segment's
    start/end must equal the corresponding clip's start/end (i.e. no
    timestamp restoration is applied on top), and the transcript text
    must match the expected output for each clip.
    """
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model=model)

    audio = decode_audio(physcisworks_path)
    clips = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
    expected_texts = [
        " Now I want to return to the conservation of mechanical energy.",
        (
            " I have here a pendulum. I have an object that weighs 15 kilograms"
            " and I can lift it up one meter, which I have done now."
        ),
    ]
    segments, info = pipeline.transcribe(audio, clip_timestamps=clips)
    segments = list(segments)

    assert len(segments) == 2
    # Each segment should line up one-to-one with its clip and transcript.
    for segment, clip, expected in zip(segments, clips, expected_texts):
        assert segment.start == clip["start"]
        assert segment.end == clip["end"]
        assert segment.text == expected

0 commit comments

Comments
 (0)