Skip to content

Commit 409a691

Browse files
Prevent timestamp restoration when clip timestamps are provided in batched inference (#1376)
1 parent 00a5b26 commit 409a691

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

faster_whisper/transcribe.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,23 +418,34 @@ def transcribe(
418418
"Set 'vad_filter' to True or provide 'clip_timestamps'."
419419
)
420420

421+
clip_timestamps_provided = False
421422
audio_chunks, chunks_metadata = collect_chunks(
422423
audio, clip_timestamps, max_duration=chunk_length
423424
)
424425

425426
else:
427+
clip_timestamps_provided = True
426428
clip_timestamps = [
427429
{k: int(v * sampling_rate) for k, v in segment.items()}
428430
for segment in clip_timestamps
429431
]
430432

431433
audio_chunks, chunks_metadata = [], []
432-
for clip in clip_timestamps:
434+
for i, clip in enumerate(clip_timestamps):
433435
audio_chunks.append(audio[clip["start"] : clip["end"]])
436+
437+
clip_duration = (clip["end"] - clip["start"]) / sampling_rate
438+
if clip_duration > 30:
439+
self.model.logger.warning(
440+
"Segment %d is longer than 30 seconds, "
441+
"only the first 30 seconds will be transcribed",
442+
i,
443+
)
444+
434445
chunks_metadata.append(
435446
{
436447
"offset": clip["start"] / sampling_rate,
437-
"duration": (clip["end"] - clip["start"]) / sampling_rate,
448+
"duration": clip_duration,
438449
"segments": [clip],
439450
}
440451
)
@@ -559,7 +570,10 @@ def transcribe(
559570
options,
560571
log_progress,
561572
)
562-
segments = restore_speech_timestamps(segments, clip_timestamps, sampling_rate)
573+
if not clip_timestamps_provided:
574+
segments = restore_speech_timestamps(
575+
segments, clip_timestamps, sampling_rate
576+
)
563577

564578
return segments, info
565579

tests/test_transcribe.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,26 @@ def test_cliptimestamps_segments(jfk_path):
290290
" And so my fellow Americans ask not what your country can do for you, "
291291
"ask what you can do for your country."
292292
)
293+
294+
295+
def test_cliptimestamps_timings(physcisworks_path):
    """Check that batched inference keeps user-supplied clip boundaries.

    When explicit ``clip_timestamps`` are passed, each returned segment's
    start/end must equal the corresponding clip's start/end (i.e. no
    timestamp restoration is applied on top), and the transcript text
    must match the expected output for each clip.
    """
    model = WhisperModel("tiny")
    pipeline = BatchedInferencePipeline(model=model)

    audio = decode_audio(physcisworks_path)
    clips = [{"start": 0.0, "end": 5.0}, {"start": 6.0, "end": 15.0}]
    expected_texts = [
        " Now I want to return to the conservation of mechanical energy.",
        (
            " I have here a pendulum. I have an object that weighs 15 kilograms"
            " and I can lift it up one meter, which I have done now."
        ),
    ]
    segments, info = pipeline.transcribe(audio, clip_timestamps=clips)
    segments = list(segments)

    assert len(segments) == 2
    # Each segment should line up one-to-one with its clip and transcript.
    for segment, clip, expected in zip(segments, clips, expected_texts):
        assert segment.start == clip["start"]
        assert segment.end == clip["end"]
        assert segment.text == expected

0 commit comments

Comments
 (0)