@@ -418,23 +418,34 @@ def transcribe(
418418 "Set 'vad_filter' to True or provide 'clip_timestamps'."
419419 )
420420
421+ clip_timestamps_provided = False
421422 audio_chunks , chunks_metadata = collect_chunks (
422423 audio , clip_timestamps , max_duration = chunk_length
423424 )
424425
425426 else :
427+ clip_timestamps_provided = True
426428 clip_timestamps = [
427429 {k : int (v * sampling_rate ) for k , v in segment .items ()}
428430 for segment in clip_timestamps
429431 ]
430432
431433 audio_chunks , chunks_metadata = [], []
432- for clip in clip_timestamps :
434+ for i , clip in enumerate ( clip_timestamps ) :
433435 audio_chunks .append (audio [clip ["start" ] : clip ["end" ]])
436+
437+ clip_duration = (clip ["end" ] - clip ["start" ]) / sampling_rate
438+ if clip_duration > 30 :
439+ self .model .logger .warning (
440+ "Segment %d is longer than 30 seconds, "
441+ "only the first 30 seconds will be transcribed" ,
442+ i ,
443+ )
444+
434445 chunks_metadata .append (
435446 {
436447 "offset" : clip ["start" ] / sampling_rate ,
437- "duration" : ( clip [ "end" ] - clip [ "start" ]) / sampling_rate ,
448+ "duration" : clip_duration ,
438449 "segments" : [clip ],
439450 }
440451 )
@@ -559,7 +570,10 @@ def transcribe(
559570 options ,
560571 log_progress ,
561572 )
562- segments = restore_speech_timestamps (segments , clip_timestamps , sampling_rate )
573+ if not clip_timestamps_provided :
574+ segments = restore_speech_timestamps (
575+ segments , clip_timestamps , sampling_rate
576+ )
563577
564578 return segments , info
565579
0 commit comments