added resume to transcribe()

jianfch · jianfch · commit 765a081137f6 · 2025-10-25T22:00:38.000-04:00
-added parameter `resume` to `transcribe()` (only for original models)

-added `--save_unfinished`/`-su`, `--resume_input`/`-ri`, `--delete_resume`/`-dr` to CLI

-updated `transcribe()` to return partially finished transcription if force stopped by `KeyboardInterrupt` (only for original models)

-updated docstring and README.md to reflect new parameters
diff --git a/README.md b/README.md
@@ -261,6 +261,12 @@ Docstrings:
         word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
         To specify number of iterations for finding the optimal heads,
         use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+    clip_timestamps : str or list of float
+        Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
+        The last end timestamp defaults to the end of the file.
+    resume : stable_whisper.result.WhisperResult or str or dict or list
+        Path/data of an unfinished transcription output to continue transcription from.
+        Use "+" as suffix of the path to resume from the end of the second-to-last segment (e.g. "output-UNFINISHED.json+").
     decode_options
         Keyword arguments to construct class:`whisper.decode.DecodingOptions` instances.
 
diff --git a/stable_whisper/result.py b/stable_whisper/result.py
@@ -938,7 +938,7 @@ def __init__(
         self.ori_dict = result.get('ori_dict') or result
         self.language = self.ori_dict.get('language')
         self._regroup_history = result.get('regroup_history', '')
-        self._nonspeech_sections = result.get('nonspeech_sections', [])
+        self._nonspeech_sections = result.get('nonspeech_sections') or []
         segments = (result.get('segments', self.ori_dict.get('segments')) or {}).copy()
         self.segments = [Segment(**s, ignore_unused_args=True) for s in segments] if segments else []
         self._forced_order = force_order
@@ -947,6 +947,7 @@ def __init__(
         self.raise_for_unsorted(check_sorted, show_unsorted)
         self.remove_no_word_segments(any(seg.has_words for seg in self.segments))
         self._ignore_special_periods = False
+        self.unfinished_start: float = result.get('unfinished', -1.0)
 
     def __getitem__(self, index: int) -> Segment:
         return self.segments[index]
@@ -1061,10 +1062,14 @@ def update_all_segs_with_words(self):
                       stacklevel=2)
         self.reassign_ids()
 
-    def update_nonspeech_sections(self, silent_starts, silent_ends):
-        self._nonspeech_sections = [
+    def update_nonspeech_sections(self, silent_starts, silent_ends, overwrite: bool = True):
+        nonspeech_sections = [
             dict(start=round(s, 3), end=round(e, 3)) for s, e in zip(silent_starts, silent_ends)
         ]
+        if overwrite:
+            self._nonspeech_sections = nonspeech_sections
+        else:
+            self._nonspeech_sections.extend(nonspeech_sections)
 
     def add_segments(
             self,
@@ -1397,7 +1402,8 @@ def to_dict(self, keep_orig: bool = True):
                     language=self.language,
                     ori_dict=ori_dict,
                     regroup_history=self._regroup_history,
-                    nonspeech_sections=self._nonspeech_sections)
+                    nonspeech_sections=self._nonspeech_sections,
+                    unfinished=self.unfinished_start)
 
     def segments_to_dicts(self, reverse_text: Union[bool, tuple] = False):
         return [s.to_dict(reverse_text=reverse_text) for s in self.segments]
diff --git a/stable_whisper/whisper_word_level/cli.py b/stable_whisper/whisper_word_level/cli.py
@@ -129,6 +129,14 @@ def url_to_path(url: str):
                         help="output filepaths(s);"
                              "if not specified, auto-named output file(s) will be saved to "
                              "[output_dir] or current dir if not specified.")
+    parser.add_argument("--save_unfinished", "-su", action='store_true',
+                        help="whether to save unfinished outputs caused by KeyboardInterrupt; "
+                             "outputs are saved as JSON with suffix '-UNFINISHED.json'")
+    parser.add_argument("--resume_input", "-ri", nargs="+", type=str,
+                        help="JSON of unfinished output filepaths(s) to continue transcription from end of last word; "
+                             "use '+' as suffix to redo the last segment (e.g 'output-UNFINISHED.json+')")
+    parser.add_argument("--delete_resume", "-dr", action='store_true',
+                        help="whether to delete file(s) from '--resume_input'/'-ri' when transcription finishes")
     parser.add_argument("--model", '-m', default="base", type=str,
                         help="name of the Whisper model to use")
     parser.add_argument("--model_dir", type=str, default=None,
@@ -439,10 +447,13 @@ def url_to_path(url: str):
     model_name: str = valid_model_name(args.pop("model"))
     model_dir: str = args.pop("model_dir")
     inputs: List[Union[str, torch.Tensor]] = args.pop("inputs")
+    resume_files: List[str] = args.pop("resume_input")
     outputs: List[str] = args.pop("output")
     output_dir: str = args.pop("output_dir")
     output_format = args.pop("output_format")
     overwrite: bool = args.pop("overwrite")
+    save_unfinished: bool = args.pop("save_unfinished")
+    delete_resume: bool = args.pop("delete_resume")
     no_stream = use_deprecated_args('no_stream', 'mel_first', pop=True, expected_default=False)
     args['stream'] = None if not no_stream else False
     if overwrite:
@@ -468,6 +479,12 @@ def url_to_path(url: str):
         from .original_whisper import load_model as load_model_func
         model_name_kwarg = dict(name=model_name)
     else:
+        if save_unfinished:
+            raise NotImplementedError('--save_unfinished is only supported on vanilla Whisper models.')
+
+        if resume_files:
+            raise NotImplementedError('--resume_input is currently only supported on vanilla Whisper models.')
+
         if is_faster_whisper:
             model_type_name = 'Faster-Whisper'
             from .faster_whisper import load_faster_whisper as load_model_func
@@ -616,6 +633,10 @@ def finalize_outputs(input_file: str, _output: str = None, _alignment: str = Non
     if args['vad'] and args['vad_onnx']:
         args['vad'] = dict(onnx=args['vad_onnx'])
 
+    if resume_files and len(inputs) != len(resume_files):
+        raise ValueError(f'--resume_input and inputs do not match in count. '
+                         f'Got {len(resume_files)} and {len(inputs)}')
+
     if debug:
         print('Input(s)  ->  Outputs(s)')
         for i, (input_audio, output_paths, alignment) in enumerate(zip(inputs, final_outputs, alignments)):
@@ -627,7 +648,11 @@ def finalize_outputs(input_file: str, _output: str = None, _alignment: str = Non
                     alignment = f' + text="{alignment}"'
                 else:
                     alignment = f' + "{alignment}"'
-            print(f'"{input_audio}"{alignment}  ->{dm_output}  {output_paths}')
+            if resume_files:
+                resume_info = f' + "{resume_files[i]}"'
+            else:
+                resume_info = ''
+            print(f'"{input_audio}"{resume_info}{alignment}  ->{dm_output}  {output_paths}')
         print('')
 
     if show_curr_task:
@@ -679,6 +704,8 @@ def _load_model():
             model = _load_model()
             args['regroup'] = False
             args['audio'] = input_audio
+            if resume_files:
+                args['resume'] = resume_files[i]
             if denoiser_outputs:
                 args['denoiser_options']['save_path'] = denoiser_outputs[i]
             transcribe_method = args.get('transcribe_method')
@@ -740,6 +767,13 @@ def _load_model():
             update_options_with_args('save_option', save_options)
             call_method_with_options(save_method, save_options)
 
+        if result.unfinished_start != -1:
+            result.save_as_json(splitext(output_paths[0])[0] + '-UNFINISHED.json')
+            break
+        elif delete_resume and 'resume' in args and os.path.isfile(args['resume']):
+            os.remove(args['resume'])
+            print(f'Removed: {os.path.abspath(args["resume"])}')
+
 
 def cli(cmd: str = None):
     cache = {}
diff --git a/stable_whisper/whisper_word_level/original_whisper.py b/stable_whisper/whisper_word_level/original_whisper.py
@@ -16,7 +16,7 @@
 from ..decode import decode_stable
 from ..stabilization import NonSpeechPredictor
 from ..timing import add_word_timestamps_stable
-from ..utils import safe_print, isolate_useful_options, update_options, exact_div
+from ..utils import safe_print, isolate_useful_options, update_options, exact_div, format_timestamp
 from ..whisper_compatibility import warn_compatibility_issues, get_tokenizer
 from ..default import get_min_word_dur, get_prepend_punctuations, get_append_punctuations
 
@@ -73,6 +73,7 @@ def transcribe_stable(
         extra_models: Optional[List["Whisper"]] = None,
         dynamic_heads: Optional[Union[bool, int, str]] = None,
         clip_timestamps: Optional[Union[str, List[float]]] = None,
+        resume: Union[WhisperResult, str, dict, list] = None,
         **decode_options) \
         -> WhisperResult:
     """
@@ -201,6 +202,9 @@ def transcribe_stable(
     clip_timestamps : str or list of float
         Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
         The last end timestamp defaults to the end of the file.
+    resume : stable_whisper.result.WhisperResult or str or dict or list
+        Path/data of an unfinished transcription output to continue transcription from.
+        Use "+" as suffix of the path to resume from the end of the second-to-last segment (e.g. "output-UNFINISHED.json+").
     decode_options
         Keyword arguments to construct class:`whisper.decode.DecodingOptions` instances.
 
@@ -436,6 +440,28 @@ def new_segment(
 
     with tqdm(total=initial_duration, unit='sec', disable=verbose is not False, desc=task.title()) as tqdm_pbar:
 
+        if resume is not None:
+            remove_last_seg = False
+            if not isinstance(resume, WhisperResult):
+                if isinstance(resume, str) and resume.endswith('+'):
+                    resume = resume[:-1]
+                    remove_last_seg = True
+                resume = WhisperResult(resume)
+            if resume and remove_last_seg:
+                del resume[-1]
+                resume.unfinished_start = -1.0
+            if resume.unfinished_start == -1.0:
+                resume_start = resume[-1].end if resume else 0.0
+            else:
+                resume_start = resume.unfinished_start
+            seek_sample = round(resume_start * SAMPLE_RATE)
+            tqdm_pbar.write(f'Resuming from {format_timestamp(resume_start)}')
+            decode_options["language"] = resume.language
+
+        interrupted_time = -1.0
+        segment_samples: int = 0
+        mel_segment: torch.Tensor = torch.zeros(0)
+
         def update_pbar(curr_total_duration=None):
             nonlocal audio_features
             audio_features = None
@@ -460,10 +486,11 @@ def fast_forward():
             update_seek()
             update_pbar()
 
-        while True:
+        def inner_transcribe():
+            nonlocal seek_sample, segment_samples, prompt_reset_since, mel_segment
             audio_segment, new_seek = audio.next_valid_chunk(seek_sample, N_SAMPLES)
             if audio_segment is None:
-                break
+                return 1
             if new_seek != seek_sample:
                 seek_sample = new_seek
                 update_pbar()
@@ -478,7 +505,7 @@ def fast_forward():
 
             if is_silent_segment:
                 fast_forward()
-                continue
+                return
 
             if nonspeech_skip and silence_preds['timings'] is not None:
                 silence_starts = silence_preds['timings'][0] - time_offset
@@ -490,7 +517,7 @@ def fast_forward():
                     if silence_starts[skip_idx] < min_word_dur or int(silence_starts[skip_idx] * SAMPLE_RATE) == 0:
                         segment_samples = round(silence_ends[skip_idx] * SAMPLE_RATE)
                         fast_forward()
-                        continue
+                        return
                     audio_segment = audio_segment[..., :int(silence_starts[skip_idx] * SAMPLE_RATE)]
                     segment_samples = audio_segment.shape[-1]
                     segment_duration = segment_samples / SAMPLE_RATE
@@ -513,7 +540,7 @@ def fast_forward():
 
                 if should_skip:
                     fast_forward()
-                    continue
+                    return
 
             current_segments = []
 
@@ -644,7 +671,7 @@ def fast_forward():
 
             if len(current_segments) == 0:
                 fast_forward()
-                continue
+                return
 
             if segment_silence_timing is not None:
                 for seg_i, segment in enumerate(current_segments):
@@ -677,8 +704,21 @@ def fast_forward():
 
             fast_forward()
 
+        while True:
+            try:
+                if inner_transcribe() is not None:
+                    break
+            except KeyboardInterrupt:
+                if all_segments:
+                    interrupted_time = all_segments[-1]['end']
+                curr_seek_time = seek_sample / SAMPLE_RATE
+                if curr_seek_time > interrupted_time:
+                    interrupted_time = curr_seek_time
+                tqdm_pbar.write(f'Interrupted at {format_timestamp(seek_sample / SAMPLE_RATE)}')
+                break
+
         # final update
-        update_pbar(seek_sample / SAMPLE_RATE)
+        update_pbar((seek_sample / SAMPLE_RATE) if interrupted_time == -1 else None)
 
     if model.device != torch.device('cpu'):
         torch.cuda.empty_cache()
@@ -696,18 +736,37 @@ def fast_forward():
         ),
         force_order=not word_timestamps
     )
-    if word_timestamps and regroup:
-        final_result.regroup(regroup)
 
     if time_scale is not None:
         final_result.rescale_time(1 / time_scale)
 
+    final_nonspeech_timings = nonspeech_predictor.nonspeech_timings if suppress_silence else None
+
+    if resume is not None:
+        if resume:
+            if final_result:
+                resume.fill_in_gaps(final_result, verbose=False)
+            if final_nonspeech_timings:
+                resume.update_nonspeech_sections(*final_nonspeech_timings, overwrite=False)
+            final_result = resume
+        else:
+            ns_starts = [sect['start'] for sect in resume.nonspeech_sections]
+            ns_ends = [sect['end'] for sect in resume.nonspeech_sections]
+            if final_nonspeech_timings:
+                ns_starts.extend(final_nonspeech_timings[0])
+                ns_ends.extend(final_nonspeech_timings[1])
+            final_result.update_nonspeech_sections(ns_starts, ns_ends, overwrite=True)
+    elif final_nonspeech_timings:
+        final_result.update_nonspeech_sections(*final_nonspeech_timings, overwrite=True)
+
+    if word_timestamps and regroup:
+        final_result.regroup(regroup)
+
+    final_result.unfinished_start = interrupted_time
+
     if len(final_result.text) == 0:
         warnings.warn(f'Failed to {task} audio. Result contains no text. ')
 
-    if suppress_silence and (final_nonspeech_timings := nonspeech_predictor.nonspeech_timings):
-        final_result.update_nonspeech_sections(*final_nonspeech_timings)
-
     return final_result