Skip to content

Commit 78a223f

Browse files
committed
fixed refine() for Faster-Whisper and HF models
-fixed `refine()` failing when word tokens are missing in input `result` (i.e. `transcribe()` outputs from Faster-Whisper and HF models) -fixed `refine()` failing when word probabilities are missing in input `result` (i.e. `transcribe()` outputs from HF models) -fixed incorrect description of alignment and refinement support for Faster-Whisper models in README.md -updated HF model transcription to always return detected language in its result
1 parent 751b041 commit 78a223f

File tree

3 files changed

+43
-25
lines changed

3 files changed

+43
-25
lines changed

README.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -376,19 +376,24 @@ Use with [Faster-Whisper](https://github.com/guillaumekln/faster-whisper):
376376
```
377377
pip install -U stable-ts[fw]
378378
```
379-
* [Refinement](#refinement) is not supported on Faster-Whisper models
380-
* [Alignment](#alignment) is slower on Faster-Whisper models than on vanilla models (i.e. ones loaded with `stable_whisper.load_model()`)
379+
* [Refinement](#refinement) is slower on Faster-Whisper models than on vanilla models (i.e. ones loaded with `stable_whisper.load_model()`)
381380
```python
382381
model = stable_whisper.load_faster_whisper('base')
383-
result = model.transcribe_stable('audio.mp3')
384-
385-
# For version 2.18.0+:
386382
result = model.transcribe('audio.mp3')
383+
384+
# For versions < 2.18.0:
385+
result = model.transcribe_stable('audio.mp3')
387386
```
388-
Note: `model.transcribe_stable()` is deprecated in 2.18.0 and will be removed in future versions.
387+
388+
<details>
389+
<summary>CLI</summary>
390+
389391
```commandline
390392
stable-ts audio.mp3 -o audio.srt -fw
391393
```
394+
395+
</details>
396+
392397
Docstring:
393398
<details>
394399
<summary>load_faster_whisper()</summary>

stable_whisper/alignment.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -547,19 +547,18 @@ def refine(
547547
Saved 'audio.srt'
548548
"""
549549
model = as_vanilla(model)
550-
if result:
551-
if not result.has_words:
552-
if not result.language:
553-
raise RuntimeError(f'cannot add word-timestamps to result with missing language')
554-
align_words(model, audio, result)
555-
elif not all(word.tokens for word in result.all_words()):
556-
tokenizer = get_tokenizer(model)
557-
for word in result.all_words():
558-
word.tokens = tokenizer.encode(word.word)
559-
tokenizer = get_tokenizer(model, language=result.language, task='transcribe')
550+
is_faster_model = model.__module__.startswith('faster_whisper.')
551+
if result and (not result.has_words or any(word.probability is None for word in result.all_words())):
552+
if not result.language:
553+
raise RuntimeError(f'cannot align words with result missing language')
554+
align_words(model, audio, result)
555+
tokenizer = get_tokenizer(model, is_faster_model=is_faster_model, language=result.language, task='transcribe')
556+
if result and not all(word.tokens for word in result.all_words()):
557+
for word in result.all_words():
558+
word.tokens = tokenizer.encode(word.word)
560559

561560
options = AllOptions(options, post=False, silence=False, align=False)
562-
model_type = 'fw' if (is_faster_model := model.__module__.startswith('faster_whisper.')) else None
561+
model_type = 'fw' if is_faster_model else None
563562
inference_func = get_whisper_refinement_func(model, tokenizer, model_type, single_batch)
564563
max_inference_tokens = (model.max_length if is_faster_model else model.dims.n_text_ctx) - 6
565564

stable_whisper/whisper_word_level/hf_whisper.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,15 @@ def _inner_transcribe(
140140
print(f'Transcribing with Hugging Face Whisper ({self.model_name})...')
141141
pipe_kwargs = dict(
142142
generate_kwargs=generate_kwargs,
143-
return_timestamps='word' if word_timestamps else True
143+
return_timestamps='word' if word_timestamps else True,
144+
return_language=True
144145
)
145146
if batch_size is not None:
146147
pipe_kwargs['batch_size'] = batch_size
147148
output = self._pipe(audio, **pipe_kwargs)
148149
result = output['chunks']
150+
if not language and result and 'language' in result[0]:
151+
language = result[0]['language']
149152
if verbose is not None:
150153
print(f'Transcription completed.')
151154

@@ -200,13 +203,24 @@ def _curr_max_end(start: float, next_idx: float) -> float:
200203
for word in result
201204
]
202205
replace_none_ts(words)
203-
return [words]
204-
segs = [
205-
dict(start=seg['timestamp'][0], end=seg['timestamp'][1], text=seg['text'])
206-
for seg in result
207-
]
208-
replace_none_ts(segs)
209-
return segs
206+
if words:
207+
segs = [
208+
dict(
209+
start=words[0]['start'],
210+
end=words[-1]['end'],
211+
text=''.join(w['word'] for w in words),
212+
words=words
213+
)
214+
]
215+
else:
216+
segs = []
217+
else:
218+
segs = [
219+
dict(start=seg['timestamp'][0], end=seg['timestamp'][1], text=seg['text'])
220+
for seg in result
221+
]
222+
replace_none_ts(segs)
223+
return dict(segments=segs, language=language)
210224

211225
def transcribe(
212226
self,

0 commit comments

Comments (0)