added alignment and refinement support for HF models

jianfch · jianfch · commit 751b041ae256 · 2025-03-22T16:42:41.000-04:00
-added full alignment and refinement support for Hugging Face models
diff --git a/README.md b/README.md
@@ -504,17 +504,22 @@ Docstring:
 
 
 <details>
-<summary>Hugging Face Transformers (~9x faster)</summary>
+<summary>Hugging Face Transformers</summary>
 
-Run Whisper up to 9x faster with [Hugging Face Transformer](https://huggingface.co/openai/whisper-large-v3):
+Transcribe up to 9x faster with [Hugging Face Transformers](https://huggingface.co/openai/whisper-large-v3):
 ```
 pip install -U stable-ts[hf]
 ```
-* [Alignment](#alignment) and [Refinement](#refinement) are not supported on Hugging Face models
+
 ```python
 model = stable_whisper.load_hf_whisper('base')
 result = model.transcribe('audio.mp3')
 ```
+Also supports the [various Whisper versions on Hugging Face](https://huggingface.co/models?other=whisper&sort=downloads) by passing the model ID:
+```python
+model = stable_whisper.load_hf_whisper('openai/whisper-base.en')
+```
+
 
 <details>
 <summary>CLI</summary>
diff --git a/stable_whisper/alignment.py b/stable_whisper/alignment.py
@@ -14,7 +14,7 @@
 
 from .whisper_compatibility import (
     SAMPLE_RATE, N_FRAMES, N_FFT, pad_or_trim, log_mel_spectrogram, FRAMES_PER_SECOND, CHUNK_LENGTH, N_SAMPLES,
-    median_filter, DecodingTask, DecodingOptions, SuppressTokens, whisper, TOKENS_PER_SECOND
+    median_filter, DecodingTask, DecodingOptions, SuppressTokens, whisper, TOKENS_PER_SECOND, as_vanilla
 )
 
 if TYPE_CHECKING:
@@ -171,6 +171,7 @@ def align(
     >>> result.to_srt_vtt('helloword.srt')
     Saved 'helloworld.srt'
     """
+    model = as_vanilla(model)
     is_faster_model = model.__module__.startswith('faster_whisper.')
     if not is_faster_model:
         warn_compatibility_issues(whisper, ignore_compatibility)
@@ -333,6 +334,7 @@ def align_words(
     >>> result = [dict(start=0.0, end=0.5, text='hello world 1'), dict(start=0.5, end=1.0, text='hello world 2')]
     >>> result = model.align_words('audio.mp3', result, 'English')
     """
+    model = as_vanilla(model)
     is_faster_model = model.__module__.startswith('faster_whisper.')
     if not is_faster_model:
         warn_compatibility_issues(whisper, ignore_compatibility)
@@ -544,6 +546,7 @@ def refine(
     >>> result.to_srt_vtt('audio.srt')
     Saved 'audio.srt'
     """
+    model = as_vanilla(model)
     if result:
         if not result.has_words:
             if not result.language:
diff --git a/stable_whisper/timing.py b/stable_whisper/timing.py
@@ -82,6 +82,8 @@ def _compute_atten_weights(
     if cache['qks'] is None:
         _compute_qks(model, tokenizer, text_tokens, mel, tokens, cache)
     QKs = cache['qks']
+    if getattr(model, 'missing_alignment_heads', False) and not dynamic_heads_count:
+        dynamic_heads_count = 6
     if dynamic_heads_count:
         max_qk_len = round(num_samples / N_SAMPLES_PER_TOKEN)
         if not cache.get('is_processed_qks'):
diff --git a/stable_whisper/whisper_compatibility.py b/stable_whisper/whisper_compatibility.py
@@ -67,7 +67,7 @@ def _dummy_contextmanager():
     from whisper.tokenizer import get_tokenizer as get_whisper_tokenizer
 
     from whisper.tokenizer import Tokenizer
-    from whisper.model import Whisper
+    from whisper.model import Whisper, ModelDimensions, LayerNorm
     from whisper.decoding import DecodingTask, DecodingOptions, DecodingResult, SuppressTokens
     try:
         from whisper.model import disable_sdpa
@@ -90,7 +90,9 @@ def _dummy_contextmanager():
 
     log_mel_spectrogram = median_filter = dtw = merge_punctuations = get_whisper_tokenizer \
         = whisper_not_available
-    Tokenizer = Whisper = DecodingTask = DecodingOptions = DecodingResult = SuppressTokens = Unavailable
+    Tokenizer = Whisper = ModelDimensions = LayerNorm = \
+        DecodingTask = DecodingOptions = DecodingResult = SuppressTokens \
+        = Unavailable
     LANGUAGES = {
         "en": "english",
         "zh": "chinese",
@@ -330,3 +332,20 @@ def get_tokenizer(model=None, is_faster_model: bool = False, **kwargs):
         del kwargs['num_languages']
     kwargs['language'] = get_valid_language(kwargs.get('language'), is_faster_model, model)
     return tokenizer(**kwargs)
+
+
+def as_vanilla(model):
+    return model.as_vanilla_model() if hasattr(model, 'as_vanilla_model') else model
+
+
+def ln_to_fp32(module):
+    """
+    Recursively convert the weight and (if present) bias of every LayerNorm in ``module`` to float32, in place.
+    """
+    for child in module.children():
+        if isinstance(child, LayerNorm):
+            child.weight.data = child.weight.data.float()
+            if child.bias is not None:
+                child.bias.data = child.bias.data.float()
+        else:
+            ln_to_fp32(child)
diff --git a/stable_whisper/whisper_word_level/hf_whisper.py b/stable_whisper/whisper_word_level/hf_whisper.py
@@ -7,6 +7,8 @@
 from ..non_whisper import transcribe_any
 from ..utils import isolate_useful_options
 
+from ..alignment import align, align_words, refine
+
 
 HF_MODELS = {
     "tiny.en": "openai/whisper-tiny.en",
@@ -25,6 +27,29 @@
     "turbo": "openai/whisper-large-v3-turbo"
 }
 
+WHISPER_TO_HF_MAPPING = {
+    "blocks": "layers",
+    "mlp.0": "fc1",
+    "mlp.2": "fc2",
+    "mlp_ln": "final_layer_norm",
+    ".attn.query": ".self_attn.q_proj",
+    ".attn.key": ".self_attn.k_proj",
+    ".attn.value": ".self_attn.v_proj",
+    ".attn_ln": ".self_attn_layer_norm",
+    ".attn.out": ".self_attn.out_proj",
+    ".cross_attn.query": ".encoder_attn.q_proj",
+    ".cross_attn.key": ".encoder_attn.k_proj",
+    ".cross_attn.value": ".encoder_attn.v_proj",
+    ".cross_attn_ln": ".encoder_attn_layer_norm",
+    ".cross_attn.out": ".encoder_attn.out_proj",
+    "decoder.ln.": "decoder.layer_norm.",
+    "encoder.ln.": "encoder.layer_norm.",
+    "token_embedding": "embed_tokens",
+    "encoder.positional_embedding": "encoder.embed_positions.weight",
+    "decoder.positional_embedding": "decoder.embed_positions.weight",
+    "ln_post": "layer_norm",
+}
+
 
 def get_device(device: str = None) -> str:
     if device:
@@ -81,6 +106,7 @@ def __init__(self, model_name: str, device: str = None, flash: bool = False, pip
         self._pipe = load_hf_pipe(self._model_name, device, flash=flash, **pipeline_kwargs) if pipeline is None \
             else pipeline
         self._model_name = getattr(self._pipe.model, 'name_or_path', self._model_name)
+        self._vanilla_model = None
 
     @property
     def sampling_rate(self):
@@ -263,6 +289,70 @@ def transcribe(
             **transcribe_any_options
         )
 
+    def as_vanilla_model(self):
+        """
+        Return a vanilla Whisper model instance with current weights.
+
+        The new instance is only loaded once. Most weights share the same memory as this Hugging Face model instance.
+        """
+        if self._vanilla_model is not None:
+            return self._vanilla_model
+
+        from ..whisper_compatibility import ModelDimensions, Whisper, ln_to_fp32
+        from .original_whisper import modify_model
+        try:
+            from transformers.models.whisper.convert_openai_to_hf import WHISPER_MAPPING
+            whisper2hf_mapping = WHISPER_MAPPING
+        except (ImportError, ModuleNotFoundError):
+            whisper2hf_mapping = WHISPER_TO_HF_MAPPING
+
+        hf_mapping = {v: k for k, v in whisper2hf_mapping.items()}
+        assert len(whisper2hf_mapping) == len(hf_mapping)
+
+        state_dict = self._pipe.model.model.state_dict()
+        config = self._pipe.model.config
+
+        if 'encoder.layer_norm.' in hf_mapping:
+            hf_mapping['encoder.layer_norm.'] = 'encoder.ln_post.'
+        for key in list(state_dict.keys()):
+            new_key = key
+            for k, v in hf_mapping.items():
+                if k in key:
+                    new_key = new_key.replace(k, v)
+            if new_key != key:
+                state_dict[new_key] = state_dict.pop(key)
+
+        dims = ModelDimensions(
+            n_mels=config.num_mel_bins,
+            n_audio_ctx=config.max_source_positions,
+            n_audio_state=config.d_model,
+            n_audio_head=config.encoder_attention_heads,
+            n_audio_layer=config.encoder_layers,
+            n_vocab=config.vocab_size,
+            n_text_ctx=config.max_target_positions,
+            n_text_state=self._pipe.model.model.decoder.embed_positions.embedding_dim,
+            n_text_head=config.decoder_attention_heads,
+            n_text_layer=config.decoder_layers
+        )
+        new_model = Whisper(dims)
+        if alignment_heads := getattr(self._pipe.model.generation_config, 'alignment_heads', None):
+            alignment_heads = torch.as_tensor(alignment_heads).T
+            final_heads = torch.zeros(new_model.dims.n_text_layer, new_model.dims.n_text_head, dtype=torch.bool)
+            final_heads[alignment_heads[0], alignment_heads[1]] = True
+            new_model.register_buffer("alignment_heads", final_heads.to_sparse(), persistent=False)
+        else:
+            setattr(new_model, 'missing_alignment_heads', True)
+        new_model.load_state_dict(state_dict, strict=True, assign=True)
+        new_model.to(device=self._pipe.model.device)
+        ln_to_fp32(new_model)
+        modify_model(new_model)
+        self._vanilla_model = new_model
+        return self._vanilla_model
+
+    align = align
+    align_words = align_words
+    refine = refine
+
 
 def load_hf_whisper(model_name: str, device: str = None, flash: bool = False, pipeline=None, **pipeline_kwargs):
     return WhisperHF(model_name, device, flash=flash, pipeline=pipeline, **pipeline_kwargs)