@@ -173,6 +173,7 @@ def align(
173173 """
174174 model = as_vanilla (model )
175175 is_faster_model = model .__module__ .startswith ('faster_whisper.' )
176+ is_mlx_model = model .__module__ .startswith ('mlx_whisper.' )
176177 if not is_faster_model :
177178 warn_compatibility_issues (whisper , ignore_compatibility )
178179 max_token_step = (model .max_length if is_faster_model else model .dims .n_text_ctx ) - 6
@@ -185,7 +186,7 @@ def align(
185186
186187 options = AllOptions (options , vanilla_align = not is_faster_model )
187188 split_words_by_space = getattr (tokenizer , 'language_code' , tokenizer .language ) not in {"zh" , "ja" , "th" , "lo" , "my" }
188- model_type = 'fw' if is_faster_model else None
189+ model_type = 'fw' if is_faster_model else 'mlx' if is_mlx_model else None
189190 inference_func = get_whisper_alignment_func (model , tokenizer , model_type , options )
190191
191192 aligner = Aligner (
@@ -336,14 +337,15 @@ def align_words(
336337 """
337338 model = as_vanilla (model )
338339 is_faster_model = model .__module__ .startswith ('faster_whisper.' )
340+ is_mlx_model = model .__module__ .startswith ('mlx_whisper.' )
339341 if not is_faster_model :
340342 warn_compatibility_issues (whisper , ignore_compatibility )
341343 tokenizer , supported_languages = get_alignment_tokenizer (model , is_faster_model , result , language , tokenizer )
342344
343345 options = AllOptions (options )
344346 split_words_by_space = getattr (tokenizer , 'language_code' , tokenizer .language ) not in {"zh" , "ja" , "th" , "lo" , "my" }
345347 max_segment_tokens = model .max_length if is_faster_model else model .dims .n_text_ctx
346- inference_func = get_whisper_alignment_func (model , tokenizer , 'fw' if is_faster_model else None , options )
348+ inference_func = get_whisper_alignment_func (model , tokenizer , 'fw' if is_faster_model else 'mlx' if is_mlx_model else None , options )
347349
348350 aligner = Aligner (
349351 inference_func = inference_func ,
@@ -393,7 +395,7 @@ def get_whisper_alignment_func(
393395 model_type : Optional [str ] = None ,
394396 options : Optional [AllOptions ] = None
395397):
396- assert model_type in (None , 'fw' )
398+ assert model_type in (None , 'fw' , 'mlx' )
397399
398400 if model_type is None :
399401 def compute_timestamps (audio_segment : torch .Tensor , word_tokens : List [WordToken ]) -> List [dict ]:
@@ -421,6 +423,53 @@ def compute_timestamps(audio_segment: torch.Tensor, word_tokens: List[WordToken]
421423 )
422424 return [w for seg in temp_segments for w in seg ['words' ]]
423425
426+ elif model_type == 'mlx' :
427+ from mlx_whisper .audio import (
428+ N_FRAMES as MLX_N_FRAMES ,
429+ SAMPLE_RATE as MLX_SAMPLE_RATE ,
430+ log_mel_spectrogram as log_mel_spectrogram_mx ,
431+ pad_or_trim as pad_or_trim_mx
432+ )
433+ import mlx .core as mx
434+ import mlx_whisper .timing as timing
435+
436+ def compute_timestamps (audio_segment_torch : torch .Tensor , word_tokens : List [WordToken ]) -> List [dict ]:
437+ audio_segment_np = audio_segment_torch .squeeze ().numpy ().astype ('float32' )
438+ audio_segment_mx = mx .array (audio_segment_np )
439+
440+ segment_samples = audio_segment_mx .shape [- 1 ]
441+
442+ temp_segment = dict (
443+ seek = 0 ,
444+ start = 0.0 ,
445+ end = round (segment_samples / MLX_SAMPLE_RATE , 3 ),
446+ tokens = [t for wt in word_tokens for t in wt .tokens ],
447+ words = []
448+ )
449+
450+ mel_segments_raw = log_mel_spectrogram_mx (
451+ audio = np .array (audio_segment_mx ),
452+ n_mels = model .dims .n_mels ,
453+ padding = 0
454+ )
455+
456+ num_frames_unpadded = mel_segments_raw .shape [0 ]
457+
458+ mel_segments_padded_time = pad_or_trim_mx (mel_segments_raw .T , MLX_N_FRAMES )
459+
460+ mel_segments_nlc = mel_segments_padded_time .T
461+
462+ timing .add_word_timestamps (
463+ segments = [temp_segment ],
464+ model = model ,
465+ tokenizer = tokenizer ,
466+ mel = mel_segments_nlc ,
467+ num_frames = num_frames_unpadded ,
468+ last_speech_timestamp = 0.0
469+ )
470+
471+ return temp_segment .get ('words' , [])
472+
424473 else :
425474 from .whisper_compatibility import is_faster_whisper_on_pt
426475 from faster_whisper .version import __version__ as fw_ver
@@ -548,6 +597,7 @@ def refine(
548597 """
549598 model = as_vanilla (model )
550599 is_faster_model = model .__module__ .startswith ('faster_whisper.' )
600+ is_mlx_model = model .__module__ .startswith ('mlx_whisper.' )
551601 if result and (not result .has_words or any (word .probability is None for word in result .all_words ())):
552602 if not result .language :
553603 raise RuntimeError (f'cannot align words with result missing language' )
@@ -558,7 +608,7 @@ def refine(
558608 word .tokens = tokenizer .encode (word .word )
559609
560610 options = AllOptions (options , post = False , silence = False , align = False )
561- model_type = 'fw' if is_faster_model else None
611+ model_type = 'fw' if is_faster_model else 'mlx' if is_mlx_model else None
562612 inference_func = get_whisper_refinement_func (model , tokenizer , model_type , single_batch )
563613 max_inference_tokens = (model .max_length if is_faster_model else model .dims .n_text_ctx ) - 6
564614
@@ -588,7 +638,7 @@ def get_whisper_refinement_func(
588638 model_type : Optional [str ] = None ,
589639 single_batch : bool = False
590640):
591- assert model_type in (None , 'fw' )
641+ assert model_type in (None , 'fw' , 'mlx' )
592642
593643 if model_type is None :
594644 def inference_func (audio_segment : torch , tokens : List [int ]) -> torch .Tensor :
@@ -616,6 +666,56 @@ def inference_func(audio_segment: torch, tokens: List[int]) -> torch.Tensor:
616666 token_probs = sampled_logits .softmax (dim = - 1 )
617667 return token_probs
618668
669+ elif model_type == 'mlx' :
670+ from mlx_whisper .audio import (
671+ N_FRAMES as N_FRAMES_MLX ,
672+ log_mel_spectrogram as log_mel_spectrogram_mx ,
673+ pad_or_trim as pad_or_trim_mx
674+ )
675+ import mlx .core as mx
676+
677+ def inference_func (audio_batch_torch : torch .Tensor , tokens : List [int ]) -> torch .Tensor :
678+ input_tokens_mx = mx .array (
679+ [
680+ * tokenizer .sot_sequence ,
681+ tokenizer .no_timestamps ,
682+ * tokens ,
683+ tokenizer .eot ,
684+ ]
685+ )
686+
687+ audio_batch_np = audio_batch_torch .numpy ().astype ('float32' )
688+ audio_batch_mx = mx .array (audio_batch_np )
689+
690+ mel_list = []
691+ for audio_segment_mx in audio_batch_mx :
692+ mel_raw = log_mel_spectrogram_mx (audio = np .array (audio_segment_mx ), n_mels = model .dims .n_mels , padding = 0 )
693+
694+ mel_transposed = mel_raw .T
695+
696+ mel_padded_cl = pad_or_trim_mx (mel_transposed , N_FRAMES_MLX )
697+
698+ mel_nlc = mel_padded_cl .T
699+ mel_list .append (mel_nlc )
700+
701+ mel_batch_nlc = mx .stack (mel_list , axis = 0 )
702+
703+ logits_list = []
704+ for single_mel_nlc in mel_batch_nlc :
705+ logits_single = model (single_mel_nlc [None ], input_tokens_mx [None ])
706+ logits_list .append (logits_single )
707+
708+ logits = mx .concatenate (logits_list , axis = 0 )
709+
710+ sot_len = len (tokenizer .sot_sequence )
711+ start_idx = sot_len
712+ end_idx = start_idx + len (tokens )
713+
714+ sampled_logits = logits [:, start_idx :end_idx , :tokenizer .eot ]
715+ token_probs = mx .softmax (sampled_logits , axis = - 1 )
716+
717+ token_probs_np = np .array (token_probs , copy = True )
718+ return torch .from_numpy (token_probs_np )
619719 else :
620720 from .whisper_compatibility import is_faster_whisper_on_pt
621721 from faster_whisper .version import __version__ as fw_ver
0 commit comments