From 2aff0953039d35246e80b5f32553c1b63060e2ef Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 14:28:40 -0400 Subject: [PATCH 1/8] Bump Versions Forward --- .github/workflows/test.yml | 2 +- setup.py | 4 +- .../whisper_word_level/hf_whisper.py | 70 ++++++++++++++++--- 3 files changed, 63 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1edcfe8..272dea9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -63,7 +63,7 @@ jobs: - run: python test/test_transcribe.py load_faster_whisper - run: python test/test_align.py load_faster_whisper - run: python test/test_refine.py load_faster_whisper - - run: pip3 install .["hf"] 'transformers<=4.46.3' + - run: pip3 install .["hf"] - run: python test/test_transcribe.py load_hf_whisper - run: python test/test_align.py load_hf_whisper - run: python test/test_refine.py load_hf_whisper diff --git a/setup.py b/setup.py index 30b9f51..aeefdbf 100644 --- a/setup.py +++ b/setup.py @@ -28,14 +28,14 @@ def read_me() -> str: "torch", "torchaudio", "tqdm", - "openai-whisper>=20230314,<=20240930" + "openai-whisper>=20250625" ], extras_require={ "fw": [ "faster-whisper" ], "hf": [ - "transformers>=4.23.0", + "transformers>=4.49", "optimum", "accelerate" ], diff --git a/stable_whisper/whisper_word_level/hf_whisper.py b/stable_whisper/whisper_word_level/hf_whisper.py index 8de045e..9bab37a 100644 --- a/stable_whisper/whisper_word_level/hf_whisper.py +++ b/stable_whisper/whisper_word_level/hf_whisper.py @@ -63,28 +63,37 @@ def get_device(device: str = None) -> str: def load_hf_pipe(model_name: str, device: str = None, flash: bool = False, **pipeline_kwargs): from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline + from transformers.configuration_utils import PretrainedConfig device = get_device(device) is_cpu = (device if isinstance(device, str) else getattr(device, 'type', None)) == 'cpu' dtype = torch.float32 if is_cpu or not torch.cuda.is_available() else torch.float16 model_id = HF_MODELS.get(model_name, model_name) + + if flash: + config = PretrainedConfig( + attn_implementation="flash_attention_2", + ) + else: + config = None + model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=dtype, low_cpu_mem_usage=True, use_safetensors=True, - use_flash_attention_2=flash + config=config ).to(device) processor = AutoProcessor.from_pretrained(model_id) - if not flash: - try: - model = model.to_bettertransformer() - except (ValueError, ImportError) as e: - import warnings - warnings.warn( - f'Failed convert model to BetterTransformer due to: {e}' - ) + # if not flash: + # try: + # model = model.to_bettertransformer() + # except (ValueError, ImportError) as e: + # import warnings + # warnings.warn( + # f'Failed convert model to BetterTransformer due to: {e}' + # ) final_pipe_kwargs = dict( task="automatic-speech-recognition", @@ -92,9 +101,10 @@ def load_hf_pipe(model_name: str, device: str = None, flash: bool = False, **pip tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, - chunk_length_s=30, + # chunk_length_s=30, torch_dtype=dtype, device=device, + return_language=True ) final_pipe_kwargs.update(**pipeline_kwargs) pipe = pipeline(**final_pipe_kwargs) @@ -106,6 +116,7 @@ class WhisperHF: def __init__(self, model_name: str, device: str = None, flash: bool = False, pipeline=None, **pipeline_kwargs): self._model_name = model_name + pipeline_kwargs['return_language'] = True self._pipe = load_hf_pipe(self._model_name, device, flash=flash, **pipeline_kwargs) if pipeline is None \ else pipeline self._model_name = getattr(self._pipe.model, 'name_or_path', self._model_name) @@ -154,6 +165,45 @@ def _inner_transcribe( language = 'en' if not language and result and 'language' in result[0]: language = result[0]['language'] + if not language and hasattr(output, 'get') and 'detected_language' in output: + language = output['detected_language'] + if not language: + # Use the pipeline's language detection by accessing the generated tokens + try: + # Get the raw generated tokens from the model + import torch + sample_audio = audio[:int(self.sampling_rate * 10)] # Use first 10 seconds + inputs = self._pipe.feature_extractor(sample_audio, sampling_rate=self.sampling_rate, return_tensors="pt") + + # Ensure input features match model dtype and device + model_dtype = next(self._pipe.model.parameters()).dtype + model_device = next(self._pipe.model.parameters()).device + inputs.input_features = inputs.input_features.to(dtype=model_dtype, device=model_device) + + # Generate with minimal tokens to detect language + with torch.no_grad(): + generated_ids = self._pipe.model.generate( + inputs.input_features, + max_new_tokens=10, + do_sample=False, + output_scores=True, + return_dict_in_generate=True + ) + + # Decode the tokens to extract language information + tokens = self._pipe.tokenizer.batch_decode(generated_ids.sequences, skip_special_tokens=False)[0] + + # Extract language token (format: <|en|>, <|fr|>, etc.) + import re + lang_match = re.search(r'<\|(\w{2})\|>', tokens) + if lang_match: + language = lang_match.group(1) + else: + language = None + + except Exception as e: + print(f'Error detecting language: {e}') + language = None if verbose is not None: print(f'Transcription completed.') From cf98c8d5567f35d0164ddfbab6156852a3edc1ca Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 14:30:07 -0400 Subject: [PATCH 2/8] Extend tests --- .github/workflows/test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 272dea9..3e1c694 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,6 +3,8 @@ on: push: branches: - main + - forward + - backward pull_request: branches: - main From b5f611eeb740eee531bf4e5f5fe60c790b6bde63 Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 14:34:32 -0400 Subject: [PATCH 3/8] Remove 3.8 from the compatibility matrix --- .github/workflows/test.yml | 12 ------------ setup.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3e1c694..029b5a4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,18 +15,6 @@ jobs: strategy: matrix: include: - - python-version: '3.8' - pytorch-version: 1.10.1 - numpy-requirement: "'numpy<2'" - tokenizers-requirement: "'tokenizers<=0.20.3'" - - python-version: '3.8' - pytorch-version: 1.13.1 - numpy-requirement: "'numpy<2'" - tokenizers-requirement: "'tokenizers<=0.20.3'" - - python-version: '3.8' - pytorch-version: 2.0.1 - numpy-requirement: "'numpy<2'" - tokenizers-requirement: "'tokenizers<=0.20.3'" - python-version: '3.9' pytorch-version: 2.1.2 numpy-requirement: "'numpy<2'" diff --git a/setup.py b/setup.py index aeefdbf..c8fcd3a 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def read_me() -> str: description="Modifies OpenAI's Whisper to produce more reliable timestamps.", long_description=read_me(), long_description_content_type='text/markdown', - python_requires=">=3.8", + python_requires=">=3.9", author="Jian", url="https://github.com/jianfch/stable-ts", license="MIT", From e71ac1e21f7e2b4d1a53ceb890bdb4d0c0c02cad Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 14:44:01 -0400 Subject: [PATCH 4/8] Add new whisper version --- stable_whisper/whisper_compatibility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/stable_whisper/whisper_compatibility.py b/stable_whisper/whisper_compatibility.py index 8cd1118..ca9efa9 100644 --- a/stable_whisper/whisper_compatibility.py +++ b/stable_whisper/whisper_compatibility.py @@ -16,6 +16,7 @@ '20231117', '20240927', '20240930', + '20250625', ) _required_whisper_ver = _COMPATIBLE_WHISPER_VERSIONS[-1] From d30c74fcff2cfa7172bb42d179ffb1458c5a4da6 Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 15:01:19 -0400 Subject: [PATCH 5/8] Remove local branches from PR --- .github/workflows/test.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 029b5a4..2e2e865 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,8 +3,6 @@ on: push: branches: - main - - forward - - backward pull_request: branches: - main From 297915b75fdfca01956a44b2ad30c6861f43f6ff Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 15:03:04 -0400 Subject: [PATCH 6/8] Code cleanup --- stable_whisper/whisper_word_level/hf_whisper.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/stable_whisper/whisper_word_level/hf_whisper.py b/stable_whisper/whisper_word_level/hf_whisper.py index 9bab37a..d00671c 100644 --- a/stable_whisper/whisper_word_level/hf_whisper.py +++ b/stable_whisper/whisper_word_level/hf_whisper.py @@ -86,15 +86,6 @@ def load_hf_pipe(model_name: str, device: str = None, flash: bool = False, **pip processor = AutoProcessor.from_pretrained(model_id) - # if not flash: - # try: - # model = model.to_bettertransformer() - # except (ValueError, ImportError) as e: - # import warnings - # warnings.warn( - # f'Failed convert model to BetterTransformer due to: {e}' - # ) - final_pipe_kwargs = dict( task="automatic-speech-recognition", model=model, @@ -168,9 +159,9 @@ def _inner_transcribe( if not language and hasattr(output, 'get') and 'detected_language' in output: language = output['detected_language'] if not language: - # Use the pipeline's language detection by accessing the generated tokens + # HF Pipelines have broken language detection. + # Manually detect language by generating tokens from the first 10 seconds of the audio. try: - # Get the raw generated tokens from the model import torch sample_audio = audio[:int(self.sampling_rate * 10)] # Use first 10 seconds inputs = self._pipe.feature_extractor(sample_audio, sampling_rate=self.sampling_rate, return_tensors="pt") From 01ce23adffa6c64eb8e558c003e06efd57fa569a Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 18:32:07 -0400 Subject: [PATCH 7/8] Add back 3.8 but limit test matrix --- .github/workflows/test.yml | 24 +++++++++++++++++-- setup.py | 2 +- .../whisper_word_level/hf_whisper.py | 9 ------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 029b5a4..97d19b0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -4,7 +4,6 @@ on: branches: - main - forward - - backward pull_request: branches: - main @@ -15,30 +14,51 @@ jobs: strategy: matrix: include: + - python-version: '3.8' + pytorch-version: 1.10.1 + numpy-requirement: "'numpy<2'" + tokenizers-requirement: "'tokenizers<=0.20.3'" + transformers-requirement: "'transformers==4.46.3'" + - python-version: '3.8' + pytorch-version: 1.13.1 + numpy-requirement: "'numpy<2'" + tokenizers-requirement: "'tokenizers<=0.20.3'" + transformers-requirement: "'transformers==4.46.3'" + - python-version: '3.8' + pytorch-version: 2.0.1 + numpy-requirement: "'numpy<2'" + tokenizers-requirement: "'tokenizers<=0.20.3'" + transformers-requirement: "'transformers==4.46.3'" - python-version: '3.9' pytorch-version: 2.1.2 numpy-requirement: "'numpy<2'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" - python-version: '3.10' pytorch-version: 2.2.2 numpy-requirement: "'numpy<2'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" - python-version: '3.11' pytorch-version: 2.3.1 numpy-requirement: "'numpy'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" - python-version: '3.12' pytorch-version: 2.4.1 numpy-requirement: "'numpy'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" - python-version: '3.12' pytorch-version: 2.5.0 numpy-requirement: "'numpy'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" - python-version: '3.12' pytorch-version: 2.6.0 numpy-requirement: "'numpy'" tokenizers-requirement: "'tokenizers'" + transformers-requirement: "'transformers'" steps: - uses: conda-incubator/setup-miniconda@v3 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} @@ -53,7 +73,7 @@ jobs: - run: python test/test_transcribe.py load_faster_whisper - run: python test/test_align.py load_faster_whisper - run: python test/test_refine.py load_faster_whisper - - run: pip3 install .["hf"] + - run: pip3 install .["hf"] ${{ matrix.transformers-requirement }} - run: python test/test_transcribe.py load_hf_whisper - run: python test/test_align.py load_hf_whisper - run: python test/test_refine.py load_hf_whisper diff --git a/setup.py b/setup.py index c8fcd3a..08038a1 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ def read_me() -> str: "faster-whisper" ], "hf": [ - "transformers>=4.49", + "transformers>=4.23.0", "optimum", "accelerate" ], diff --git a/stable_whisper/whisper_word_level/hf_whisper.py b/stable_whisper/whisper_word_level/hf_whisper.py index 9bab37a..19ba36f 100644 --- a/stable_whisper/whisper_word_level/hf_whisper.py +++ b/stable_whisper/whisper_word_level/hf_whisper.py @@ -86,15 +86,6 @@ def load_hf_pipe(model_name: str, device: str = None, flash: bool = False, **pip processor = AutoProcessor.from_pretrained(model_id) - # if not flash: - # try: - # model = model.to_bettertransformer() - # except (ValueError, ImportError) as e: - # import warnings - # warnings.warn( - # f'Failed convert model to BetterTransformer due to: {e}' - # ) - final_pipe_kwargs = dict( task="automatic-speech-recognition", model=model, From 361eced09dc49c2f21fb45d3b0a614acc99c365e Mon Sep 17 00:00:00 2001 From: Metric Void Date: Mon, 14 Jul 2025 18:33:42 -0400 Subject: [PATCH 8/8] relax setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 08038a1..afd6f4b 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def read_me() -> str: description="Modifies OpenAI's Whisper to produce more reliable timestamps.", long_description=read_me(), long_description_content_type='text/markdown', - python_requires=">=3.9", + python_requires=">=3.8", author="Jian", url="https://github.com/jianfch/stable-ts", license="MIT",