Commit 55669d7

added new alignment algorithm

- added parameter, `aligner`, to `transcribe()`/`align()`/`align_words()` (only for vanilla models); `aligner="new"` uses an implementation of the new alignment algorithm (https://arxiv.org/abs/2509.09987)
- updated docstrings to reflect the new parameter

1 parent c63366e commit 55669d7
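For a quick sense of the change, a minimal usage sketch (the model size and audio path are placeholders; the dict keys mirror the keyword-only arguments of the new `_compute_atten_weights_new` function in `stable_whisper/timing.py` below, and per the commit note the parameter only applies to vanilla Whisper models):

```python
import stable_whisper

model = stable_whisper.load_model('base')

# default head selection, unchanged from previous releases
result = model.transcribe('audio.wav')

# select attention heads with the new algorithm (https://arxiv.org/abs/2509.09987)
result = model.transcribe('audio.wav', aligner='new')

# a dict also enables the new algorithm and is forwarded as its keyword arguments
result = model.transcribe('audio.wav', aligner=dict(topk=20, w_colnorm=1, w_rownorm=1))
```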

File tree

6 files changed (+149, -27 lines)

README.md

Lines changed: 6 additions & 0 deletions

@@ -261,6 +261,8 @@ Docstrings:
     word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
     To specify number of iterations for finding the optimal heads,
     use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+aligner : "legacy" or "new" or dict, default "legacy"
+    Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.
 clip_timestamps : str or list of float
     Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
     The last end timestamp defaults to the end of the file.
@@ -1007,6 +1009,8 @@ Docstring:
     word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
     To specify number of iterations for finding the optimal heads,
     use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+aligner : "legacy" or "new" or dict, default "legacy"
+    Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.

 Returns
 -------
@@ -1126,6 +1130,8 @@ Docstring:
     word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
     To specify number of iterations for finding the optimal heads,
     use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+aligner : "legacy" or "new" or dict, default "legacy"
+    Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.
 normalize_text : bool or dict, default True
     Whether to normalize text of each segment.
 inplace : bool, default True
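The `dynamic_heads` parameter that the new `aligner` entry sits next to accepts the forms the docstring describes; a short illustrative sketch (the audio path is a placeholder):

```python
# assuming `model` was loaded via stable_whisper.load_model(...)
model.transcribe('audio.wav', dynamic_heads=True)   # runtime head search with the default of 6 heads
model.transcribe('audio.wav', dynamic_heads=8)      # search with 8 heads
model.transcribe('audio.wav', dynamic_heads='8,3')  # 8 heads, 3 iterations of the search
```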

stable_whisper/alignment.py

Lines changed: 6 additions & 1 deletion

@@ -149,6 +149,8 @@ def align(
         word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
         To specify number of iterations for finding the optimal heads,
         use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+    aligner : "legacy" or "new" or dict, default "legacy"
+        Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.

     Returns
     -------
@@ -316,6 +318,8 @@ def align_words(
         word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
         To specify number of iterations for finding the optimal heads,
         use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+    aligner : "legacy" or "new" or dict, default "legacy"
+        Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.
     normalize_text : bool or dict, default True
         Whether to normalize text of each segment.
     inplace : bool, default True
@@ -419,7 +423,8 @@ def compute_timestamps(audio_segment: torch.Tensor, word_tokens: List[WordToken]
         append_punctuations='',
         gap_padding=None,
         extra_models=options.align.extra_models,
-        dynamic_heads=options.align.dynamic_heads
+        dynamic_heads=options.align.dynamic_heads,
+        aligner=options.align.aligner
     )
     return [w for seg in temp_segments for w in seg['words']]
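Since `compute_timestamps` now forwards `options.align.aligner`, the alignment entry points accept the parameter directly; a hedged sketch (the audio, transcript text, and language are placeholders, and `char_split` is the dict-only key that `add_word_timestamps_stable` pops in `stable_whisper/timing.py` below):

```python
# align plain text against audio using the new head-selection algorithm
result = model.align('audio.wav', 'the transcript text', language='en', aligner='new')

# dict form: new algorithm plus per-character token splitting
result = model.align_words('audio.wav', result, aligner=dict(char_split=True))
```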

stable_whisper/non_whisper/alignment.py

Lines changed: 7 additions & 0 deletions

@@ -181,6 +181,13 @@ def __init__(
         Only if ``presplit=True``, ``gap_padding`` is prepended to each segment for word timing alignment.
         Used to reduce the probability of model predicting timestamps earlier than the first utterance.
         Ignored if ``model`` is a faster-whisper model.
+    dynamic_heads : bool or int or str, optional
+        Whether to find optimal cross-attention heads during runtime instead of using the predefined heads for
+        word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
+        To specify number of iterations for finding the optimal heads,
+        use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+    aligner : "legacy" or "new" or dict, default "legacy"
+        Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.

     Notes
     -----

stable_whisper/options.py

Lines changed: 1 addition & 0 deletions

@@ -164,6 +164,7 @@ def __init__(self, **kwargs):
         self.presplit: Union[bool, List[str]] = self._pop('presplit', True)
         self.extra_models: Optional[list] = self._pop('extra_models', None)
         self.dynamic_heads: Optional[Union[bool, int, str]] = self._pop('dynamic_heads', None)
+        self.aligner: Union[str, dict] = self._pop('aligner', 'legacy')

     def to_non_vanilla(self):
         if self.extra_models:

stable_whisper/timing.py

Lines changed: 120 additions & 22 deletions

@@ -112,26 +112,86 @@ def _compute_atten_weights(
     return weights


+def _compute_atten_weights_new(
+        model: "Whisper",
+        tokenizer: "Tokenizer",
+        text_tokens: List[int],
+        mel: torch.Tensor,
+        num_samples: int,
+        tokens: torch.Tensor,
+        cache: dict,
+        medfilt_width: int = 7,
+        qk_scale: float = 1.0,
+        *,
+        topk=20,
+        w_colnorm=1,
+        w_rownorm=1,
+        w_coverage=0
+) -> torch.Tensor:
+    """
+    Implementation of https://arxiv.org/abs/2509.09987 (https://github.com/30stomercury/whisper-char-alignment).
+    """
+    if cache['qks'] is None:
+        _compute_qks(model, tokenizer, text_tokens, mel, tokens, cache)
+    weights = torch.cat(cache['qks'])
+    weights = weights[..., :round(num_samples / N_SAMPLES_PER_TOKEN)]
+    weights = median_filter(weights, medfilt_width)
+    weights = (weights * qk_scale).softmax(dim=-1)
+
+    n_layers = weights.size(0)
+    n_heads = weights.size(1)
+    score_matrix = torch.zeros(n_layers, n_heads, device=weights.device)
+    if w_colnorm > 0:
+        col_norm_sum = weights.norm(dim=-2).sum(-1)
+        score_matrix += w_colnorm * col_norm_sum
+    if w_rownorm > 0:
+        row_norm_sum = weights.norm(dim=-1).sum(-1)
+        score_matrix += w_rownorm * row_norm_sum
+    if w_coverage > 0:
+        coverage = torch.sum(weights, dim=2)
+        penalty = torch.max(coverage, coverage.clone().fill_(0.5)).sum(-1)
+        penalty = penalty - coverage.size(-1) * 0.5
+        penalty = w_coverage * penalty
+        score_matrix -= penalty
+
+    top_idxs = score_matrix.flatten().topk(topk).indices
+    matrix = weights[top_idxs // n_heads, top_idxs % n_heads]
+    col_norm = matrix.norm(dim=-2, keepdim=True)
+    matrix = torch.mean(matrix / col_norm, 0)
+    matrix = matrix[len(tokenizer.sot_sequence):-1]
+
+    return matrix
+
+
 def _compute_jump_indices(
         model: "Whisper",
         cache: dict,
         extra_models: List["Whisper"] = None,
+        new: bool = False,
         **kwargs
 ):
-    weights = _compute_atten_weights(model, cache=cache, **kwargs)
-    if extra_models:
-        extra_weights = [weights]
-        for mi, other_model in enumerate(extra_models):
-            m = _compute_atten_weights(other_model, cache=cache['extra_caches'][mi], **kwargs)
-            extra_weights.append(m)
-        weights = torch.cat(extra_weights, dim=0)
-        extra_text_token_probs = [c['text_token_probs'] for c in cache['extra_caches']] + [cache['text_token_probs']]
-        cache['text_token_probs'] = torch.tensor(
-            extra_text_token_probs,
-            device=extra_weights[0].device
-        ).mean(dim=0).tolist()
-
-    matrix = weights.mean(dim=0)
+    if new:
+        weights = _compute_atten_weights_new(model, cache=cache, **kwargs)
+    else:
+        weights = _compute_atten_weights(model, cache=cache, **kwargs)
+        if extra_models:
+            extra_weights = [weights]
+            for mi, other_model in enumerate(extra_models):
+                m = _compute_atten_weights(other_model, cache=cache['extra_caches'][mi], **kwargs)
+                extra_weights.append(m)
+            weights = torch.cat(extra_weights, dim=0)
+            extra_text_token_probs = (
+                [c['text_token_probs'] for c in cache['extra_caches']] + [cache['text_token_probs']]
+            )
+            cache['text_token_probs'] = torch.tensor(
+                extra_text_token_probs,
+                device=extra_weights[0].device
+            ).mean(dim=0).tolist()

+    if new:
+        matrix = weights
+    else:
+        matrix = weights.mean(dim=0)
     text_indices, time_indices = dtw(-matrix)

     jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
@@ -153,11 +213,14 @@ def find_alignment_stable(
         token_split=None,
         audio_features: torch.Tensor = None,
         extra_models: List["Whisper"] = None,
-        dynamic_heads: Optional[Union[bool, int, str]] = None
+        dynamic_heads: Optional[Union[bool, int, str]] = None,
+        aligner: Union[str, dict] = 'legacy'
 ) -> List[WordTiming]:
     if extra_models and (invalid_model_types := set(map(type, extra_models)) - {type(model)}):
         raise NotImplementedError(f'Got unsupported model type(s): {invalid_model_types}')

+    assert isinstance(aligner, dict) or aligner in ('new', 'legacy'), f'aligner must be "new"/"legacy", got "{aligner}"'
+
     if ts_num:
         warnings.warn('``ts_num`` is deprecated and will be removed in future versions.',
                       stacklevel=2)
@@ -173,13 +236,21 @@ def find_alignment_stable(
         ]
     ).to(model.device)

+    word_tokens_orig = itk = None
     if token_split is None:
         words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
     else:
         words, word_tokens = token_split
+        if isinstance(word_tokens, dict):
+            word_tokens_orig = word_tokens['tokens_orig']
+            itk = word_tokens['ignore_tokens']
+            word_tokens = word_tokens['tokens']
+            word_tokens_orig.append([tokenizer.eot])
     words.append(tokenizer.decode([tokenizer.eot]))
     word_tokens.append([tokenizer.eot])
     word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
+    if itk:
+        word_boundaries += np.array([tk[:len(itk)] == itk for tk in word_tokens], dtype=word_boundaries.dtype)
     if dynamic_heads:
         if dynamic_heads is True:
             dynamic_heads_count = 6
@@ -203,12 +274,18 @@ def find_alignment_stable(
         tokens=tokens,
         qk_scale=qk_scale,
         medfilt_width=medfilt_width,
-        extra_models=extra_models,
-        dynamic_heads_count=dynamic_heads_count
+        extra_models=extra_models
     )
+    if aligner != 'legacy':
+        new = True
+        if isinstance(aligner, dict):
+            kwargs.update(aligner)
+    else:
+        new = False
+        kwargs['dynamic_heads_count'] = dynamic_heads_count
     cache = _new_cache(audio_features=audio_features, extras=0 if extra_models is None else len(extra_models))
     for _ in range(dynamic_iterations or 1):
-        _compute_jump_indices(cache=cache, **kwargs)
+        _compute_jump_indices(cache=cache, new=new, **kwargs)
     jump_times = cache['jump_indices'] / TOKENS_PER_SECOND
     start_times = jump_times[word_boundaries[:-1]]
     end_times = jump_times[word_boundaries[1:]]
@@ -217,6 +294,10 @@ def find_alignment_stable(
         for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
     ]

+    if word_tokens_orig is not None:
+        assert len(word_tokens) == len(word_tokens_orig)
+        word_tokens = word_tokens_orig
+
     return [
         WordTiming(word, tokens, start, end, probability)
         for word, tokens, start, end, probability in zip(
@@ -265,7 +346,8 @@ def split_word_tokens(segments: List[dict],
                       *,
                       padding: (str, int) = None,
                       split_callback: Callable = None,
-                      pad_first_seg: bool = True):
+                      pad_first_seg: bool = True,
+                      char_split: bool = False):
     if padding is not None:
         if isinstance(padding, str):
             padding = tokenizer.encode(padding)
@@ -275,6 +357,7 @@ def split_word_tokens(segments: List[dict],
     seg_indices = []
     words = []
     word_tokens = []
+    word_char_tokens = []
     for i, s in enumerate(segments):
         temp_word_tokens = [t for t in s['tokens'] if not isinstance(t, int) or t < tokenizer.eot]
         curr_words, curr_word_tokens = (
@@ -294,10 +377,18 @@ def split_word_tokens(segments: List[dict],
             words.append(None)
             word_tokens.append(padding)
         seg_indices.extend([i] * len(curr_words))
-        tokens.extend(list(chain.from_iterable(curr_word_tokens)))
+        if char_split:
+            curr_word_char_tokens = [[ct for char in word for ct in tokenizer.encode(char)] for word in curr_words]
+            word_char_tokens.extend(curr_word_char_tokens)
+            tokens.extend(list(chain.from_iterable(curr_word_char_tokens)))
+        else:
+            tokens.extend(list(chain.from_iterable(curr_word_tokens)))
         words.extend(curr_words)
         word_tokens.extend(curr_word_tokens)

+    if char_split:
+        word_tokens = dict(tokens=word_char_tokens, tokens_orig=word_tokens, ignore_tokens=tokenizer.encode(' '))
+
     return tokens, (words, word_tokens), seg_indices
@@ -333,6 +424,7 @@ def add_word_timestamps_stable(
         split_callback: Callable = None,
         gap_padding: Optional[str] = ' ...',
         pad_first_seg: bool = True,
+        aligner: Union[str, dict] = 'legacy',
         **kwargs,
 ):
     if len(segments) == 0:
@@ -347,6 +439,10 @@ def add_word_timestamps_stable(
     if append_punctuations is None:
         append_punctuations = "\"'.。,,!!??::”)]}、"

+    char_split = isinstance(aligner, dict) and aligner.pop('char_split', False)
+    if char_split:
+        gap_padding = None
+
     def align():
         for seg in segments:
             seg['words'] = []
@@ -356,15 +452,17 @@ def align():
             tokenizer,
             padding=gap_padding,
             split_callback=split_callback,
-            pad_first_seg=pad_first_seg
+            pad_first_seg=pad_first_seg,
+            char_split=char_split
         )

         alignment = find_alignment_stable(model, tokenizer, text_tokens, mel, num_samples,
                                           **kwargs,
                                           token_split=token_split,
                                           audio_features=audio_features,
                                           ts_num=ts_num,
-                                          ts_noise=ts_noise)
+                                          ts_noise=ts_noise,
+                                          aligner=aligner)
         alt_beginning_alignment = pop_empty_alignment(alignment, seg_indices)

         merge_punctuations(alignment, prepend_punctuations, append_punctuations)
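To make the scoring step of `_compute_atten_weights_new` concrete, here is a self-contained toy version of it (illustration only: the shapes and random weights are made up; the real function scores cached cross-attention weights):

```python
import torch

# toy cross-attention weights: (layers, heads, text_tokens, audio_frames), normalized over frames
weights = torch.rand(4, 6, 10, 50).softmax(dim=-1)
topk, w_colnorm, w_rownorm = 5, 1, 1

n_layers, n_heads = weights.size(0), weights.size(1)
scores = torch.zeros(n_layers, n_heads)
scores += w_colnorm * weights.norm(dim=-2).sum(-1)  # sharp columns: few tokens attend per frame
scores += w_rownorm * weights.norm(dim=-1).sum(-1)  # sharp rows: each token attends to few frames

# keep the top-k heads across all layers, column-normalize, and average them
top_idxs = scores.flatten().topk(topk).indices
matrix = weights[top_idxs // n_heads, top_idxs % n_heads]
matrix = (matrix / matrix.norm(dim=-2, keepdim=True)).mean(0)
print(matrix.shape)  # torch.Size([10, 50]); this is what dtw(-matrix) consumes in _compute_jump_indices
```

Reading the hunk above, the optional `w_coverage` term subtracts a penalty for heads whose per-frame attention mass (summed over text tokens) rises above 0.5, i.e. heads that pile attention from many tokens onto the same frames.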

stable_whisper/whisper_word_level/original_whisper.py

Lines changed: 9 additions & 4 deletions

@@ -72,6 +72,7 @@ def transcribe_stable(
         ignore_compatibility: bool = False,
         extra_models: Optional[List["Whisper"]] = None,
         dynamic_heads: Optional[Union[bool, int, str]] = None,
+        aligner: Union[str, dict] = 'legacy',
         clip_timestamps: Optional[Union[str, List[float]]] = None,
         resume: Union[WhisperResult, str, dict, list] = None,
         **decode_options) \
@@ -199,6 +200,8 @@ def transcribe_stable(
         word-timestamp extraction. Specify the number of heads or `True` for default of 6 heads.
         To specify number of iterations for finding the optimal heads,
         use string with "," to separate heads and iterations (e.g. "8,3" for 8 heads and 3 iterations).
+    aligner : "legacy" or "new" or dict, default "legacy"
+        Algorithm for selecting attention heads for alignment. Use dictionary to specify keyword arguments for 'new'.
     clip_timestamps : str or list of float
         Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
         The last end timestamp defaults to the end of the file.
@@ -644,7 +647,8 @@ def inner_transcribe():
                     split_callback=split_callback,
                     gap_padding=gap_padding,
                     extra_models=extra_models,
-                    dynamic_heads=dynamic_heads
+                    dynamic_heads=dynamic_heads,
+                    aligner=aligner
                 )

             for i in reversed(range(len(current_segments))):
@@ -673,6 +677,10 @@ def inner_transcribe():
                 fast_forward()
                 return

+            all_tokens.extend(
+                [token for segment in current_segments for token in segment["tokens"]]
+            )
+
             if segment_silence_timing is not None:
                 for seg_i, segment in enumerate(current_segments):
                     segment = Segment(**segment, ignore_unused_args=True).suppress_silence(
@@ -692,9 +700,6 @@ def inner_transcribe():
                     for i, segment in enumerate(current_segments, start=len(all_segments))
                 ]
             )
-            all_tokens.extend(
-                [token for segment in current_segments for token in segment["tokens"]]
-            )
             if not single_timestamp_ending or avg_prob_threshold:
                 segment_samples = num_samples
