Add workflow for faster-whisper (ctranslate2)

entn-at · entn-at · commit 21a23cb761ba · 2023-04-05T22:32:14.000-07:00
diff --git a/lhotse/bin/modes/workflows.py b/lhotse/bin/modes/workflows.py
@@ -114,6 +114,121 @@ def annotate_with_whisper(
             writer.write(cut, flush=True)
 
 
+@workflows.command()
+@click.argument("out_cuts", type=click.Path(allow_dash=True))
+@click.option(
+    "-m",
+    "--recordings-manifest",
+    type=click.Path(exists=True, dir_okay=False, allow_dash=True),
+    help="Path to an existing recording manifest.",
+)
+@click.option(
+    "-r",
+    "--recordings-dir",
+    type=click.Path(exists=True, file_okay=False),
+    help="Directory with recordings. We will create a RecordingSet for it automatically.",
+)
+@click.option(
+    "-c",
+    "--cuts-manifest",
+    type=click.Path(exists=True, dir_okay=False, allow_dash=True),
+    help="Path to an existing cuts manifest.",
+)
+@click.option(
+    "-e",
+    "--extension",
+    default="wav",
+    help="Audio file extension to search for. Used with RECORDINGS_DIR.",
+)
+@click.option(
+    "-n",
+    "--model-name",
+    default="base",
+    help="One of Whisper variants (base, medium, large, etc.)",
+)
+@click.option(
+    "-l",
+    "--language",
+    help="Language spoken in the audio. Inferred by default.",
+)
+@click.option(
+    "-d", "--device", default="cpu", help="Device on which to run the inference."
+)
+@click.option(
+    "--device-index", default=0, help="Device index on which to run the inference."
+)
+@click.option(
+    "--cpu-threads", default=0, help="Number of threads to use when running on CPU."
+)
+@click.option(
+    "--num-workers", default=1, help="Number of workers for parallelizing across multiple GPUs."
+)
+@click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
+@click.option(
+    "--force-nonoverlapping/--keep-overlapping",
+    default=False,
+    help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
+)
+def annotate_with_faster_whisper(
+    out_cuts: str,
+    recordings_manifest: Optional[str],
+    recordings_dir: Optional[str],
+    cuts_manifest: Optional[str],
+    extension: str,
+    model_name: str,
+    language: Optional[str],
+    device: str,
+    device_index: int,
+    cpu_threads: int,
+    num_workers: int,
+    jobs: int,
+    force_nonoverlapping: bool,
+):
+    """
+    Use faster-whisper to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
+    It will perform automatic segmentation, transcription, and language identification.
+
+    RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
+    is provided, its supervisions will be overwritten with the results of the inference.
+
+    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
+    high quality of data.
+    """
+    from lhotse import annotate_with_faster_whisper as annotate_with_whisper_
+
+    assert exactly_one_not_null(recordings_manifest, recordings_dir, cuts_manifest), (
+        "Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
+        "and at least one is required."
+    )
+
+    if recordings_manifest is not None:
+        manifest = RecordingSet.from_file(recordings_manifest)
+    elif recordings_dir is not None:
+        manifest = RecordingSet.from_dir(
+            recordings_dir, pattern=f"*.{extension}", num_jobs=jobs
+        )
+    else:
+        manifest = CutSet.from_file(cuts_manifest).to_eager()
+
+    annotated_cuts = annotate_with_whisper_(
+        manifest,
+        language=language,
+        model_name=model_name,
+        device=device,
+        device_index=device_index,
+        force_nonoverlapping=force_nonoverlapping,
+        # CTranslate2 rejects float16 on CPU (the default device); use its default there.
+        compute_type="default" if device == "cpu" else "float16",
+        cpu_threads=cpu_threads,
+        num_workers=num_workers,
+    )
+    with CutSet.open_writer(out_cuts) as writer:
+        for cut in tqdm(
+            annotated_cuts, total=len(manifest), desc="Annotating with faster-whisper"
+        ):
+            writer.write(cut, flush=True)
+
+
 @workflows.command()
 @click.argument(
     "in_cuts", type=click.Path(exists=True, dir_okay=False, allow_dash=True)
diff --git a/lhotse/workflows/__init__.py b/lhotse/workflows/__init__.py
@@ -1,3 +1,4 @@
 from .forced_alignment import align_with_torchaudio
 from .meeting_simulation import *
 from .whisper import annotate_with_whisper
+from .faster_whisper import annotate_with_faster_whisper
diff --git a/lhotse/workflows/faster_whisper.py b/lhotse/workflows/faster_whisper.py
@@ -0,0 +1,247 @@
+import logging
+from typing import Any, Generator, List, Optional, Union
+
+import numpy as np
+
+from lhotse import (
+    CutSet,
+    MonoCut,
+    Recording,
+    RecordingSet,
+    SupervisionSegment,
+    add_durations,
+)
+from lhotse.qa import trim_supervisions_to_recordings
+from lhotse.utils import fastcopy, is_module_available
+from lhotse.supervision import AlignmentItem
+
+
+def annotate_with_faster_whisper(
+    manifest: Union[RecordingSet, CutSet],
+    model_name: str = "base",
+    device: str = "cpu",
+    device_index: int = 0,
+    force_nonoverlapping: bool = False,
+    compute_type: str = "default",
+    cpu_threads: int = 0,
+    num_workers: int = 1,
+    **decode_options,
+) -> Generator[MonoCut, None, None]:
+    """
+    Use faster-whisper (a CTranslate2 implementation of OpenAI Whisper) to annotate a
+    ``RecordingSet`` or a ``CutSet`` with automatic segmentation, transcription, and
+    language identification. If the first argument is a CutSet, it will overwrite the
+    supervisions with the results of the inference.
+
+    This is a generator function: nothing is executed (not even the dependency check)
+    until the returned generator is iterated.
+
+    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
+    high quality of data.
+
+    See the original repo for more details: https://github.com/guillaumekln/faster-whisper
+
+    :param manifest: a ``RecordingSet`` or ``CutSet`` object.
+    :param model_name: one of available Whisper variants (base, medium, large, etc.).
+    :param device: Where to run the inference (cpu, cuda, etc.).
+    :param device_index: index of the device to run the inference on.
+    :param force_nonoverlapping: if True, the Whisper segment time-stamps will be processed to make
+        sure they are non-overlapping.
+    :param compute_type: computation type handed to ``faster_whisper.WhisperModel``.
+    :param cpu_threads: number of CPU threads handed to ``faster_whisper.WhisperModel``.
+    :param num_workers: number of workers handed to ``faster_whisper.WhisperModel``.
+    :param decode_options: additional options to pass to ``WhisperModel.transcribe``, e.g.
+        ``language`` to fix the spoken language instead of auto-detecting it.
+    :return: a generator of cuts (use ``CutSet.open_writer()`` to write them).
+    :raises ValueError: (while iterating) if ``manifest`` is neither a RecordingSet nor a CutSet.
+    """
+    assert is_module_available("faster_whisper"), (
+        "This function expects faster-whisper to be installed. "
+        "You can install it via 'pip install faster-whisper' "
+        "(see https://github.com/guillaumekln/faster-whisper/ for details)."
+    )
+
+    # Both helpers take the same arguments, so choose one and forward everything once.
+    if isinstance(manifest, RecordingSet):
+        annotate = _annotate_recordings
+    elif isinstance(manifest, CutSet):
+        annotate = _annotate_cuts
+    else:
+        raise ValueError("The ``manifest`` must be either a RecordingSet or a CutSet.")
+
+    yield from annotate(
+        manifest,
+        model_name,
+        device,
+        device_index,
+        force_nonoverlapping,
+        compute_type=compute_type,
+        cpu_threads=cpu_threads,
+        num_workers=num_workers,
+        **decode_options,
+    )
+
+
+def _annotate_recordings(
+    recordings: RecordingSet,
+    model_name: str,
+    device: str,
+    device_index: int,
+    force_nonoverlapping: bool,
+    compute_type: str = "default",
+    cpu_threads: int = 0,
+    num_workers: int = 1,
+    **decode_options,
+):
+    """
+    Helper function that annotates a RecordingSet with Whisper.
+
+    Multi-channel recordings are skipped with a warning; every other recording is
+    resampled to 16 kHz, transcribed, and yielded as a cut with the new supervisions.
+    """
+    from faster_whisper import WhisperModel
+
+    whisper = WhisperModel(
+        model_name,
+        device=device,
+        device_index=device_index,
+        compute_type=compute_type,
+        cpu_threads=cpu_threads,
+        num_workers=num_workers,
+    )
+
+    for recording in recordings:
+        if recording.num_channels > 1:
+            logging.warning(
+                f"Skipping recording '{recording.id}'. It has {recording.num_channels} channels, "
+                f"but we currently only support mono input."
+            )
+            continue
+        samples = np.squeeze(recording.resample(16000).load_audio())
+        segments, info = whisper.transcribe(
+            audio=samples, word_timestamps=True, vad_filter=True, **decode_options
+        )
+        supervisions = []
+        for segment_id, segment in enumerate(segments):
+            # Skip segments whose duration is not strictly positive.
+            if not (segment.end - segment.start > 0):
+                continue
+            words = [
+                AlignmentItem(
+                    symbol=ws.word.strip(),
+                    start=ws.start,
+                    duration=(ws.end - ws.start),
+                    score=ws.probability,
+                )
+                for ws in segment.words
+            ]
+            supervision = SupervisionSegment(
+                id=f"{recording.id}-{segment_id:06d}",
+                recording_id=recording.id,
+                start=round(segment.start, ndigits=8),
+                duration=add_durations(
+                    segment.end, -segment.start, sampling_rate=16000
+                ),
+                text=segment.text.strip(),
+                language=info.language,
+            )
+            supervisions.append(supervision.with_alignment("word", words))
+        cut = recording.to_cut()
+        if supervisions:
+            if force_nonoverlapping:
+                supervisions = _postprocess_timestamps(supervisions)
+            trimmed = trim_supervisions_to_recordings(
+                recordings=recording, supervisions=supervisions, verbose=False
+            )
+            cut.supervisions = list(trimmed)
+        yield cut
+
+
+def _annotate_cuts(
+    cuts: CutSet,
+    model_name: str,
+    device: str,
+    device_index: int,
+    force_nonoverlapping: bool,
+    download_root: Optional[str] = None,
+    compute_type: str = "default",
+    cpu_threads: int = 0,
+    num_workers: int = 1,
+    **decode_options,
+):
+    """
+    Helper function that annotates a CutSet with Whisper (multi-channel cuts are skipped).
+
+    :param download_root: unused here; kept so that existing call sites stay valid.
+    :param compute_type, cpu_threads, num_workers: forwarded to ``WhisperModel``.
+    :param decode_options: extra keyword arguments for ``WhisperModel.transcribe``.
+    """
+    from faster_whisper import WhisperModel
+
+    model = WhisperModel(
+        model_name,
+        device=device,
+        device_index=device_index,
+        compute_type=compute_type,
+        cpu_threads=cpu_threads,
+        num_workers=num_workers,
+    )
+
+    for cut in cuts:
+        if cut.num_channels > 1:
+            logging.warning(
+                f"Skipping cut '{cut.id}'. It has {cut.num_channels} channels, "
+                f"but we currently only support mono input."
+            )
+            continue
+        audio = np.squeeze(cut.resample(16000).load_audio())
+        segments, info = model.transcribe(audio=audio, word_timestamps=True, **decode_options)
+        supervisions = []
+        for segment_id, segment in enumerate(segments):
+            # Clip the end to the cut boundary before filtering, so a segment starting
+            # at/after the cut end is dropped rather than given a non-positive duration.
+            end = min(segment.end, cut.duration)
+            if end <= segment.start:
+                continue
+            words = [
+                AlignmentItem(
+                    symbol=ws.word.strip(),
+                    start=ws.start,
+                    duration=ws.end - ws.start,
+                    score=ws.probability,
+                )
+                for ws in segment.words
+            ]
+            supervision = SupervisionSegment(
+                id=f"{cut.id}-{segment_id:06d}",
+                recording_id=cut.recording_id,
+                start=round(segment.start, ndigits=8),
+                duration=add_durations(end, -segment.start, sampling_rate=16000),
+                text=segment.text.strip(),
+                language=info.language,
+            )
+            supervisions.append(supervision.with_alignment("word", words))
+        if force_nonoverlapping:
+            supervisions = _postprocess_timestamps(supervisions)
+        yield fastcopy(cut, supervisions=supervisions)
+
+
+def _postprocess_timestamps(supervisions: List[SupervisionSegment]):
+    """
+    Remove overlaps between consecutive Whisper segments. Whisper's end timestamps are often
+    inaccurate, so, under the strong assumption that the speech itself is non-overlapping,
+    each segment is truncated at the start of the segment that follows it.
+    """
+    from cytoolz import sliding_window
+
+    ordered = sorted(supervisions, key=lambda sup: sup.start)
+    if len(ordered) < 2:
+        return ordered
+    # Walk over (segment, successor) pairs and cut back any segment that runs into the next.
+    trimmed = [
+        cur.trim(end=nxt.start) if cur.end > nxt.start else cur
+        for cur, nxt in sliding_window(2, ordered)
+    ]
+    # The final segment has no successor, so it is kept as-is.
+    trimmed.append(ordered[-1])
+    return trimmed