lhotse-speech
diff --git a/‎lhotse/bin/modes/workflows.py‎
Lines changed: 33 additions & 109 deletions b/‎lhotse/bin/modes/workflows.py‎
Lines changed: 33 additions & 109 deletions
@@ -1,3 +1,4 @@
+from functools import partial
 from typing import List, Optional, Union
 
 import click
@@ -55,121 +56,37 @@ def workflows():
 @click.option(
     "-d", "--device", default="cpu", help="Device on which to run the inference."
 )
-@click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
 @click.option(
-    "--force-nonoverlapping/--keep-overlapping",
+    "--faster-whisper",
+    is_flag=True,
     default=False,
-    help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
-)
-def annotate_with_whisper(
-    out_cuts: str,
-    recordings_manifest: Optional[str],
-    recordings_dir: Optional[str],
-    cuts_manifest: Optional[str],
-    extension: str,
-    model_name: str,
-    language: Optional[str],
-    device: str,
-    jobs: int,
-    force_nonoverlapping: bool,
-):
-    """
-    Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
-    It will perform automatic segmentation, transcription, and language identification.
-
-    RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
-    is provided, its supervisions will be overwritten with the results of the inference.
-
-    Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
-    high quality of data.
-    """
-    from lhotse import annotate_with_whisper as annotate_with_whisper_
-
-    assert exactly_one_not_null(recordings_manifest, recordings_dir, cuts_manifest), (
-        "Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
-        "and at least one is required."
-    )
-
-    if recordings_manifest is not None:
-        manifest = RecordingSet.from_file(recordings_manifest)
-    elif recordings_dir is not None:
-        manifest = RecordingSet.from_dir(
-            recordings_dir, pattern=f"*.{extension}", num_jobs=jobs
-        )
-    else:
-        manifest = CutSet.from_file(cuts_manifest).to_eager()
-
-    with CutSet.open_writer(out_cuts) as writer:
-        for cut in tqdm(
-            annotate_with_whisper_(
-                manifest,
-                language=language,
-                model_name=model_name,
-                device=device,
-                force_nonoverlapping=force_nonoverlapping,
-            ),
-            total=len(manifest),
-            desc="Annotating with Whisper",
-        ):
-            writer.write(cut, flush=True)
-
-
-@workflows.command()
-@click.argument("out_cuts", type=click.Path(allow_dash=True))
-@click.option(
-    "-m",
-    "--recordings-manifest",
-    type=click.Path(exists=True, dir_okay=False, allow_dash=True),
-    help="Path to an existing recording manifest.",
-)
-@click.option(
-    "-r",
-    "--recordings-dir",
-    type=click.Path(exists=True, file_okay=False),
-    help="Directory with recordings. We will create a RecordingSet for it automatically.",
-)
-@click.option(
-    "-c",
-    "--cuts-manifest",
-    type=click.Path(exists=True, dir_okay=False, allow_dash=True),
-    help="Path to an existing cuts manifest.",
-)
-@click.option(
-    "-e",
-    "--extension",
-    default="wav",
-    help="Audio file extension to search for. Used with RECORDINGS_DIR.",
+    help="If True, use faster-whisper's implementation based on CTranslate2.",
 )
 @click.option(
-    "-n",
-    "--model-name",
-    default="base",
-    help="One of Whisper variants (base, medium, large, etc.)",
-)
-@click.option(
-    "-l",
-    "--language",
-    help="Language spoken in the audio. Inferred by default.",
-)
-@click.option(
-    "-d", "--device", default="cpu", help="Device on which to run the inference."
-)
-@click.option(
-    "--device-index", default=0, help="Device index on which to run the inference."
+    "--faster-whisper-use-vad",
+    is_flag=True,
+    default=False,
+    help="If True, use faster-whisper's built-in voice activity detection (SileroVAD).",
 )
 @click.option(
-    "--cpu-threads", default=0, help="Number of threads to use when running on CPU."
+    "--faster-whisper-add-alignments",
+    is_flag=True,
+    default=False,
+    help="If True, add word alignments using timestamps obtained using the cross-attention"
+    "pattern and dynamic time warping (Note: Less accurate than forced alignment).",
 )
 @click.option(
-    "--num-workers", default=1, help="Number of workers for parallelizing across multiple GPUs."
+    "--faster-whisper-compute-type",
+    default="float16",
+    help="Type to use for computation. See https://opennmt.net/CTranslate2/quantization.html.",
 )
 @click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
 @click.option(
     "--force-nonoverlapping/--keep-overlapping",
     default=False,
     help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
 )
-def annotate_with_faster_whisper(
+def annotate_with_whisper(
     out_cuts: str,
     recordings_manifest: Optional[str],
     recordings_dir: Optional[str],
@@ -178,9 +95,10 @@ def annotate_with_faster_whisper(
     model_name: str,
     language: Optional[str],
     device: str,
-    device_index: int,
-    cpu_threads: int,
-    num_workers: int,
+    faster_whisper: bool,
+    faster_whisper_use_vad: bool,
+    faster_whisper_compute_type: str,
+    faster_whisper_add_alignments: bool,
     jobs: int,
     force_nonoverlapping: bool,
 ):
@@ -194,7 +112,17 @@ def annotate_with_faster_whisper(
     Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
     high quality of data.
     """
-    from lhotse import annotate_with_faster_whisper as annotate_with_whisper_
+    if faster_whisper:
+        from lhotse import annotate_with_faster_whisper
+
+        annotate_with_whisper_ = partial(
+            annotate_with_faster_whisper,
+            compute_type=faster_whisper_compute_type,
+            vad_filter=faster_whisper_use_vad,
+            add_alignments=faster_whisper_add_alignments,
+        )
+    else:
+        from lhotse import annotate_with_whisper as annotate_with_whisper_
 
     assert exactly_one_not_null(recordings_manifest, recordings_dir, cuts_manifest), (
         "Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
@@ -217,14 +145,10 @@ def annotate_with_faster_whisper(
                 language=language,
                 model_name=model_name,
                 device=device,
-                device_index=device_index,
                 force_nonoverlapping=force_nonoverlapping,
-                compute_type="float16",
-                cpu_threads=cpu_threads,
-                num_workers=num_workers,
             ),
             total=len(manifest),
-            desc="Annotating with faster-whisper",
+            desc="Annotating with Whisper",
         ):
             writer.write(cut, flush=True)