Skip to content

Commit 1610802

Browse files
committed
Combine whisper and faster-whisper into a single CLI/method
1 parent 21a23cb commit 1610802

File tree

2 files changed

+135
-235
lines changed

2 files changed

+135
-235
lines changed

lhotse/bin/modes/workflows.py

Lines changed: 33 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from functools import partial
12
from typing import List, Optional, Union
23

34
import click
@@ -55,121 +56,37 @@ def workflows():
5556
@click.option(
5657
"-d", "--device", default="cpu", help="Device on which to run the inference."
5758
)
58-
@click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
5959
@click.option(
60-
"--force-nonoverlapping/--keep-overlapping",
60+
"--faster-whisper",
61+
is_flag=True,
6162
default=False,
62-
help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
63-
)
64-
def annotate_with_whisper(
65-
out_cuts: str,
66-
recordings_manifest: Optional[str],
67-
recordings_dir: Optional[str],
68-
cuts_manifest: Optional[str],
69-
extension: str,
70-
model_name: str,
71-
language: Optional[str],
72-
device: str,
73-
jobs: int,
74-
force_nonoverlapping: bool,
75-
):
76-
"""
77-
Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
78-
It will perform automatic segmentation, transcription, and language identification.
79-
80-
RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
81-
is provided, its supervisions will be overwritten with the results of the inference.
82-
83-
Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
84-
high quality of data.
85-
"""
86-
from lhotse import annotate_with_whisper as annotate_with_whisper_
87-
88-
assert exactly_one_not_null(recordings_manifest, recordings_dir, cuts_manifest), (
89-
"Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
90-
"and at least one is required."
91-
)
92-
93-
if recordings_manifest is not None:
94-
manifest = RecordingSet.from_file(recordings_manifest)
95-
elif recordings_dir is not None:
96-
manifest = RecordingSet.from_dir(
97-
recordings_dir, pattern=f"*.{extension}", num_jobs=jobs
98-
)
99-
else:
100-
manifest = CutSet.from_file(cuts_manifest).to_eager()
101-
102-
with CutSet.open_writer(out_cuts) as writer:
103-
for cut in tqdm(
104-
annotate_with_whisper_(
105-
manifest,
106-
language=language,
107-
model_name=model_name,
108-
device=device,
109-
force_nonoverlapping=force_nonoverlapping,
110-
),
111-
total=len(manifest),
112-
desc="Annotating with Whisper",
113-
):
114-
writer.write(cut, flush=True)
115-
116-
117-
@workflows.command()
118-
@click.argument("out_cuts", type=click.Path(allow_dash=True))
119-
@click.option(
120-
"-m",
121-
"--recordings-manifest",
122-
type=click.Path(exists=True, dir_okay=False, allow_dash=True),
123-
help="Path to an existing recording manifest.",
124-
)
125-
@click.option(
126-
"-r",
127-
"--recordings-dir",
128-
type=click.Path(exists=True, file_okay=False),
129-
help="Directory with recordings. We will create a RecordingSet for it automatically.",
130-
)
131-
@click.option(
132-
"-c",
133-
"--cuts-manifest",
134-
type=click.Path(exists=True, dir_okay=False, allow_dash=True),
135-
help="Path to an existing cuts manifest.",
136-
)
137-
@click.option(
138-
"-e",
139-
"--extension",
140-
default="wav",
141-
help="Audio file extension to search for. Used with RECORDINGS_DIR.",
63+
help="If True, use faster-whisper's implementation based on CTranslate2.",
14264
)
14365
@click.option(
144-
"-n",
145-
"--model-name",
146-
default="base",
147-
help="One of Whisper variants (base, medium, large, etc.)",
148-
)
149-
@click.option(
150-
"-l",
151-
"--language",
152-
help="Language spoken in the audio. Inferred by default.",
153-
)
154-
@click.option(
155-
"-d", "--device", default="cpu", help="Device on which to run the inference."
156-
)
157-
@click.option(
158-
"--device-index", default=0, help="Device index on which to run the inference."
66+
"--faster-whisper-use-vad",
67+
is_flag=True,
68+
default=False,
69+
help="If True, use faster-whisper's built-in voice activity detection (SileroVAD).",
15970
)
16071
@click.option(
161-
"--cpu-threads", default=0, help="Number of threads to use when running on CPU."
72+
"--faster-whisper-add-alignments",
73+
is_flag=True,
74+
default=False,
75+
help="If True, add word alignments using timestamps obtained using the cross-attention "
76+
"pattern and dynamic time warping (Note: Less accurate than forced alignment).",
16277
)
16378
@click.option(
164-
"--num-workers", default=1, help="Number of workers for parallelizing across multiple GPUs."
79+
"--faster-whisper-compute-type",
80+
default="float16",
81+
help="Type to use for computation. See https://opennmt.net/CTranslate2/quantization.html.",
16582
)
16683
@click.option("-j", "--jobs", default=1, help="Number of jobs for audio scanning.")
16784
@click.option(
16885
"--force-nonoverlapping/--keep-overlapping",
16986
default=False,
17087
help="If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping.",
17188
)
172-
def annotate_with_faster_whisper(
89+
def annotate_with_whisper(
17390
out_cuts: str,
17491
recordings_manifest: Optional[str],
17592
recordings_dir: Optional[str],
@@ -178,9 +95,10 @@ def annotate_with_faster_whisper(
17895
model_name: str,
17996
language: Optional[str],
18097
device: str,
181-
device_index: int,
182-
cpu_threads: int,
183-
num_workers: int,
98+
faster_whisper: bool,
99+
faster_whisper_use_vad: bool,
100+
faster_whisper_compute_type: str,
101+
faster_whisper_add_alignments: bool,
184102
jobs: int,
185103
force_nonoverlapping: bool,
186104
):
@@ -194,7 +112,17 @@ def annotate_with_faster_whisper(
194112
Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
195113
high quality of data.
196114
"""
197-
from lhotse import annotate_with_faster_whisper as annotate_with_whisper_
115+
if faster_whisper:
116+
from lhotse import annotate_with_faster_whisper
117+
118+
annotate_with_whisper_ = partial(
119+
annotate_with_faster_whisper,
120+
compute_type=faster_whisper_compute_type,
121+
vad_filter=faster_whisper_use_vad,
122+
add_alignments=faster_whisper_add_alignments,
123+
)
124+
else:
125+
from lhotse import annotate_with_whisper as annotate_with_whisper_
198126

199127
assert exactly_one_not_null(recordings_manifest, recordings_dir, cuts_manifest), (
200128
"Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
@@ -217,14 +145,10 @@ def annotate_with_faster_whisper(
217145
language=language,
218146
model_name=model_name,
219147
device=device,
220-
device_index=device_index,
221148
force_nonoverlapping=force_nonoverlapping,
222-
compute_type="float16",
223-
cpu_threads=cpu_threads,
224-
num_workers=num_workers,
225149
),
226150
total=len(manifest),
227-
desc="Annotating with faster-whisper",
151+
desc="Annotating with Whisper",
228152
):
229153
writer.write(cut, flush=True)
230154

0 commit comments

Comments
 (0)