1+ from functools import partial
12from typing import List , Optional , Union
23
34import click
@@ -55,121 +56,37 @@ def workflows():
5556@click .option (
5657 "-d" , "--device" , default = "cpu" , help = "Device on which to run the inference."
5758)
58- @click .option ("-j" , "--jobs" , default = 1 , help = "Number of jobs for audio scanning." )
5959@click .option (
60- "--force-nonoverlapping/--keep-overlapping" ,
60+ "--faster-whisper" ,
61+ is_flag = True ,
6162 default = False ,
62- help = "If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping." ,
63- )
64- def annotate_with_whisper (
65- out_cuts : str ,
66- recordings_manifest : Optional [str ],
67- recordings_dir : Optional [str ],
68- cuts_manifest : Optional [str ],
69- extension : str ,
70- model_name : str ,
71- language : Optional [str ],
72- device : str ,
73- jobs : int ,
74- force_nonoverlapping : bool ,
75- ):
76- """
77- Use OpenAI Whisper model to annotate either RECORDINGS_MANIFEST, RECORDINGS_DIR, or CUTS_MANIFEST.
78- It will perform automatic segmentation, transcription, and language identification.
79-
80- RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive. If CUTS_MANIFEST
81- is provided, its supervisions will be overwritten with the results of the inference.
82-
83- Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
84- high quality of data.
85- """
86- from lhotse import annotate_with_whisper as annotate_with_whisper_
87-
88- assert exactly_one_not_null (recordings_manifest , recordings_dir , cuts_manifest ), (
89- "Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
90- "and at least one is required."
91- )
92-
93- if recordings_manifest is not None :
94- manifest = RecordingSet .from_file (recordings_manifest )
95- elif recordings_dir is not None :
96- manifest = RecordingSet .from_dir (
97- recordings_dir , pattern = f"*.{ extension } " , num_jobs = jobs
98- )
99- else :
100- manifest = CutSet .from_file (cuts_manifest ).to_eager ()
101-
102- with CutSet .open_writer (out_cuts ) as writer :
103- for cut in tqdm (
104- annotate_with_whisper_ (
105- manifest ,
106- language = language ,
107- model_name = model_name ,
108- device = device ,
109- force_nonoverlapping = force_nonoverlapping ,
110- ),
111- total = len (manifest ),
112- desc = "Annotating with Whisper" ,
113- ):
114- writer .write (cut , flush = True )
115-
116-
117- @workflows .command ()
118- @click .argument ("out_cuts" , type = click .Path (allow_dash = True ))
119- @click .option (
120- "-m" ,
121- "--recordings-manifest" ,
122- type = click .Path (exists = True , dir_okay = False , allow_dash = True ),
123- help = "Path to an existing recording manifest." ,
124- )
125- @click .option (
126- "-r" ,
127- "--recordings-dir" ,
128- type = click .Path (exists = True , file_okay = False ),
129- help = "Directory with recordings. We will create a RecordingSet for it automatically." ,
130- )
131- @click .option (
132- "-c" ,
133- "--cuts-manifest" ,
134- type = click .Path (exists = True , dir_okay = False , allow_dash = True ),
135- help = "Path to an existing cuts manifest." ,
136- )
137- @click .option (
138- "-e" ,
139- "--extension" ,
140- default = "wav" ,
141- help = "Audio file extension to search for. Used with RECORDINGS_DIR." ,
63+ help = "If True, use faster-whisper's implementation based on CTranslate2." ,
14264)
14365@click .option (
144- "-n" ,
145- "--model-name" ,
146- default = "base" ,
147- help = "One of Whisper variants (base, medium, large, etc.)" ,
148- )
149- @click .option (
150- "-l" ,
151- "--language" ,
152- help = "Language spoken in the audio. Inferred by default." ,
153- )
154- @click .option (
155- "-d" , "--device" , default = "cpu" , help = "Device on which to run the inference."
156- )
157- @click .option (
158- "--device-index" , default = 0 , help = "Device index on which to run the inference."
66+ "--faster-whisper-use-vad" ,
67+ is_flag = True ,
68+ default = False ,
69+ help = "If True, use faster-whisper's built-in voice activity detection (SileroVAD)." ,
15970)
16071@click .option (
161- "--cpu-threads" , default = 0 , help = "Number of threads to use when running on CPU."
72+ "--faster-whisper-add-alignments" ,
73+ is_flag = True ,
74+ default = False ,
75+ help = "If True, add word alignments using timestamps obtained using the cross-attention "
76+ "pattern and dynamic time warping (Note: Less accurate than forced alignment)." ,
16277)
16378@click .option (
164- "--num-workers" , default = 1 , help = "Number of workers for parallelizing across multiple GPUs."
79+ "--faster-whisper-compute-type" ,
80+ default = "float16" ,
81+ help = "Type to use for computation. See https://opennmt.net/CTranslate2/quantization.html." ,
16582)
16683@click .option ("-j" , "--jobs" , default = 1 , help = "Number of jobs for audio scanning." )
16784@click .option (
16885 "--force-nonoverlapping/--keep-overlapping" ,
16986 default = False ,
17087 help = "If True, the Whisper segment time-stamps will be processed to make sure they are non-overlapping." ,
17188)
172- def annotate_with_faster_whisper (
89+ def annotate_with_whisper (
17390 out_cuts : str ,
17491 recordings_manifest : Optional [str ],
17592 recordings_dir : Optional [str ],
@@ -178,9 +95,10 @@ def annotate_with_faster_whisper(
17895 model_name : str ,
17996 language : Optional [str ],
18097 device : str ,
181- device_index : int ,
182- cpu_threads : int ,
183- num_workers : int ,
98+ faster_whisper : bool ,
99+ faster_whisper_use_vad : bool ,
100+ faster_whisper_compute_type : str ,
101+ faster_whisper_add_alignments : bool ,
184102 jobs : int ,
185103 force_nonoverlapping : bool ,
186104):
@@ -194,7 +112,17 @@ def annotate_with_faster_whisper(
194112 Note: this is an experimental feature of Lhotse, and is not guaranteed to yield
195113 high quality of data.
196114 """
197- from lhotse import annotate_with_faster_whisper as annotate_with_whisper_
115+ if faster_whisper :
116+ from lhotse import annotate_with_faster_whisper
117+
118+ annotate_with_whisper_ = partial (
119+ annotate_with_faster_whisper ,
120+ compute_type = faster_whisper_compute_type ,
121+ vad_filter = faster_whisper_use_vad ,
122+ add_alignments = faster_whisper_add_alignments ,
123+ )
124+ else :
125+ from lhotse import annotate_with_whisper as annotate_with_whisper_
198126
199127 assert exactly_one_not_null (recordings_manifest , recordings_dir , cuts_manifest ), (
200128 "Options RECORDINGS_MANIFEST, RECORDINGS_DIR, and CUTS_MANIFEST are mutually exclusive "
@@ -217,14 +145,10 @@ def annotate_with_faster_whisper(
217145 language = language ,
218146 model_name = model_name ,
219147 device = device ,
220- device_index = device_index ,
221148 force_nonoverlapping = force_nonoverlapping ,
222- compute_type = "float16" ,
223- cpu_threads = cpu_threads ,
224- num_workers = num_workers ,
225149 ),
226150 total = len (manifest ),
227- desc = "Annotating with faster-whisper " ,
151+ desc = "Annotating with Whisper " ,
228152 ):
229153 writer .write (cut , flush = True )
230154
0 commit comments