Skip to content

Commit a4d9430

Browse files
authored
LibriLight dataset (#1014)
This recipe enables preparing LibriLight manifests. Since LibriLight is unlabelled, no text field exists in `SupervisionSegment`. Usage: ```shell lhotse prepare librilight <librilight_dir> <output_dir> -j <num_jobs> ```
2 parents 24d682b + 1ec76ac commit a4d9430

File tree

5 files changed

+185
-0
lines changed

5 files changed

+185
-0
lines changed

docs/corpus.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ a CLI tool that create the manifests given a corpus directory.
111111
- :func:`lhotse.recipes.prepare_l2_arctic`
112112
* - LibriCSS
113113
- :func:`lhotse.recipes.prepare_libricss`
114+
* - LibriLight
115+
- :func:`lhotse.recipes.prepare_librilight`
114116
* - LibriSpeech (including "mini")
115117
- :func:`lhotse.recipes.prepare_librispeech`
116118
* - LibriTTS

lhotse/bin/modes/recipes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from .icsi import *
3737
from .l2_arctic import *
3838
from .libricss import *
39+
from .librilight import *
3940
from .librimix import *
4041
from .librispeech import *
4142
from .libritts import *
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from typing import Dict, List, Optional, Tuple, Union
2+
3+
import click
4+
5+
from lhotse.bin.modes import download, prepare
6+
from lhotse.recipes.librilight import prepare_librilight
7+
from lhotse.utils import Pathlike
8+
9+
10+
# CLI entry point registered on the `prepare` command group; per the commit
# message it is invoked as:
#   lhotse prepare librilight <librilight_dir> <output_dir> -j <num_jobs>
# `corpus_dir` must already exist (click validates it); `output_dir` is a
# required positional argument on the command line even though the Python
# signature below gives it a default of None.
@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
def librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
):
    """LibriLight data preparation."""
    # NOTE(review): the docstring above is presumably shown by click as the
    # command's help text, so it is intentionally left short and unchanged.
    # All work is delegated to the recipe function; its return value (the
    # manifests dict) is discarded here.
    prepare_librilight(
        corpus_dir=corpus_dir,
        output_dir=output_dir,
        num_jobs=num_jobs,
    )

lhotse/recipes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from .icsi import download_icsi, prepare_icsi
3333
from .l2_arctic import prepare_l2_arctic
3434
from .libricss import download_libricss, prepare_libricss
35+
from .librilight import prepare_librilight
3536
from .librimix import download_librimix, prepare_librimix
3637
from .librispeech import download_librispeech, prepare_librispeech
3738
from .libritts import download_libritts, prepare_libritts

lhotse/recipes/librilight.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
"""
2+
About the librilight corpus
3+
4+
Libri-light is a benchmark for the training of automatic speech recognition (ASR)
5+
systems with limited or no supervision.
6+
7+
It contains a large dataset of 60K hours of unlabelled speech from audiobooks in
8+
English and a small labelled dataset (10h, 1h, and 10 min) plus metrics,
9+
trainable baseline models, and pretrained models that use these datasets.
10+
11+
It is covered in more detail at https://arxiv.org/abs/1912.07875.
12+
13+
The dataset is very large - please download it manually from the URLs listed in LIBRILIGHT_URL.
14+
"""
15+
16+
import logging
17+
import os
18+
from collections import defaultdict
19+
from concurrent.futures.thread import ThreadPoolExecutor
20+
from pathlib import Path
21+
from typing import Dict, List, Optional, Sequence, Tuple, Union
22+
23+
from tqdm.auto import tqdm
24+
25+
from lhotse.audio import Recording, RecordingSet
26+
from lhotse.recipes.utils import manifests_exist
27+
from lhotse.supervision import SupervisionSegment, SupervisionSet
28+
from lhotse.utils import Pathlike
29+
30+
# Names of the LibriLight subsets. prepare_librilight iterates over these and
# each one is looked up as a sub-directory of the corpus directory.
LIBRILIGHT = ("small", "medium", "large")

# Archive URLs, one per subset, in the same order as LIBRILIGHT. Nothing in
# this module downloads them; they are listed for manual download (see the
# module docstring).
LIBRILIGHT_URL = (
    "https://dl.fbaipublicfiles.com/librilight/data/small.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/medium.tar",
    "https://dl.fbaipublicfiles.com/librilight/data/large.tar",
)
37+
38+
39+
def _parse_utterance(
    corpus_dir: Pathlike,
    audio_path: Pathlike,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Build the Recording and SupervisionSegment for a single audio file.

    The ID used for both manifests is ``audio_path`` relative to
    ``corpus_dir`` with a trailing ``.flac`` extension removed, always written
    with forward slashes. The speaker is the third-from-last component of
    ``audio_path`` (presumably ``<subset>/<speaker>/<book>/<file>.flac`` -
    confirm against the corpus layout). Since LibriLight is unlabelled, the
    supervision carries no text and spans the whole recording.

    :param corpus_dir: Pathlike, the root directory of the corpus.
    :param audio_path: Pathlike, the path of the audio file (str or Path).
    :return: a (Recording, SupervisionSegment) pair, or None when
        ``audio_path`` does not point to an existing file.
    """
    # Both arguments are annotated as Pathlike, so accept plain strings too.
    corpus_dir = Path(corpus_dir)
    audio_path = Path(audio_path)

    # Derive the ID with pathlib instead of string replacement so that it does
    # not depend on the platform's path separator.
    try:
        relative_path = audio_path.relative_to(corpus_dir)
    except ValueError:
        # Not located under corpus_dir: keep the path as given.
        relative_path = audio_path
    if relative_path.suffix == ".flac":
        relative_path = relative_path.with_suffix("")
    file_name = relative_path.as_posix()

    # Taken before resolve() so that symlinked files keep the speaker of the
    # directory they were found in.
    speaker = audio_path.parts[-3]
    audio_path = audio_path.resolve()

    if not audio_path.is_file():
        logging.warning(f"No such file: {audio_path}")
        return None

    recording = Recording.from_file(
        path=audio_path,
        recording_id=file_name,
    )
    segment = SupervisionSegment(
        id=file_name,
        recording_id=file_name,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=speaker,
    )

    return recording, segment
66+
67+
68+
def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Build the RecordingSet and SupervisionSet of one dataset part.

    Every ``*.flac`` file found recursively under ``corpus_dir / subset`` is
    handed to ``_parse_utterance`` on a thread pool; files for which it
    returns None are left out of both manifests.

    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param num_jobs: int, the number of worker threads.
    :return: the RecordingSet and SupervisionSet of the subset.
    """
    root = Path(corpus_dir)
    flac_files = list((root / subset).rglob("*.flac"))

    parsed = []
    with ThreadPoolExecutor(num_jobs) as pool:
        pending = [
            pool.submit(_parse_utterance, root, flac_file)
            for flac_file in tqdm(flac_files, desc="Distributing tasks")
        ]

        # Collect in submission order so both manifests stay aligned.
        for task in tqdm(pending, desc="Processing"):
            outcome = task.result()
            if outcome is not None:
                parsed.append(outcome)

    recordings = [recording for recording, _ in parsed]
    supervisions = [segment for _, segment in parsed]

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    return recording_set, supervision_set
102+
103+
104+
def prepare_librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    Each subset named in ``LIBRILIGHT`` is processed in turn. A subset is
    skipped (and is absent from the returned dict) when ``manifests_exist``
    reports that its manifests are already in ``output_dir``, or when its
    directory is missing from ``corpus_dir``.

    :param corpus_dir: Pathlike, the path to the LibriLight dataset.
    :param output_dir: Pathlike, the path where to write the manifests;
        when None, nothing is written to disk.
    :param num_jobs: int, the number of worker threads used per subset.
    :return: a Dict whose key is the dataset part, and the value is Dicts
        with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: when ``corpus_dir`` is not an existing directory.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None

    # Kept as an assert so the exception type seen by callers is unchanged.
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing LibriLight...")

    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    for part in tqdm(LIBRILIGHT, desc="Dataset parts"):
        logging.info(f"Processing LibriLight subset: {part}")
        if manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="librilight",
            suffix="jsonl.gz",
        ):
            logging.info(f"LibriLight subset: {part} already prepared - skipping.")
            continue

        # The subsets are downloaded separately, so some may be absent.
        # Without this guard an empty manifest would be written for a missing
        # subset and later runs would treat it as "already prepared".
        if not (corpus_dir / part).is_dir():
            logging.warning(
                f"LibriLight subset: {part} not found in {corpus_dir} - skipping."
            )
            continue

        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"librilight_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"librilight_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests

0 commit comments

Comments
 (0)