|
| 1 | +""" |
| 2 | +About the librilight corpus |
| 3 | +
|
| 4 | +Libri-light is a benchmark for the training of automatic speech recognition (ASR) |
| 5 | +systems with limited or no supervision. |
| 6 | +
|
| 7 | +It contains a large dataset of 60K hours of unlabelled speech from audiobooks in |
| 8 | +English and a small labelled dataset (10h, 1h, and 10 min) plus metrics, |
| 9 | +trainable baseline models, and pretrained models that use these datasets. |
| 10 | +
|
| 11 | +It is covered in more detail at https://arxiv.org/abs/1912.07875. |
| 12 | +
|
| 13 | +This dataset is very large - please download it manually from the LIBRILIGHT_URL links. |
| 14 | +""" |
| 15 | + |
| 16 | +import logging |
| 17 | +import os |
| 18 | +from collections import defaultdict |
| 19 | +from concurrent.futures.thread import ThreadPoolExecutor |
| 20 | +from pathlib import Path |
| 21 | +from typing import Dict, List, Optional, Sequence, Tuple, Union |
| 22 | + |
| 23 | +from tqdm.auto import tqdm |
| 24 | + |
| 25 | +from lhotse.audio import Recording, RecordingSet |
| 26 | +from lhotse.recipes.utils import manifests_exist |
| 27 | +from lhotse.supervision import SupervisionSegment, SupervisionSet |
| 28 | +from lhotse.utils import Pathlike |
| 29 | + |
| 30 | +LIBRILIGHT = ("small", "medium", "large") |
| 31 | + |
| 32 | +LIBRILIGHT_URL = ( |
| 33 | + "https://dl.fbaipublicfiles.com/librilight/data/small.tar", |
| 34 | + "https://dl.fbaipublicfiles.com/librilight/data/medium.tar", |
| 35 | + "https://dl.fbaipublicfiles.com/librilight/data/large.tar", |
| 36 | +) |
| 37 | + |
| 38 | + |
| 39 | +def _parse_utterance( |
| 40 | + corpus_dir: Pathlike, |
| 41 | + audio_path: Pathlike, |
| 42 | +) -> Optional[Tuple[Recording, SupervisionSegment]]: |
| 43 | + file_name = str(audio_path).replace(".flac", "").replace(str(corpus_dir) + "/", "") |
| 44 | + speaker = str(audio_path).split("/")[-3] |
| 45 | + audio_path = audio_path.resolve() |
| 46 | + |
| 47 | + if not audio_path.is_file(): |
| 48 | + logging.warning(f"No such file: {audio_path}") |
| 49 | + return None |
| 50 | + |
| 51 | + recording = Recording.from_file( |
| 52 | + path=audio_path, |
| 53 | + recording_id=file_name, |
| 54 | + ) |
| 55 | + segment = SupervisionSegment( |
| 56 | + id=file_name, |
| 57 | + recording_id=file_name, |
| 58 | + start=0.0, |
| 59 | + duration=recording.duration, |
| 60 | + channel=0, |
| 61 | + language="English", |
| 62 | + speaker=speaker, |
| 63 | + ) |
| 64 | + |
| 65 | + return recording, segment |
| 66 | + |
| 67 | + |
def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the RecordingSet and SupervisionSet given a dataset part.

    :param subset: str, the name of the subset ("small", "medium" or "large").
    :param corpus_dir: Pathlike, the path of the data dir.
    :param num_jobs: int, number of threads used to scan the audio files.
    :return: the RecordingSet and SupervisionSet of this subset.
    """
    corpus_dir = Path(corpus_dir)
    part_path = corpus_dir / subset
    # Sort so the manifest ordering is deterministic across runs and
    # filesystems (rglob order is otherwise OS-dependent).
    audio_paths = sorted(part_path.rglob("*.flac"))

    # Threads (not processes) are fine here: the work is dominated by
    # file I/O when reading audio headers.
    with ThreadPoolExecutor(num_jobs) as ex:
        futures = []
        recordings = []
        supervisions = []
        for audio_path in tqdm(audio_paths, desc="Distributing tasks"):
            futures.append(ex.submit(_parse_utterance, corpus_dir, audio_path))

        for future in tqdm(futures, desc="Processing"):
            result = future.result()
            # _parse_utterance returns None for missing files - skip those.
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

    return recording_set, supervision_set
| 102 | + |
| 103 | + |
def prepare_librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Path to the LibriLight dataset.
    :param output_dir: Pathlike, the path where to write the manifests;
        if None, the manifests are only returned, not written to disk.
    :param num_jobs: int, number of threads used to scan each subset.
    :return: a Dict whose key is the dataset part, and the value is Dicts
        with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing LibriLight...")

    if output_dir is not None:
        # Single normalization point (the original converted output_dir
        # to Path twice).
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    for part in tqdm(LIBRILIGHT, desc="Dataset parts"):
        logging.info(f"Processing LibriLight subset: {part}")
        # Skip subsets whose manifests were already written in a prior run.
        if manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="librilight",
            suffix="jsonl.gz",
        ):
            logging.info(f"LibriLight subset: {part} already prepared - skipping.")
            continue

        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"librilight_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"librilight_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
0 commit comments