|
| 1 | +""" |
| 2 | +About the librilight corpus |
| 3 | +
|
| 4 | +Libri-light is a benchmark for the training of automatic speech recognition (ASR) |
| 5 | +systems with limited or no supervision. |
| 6 | +
|
| 7 | +It contains a large dataset of 60K hours of unlabelled speech from audiobooks in |
| 8 | +English and a small labelled dataset (10h, 1h, and 10 min) plus metrics, |
| 9 | +trainable baseline models, and pretrained models that use these datasets. |
| 10 | +
|
| 11 | +It is covered in more detail at https://arxiv.org/abs/1912.07875. |
| 12 | +
|
| 13 | +This dataset is very large - please download it manually from the LIBRILIGHT_URL links. |
| 14 | +""" |
| 15 | + |
| 16 | +import logging |
| 17 | +import os |
| 18 | +from collections import defaultdict |
| 19 | +from concurrent.futures.thread import ThreadPoolExecutor |
| 20 | +from pathlib import Path |
| 21 | +from typing import Dict, List, Optional, Sequence, Tuple, Union |
| 22 | + |
| 23 | +from tqdm.auto import tqdm |
| 24 | + |
| 25 | +from lhotse.audio import Recording, RecordingSet |
| 26 | +from lhotse.recipes.utils import manifests_exist |
| 27 | +from lhotse.supervision import SupervisionSegment, SupervisionSet |
| 28 | +from lhotse.utils import Pathlike |
| 29 | + |
| 30 | +LIBRILIGHT = ("small", "medium", "large") |
| 31 | + |
| 32 | +LIBRILIGHT_URL = ( |
| 33 | + "https://dl.fbaipublicfiles.com/librilight/data/small.tar", |
| 34 | + "https://dl.fbaipublicfiles.com/librilight/data/medium.tar", |
| 35 | + "https://dl.fbaipublicfiles.com/librilight/data/large.tar", |
| 36 | +) |
| 37 | + |
| 38 | + |
| 39 | +def _parse_utterance( |
| 40 | + corpus_dir: Pathlike, |
| 41 | + audio_path: Pathlike, |
| 42 | +) -> Optional[Tuple[Recording, SupervisionSegment]]: |
| 43 | + file_name = str(audio_path).replace(".flac", "").replace(str(corpus_dir) + "/", "") |
| 44 | + speaker = str(audio_path).split("/")[-3] |
| 45 | + audio_path = audio_path.resolve() |
| 46 | + |
| 47 | + if not audio_path.is_file(): |
| 48 | + logging.warning(f"No such file: {audio_path}") |
| 49 | + return None |
| 50 | + |
| 51 | + recording = Recording.from_file( |
| 52 | + path=audio_path, |
| 53 | + recording_id=file_name, |
| 54 | + ) |
| 55 | + segment = SupervisionSegment( |
| 56 | + id=file_name, |
| 57 | + recording_id=file_name, |
| 58 | + start=0.0, |
| 59 | + duration=recording.duration, |
| 60 | + channel=0, |
| 61 | + language="English", |
| 62 | + speaker=speaker, |
| 63 | + ) |
| 64 | + |
| 65 | + return recording, segment |
| 66 | + |
| 67 | + |
def _prepare_subset(
    subset: str,
    corpus_dir: Pathlike,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the RecordingSet and SupervisionSet given a dataset part.

    :param subset: str, the name of the subset ("small", "medium" or "large").
    :param corpus_dir: Pathlike, the path of the data dir.
    :param num_jobs: int, number of threads used to scan the audio files.
    :return: the RecordingSet and SupervisionSet of this subset.
    """
    corpus_dir = Path(corpus_dir)
    part_path = corpus_dir / subset
    # Sort so the manifest ordering is deterministic across runs and
    # filesystems (rglob order is otherwise OS-dependent).
    audio_paths = sorted(part_path.rglob("*.flac"))

    # Threads (not processes) are fine here: the work is dominated by
    # file I/O when reading audio headers.
    with ThreadPoolExecutor(num_jobs) as ex:
        futures = []
        recordings = []
        supervisions = []
        for audio_path in tqdm(audio_paths, desc="Distributing tasks"):
            futures.append(ex.submit(_parse_utterance, corpus_dir, audio_path))

        for future in tqdm(futures, desc="Processing"):
            result = future.result()
            # _parse_utterance returns None for missing files - skip those.
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

    return recording_set, supervision_set
| 102 | + |
| 103 | + |
def prepare_librilight(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Path to the LibriLight dataset.
    :param output_dir: Pathlike, the path where to write the manifests;
        if None, the manifests are only returned, not written to disk.
    :param num_jobs: int, number of threads used to scan each subset.
    :return: a Dict whose key is the dataset part, and the value is Dicts
        with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    logging.info("Preparing LibriLight...")

    if output_dir is not None:
        # Single normalization point (the original converted output_dir
        # to Path twice).
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    for part in tqdm(LIBRILIGHT, desc="Dataset parts"):
        logging.info(f"Processing LibriLight subset: {part}")
        # Skip subsets whose manifests were already written in a prior run.
        if manifests_exist(
            part=part,
            output_dir=output_dir,
            prefix="librilight",
            suffix="jsonl.gz",
        ):
            logging.info(f"LibriLight subset: {part} already prepared - skipping.")
            continue

        recording_set, supervision_set = _prepare_subset(part, corpus_dir, num_jobs)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"librilight_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"librilight_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
0 commit comments