lhotse-speech · AmirHussein96 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/docs/corpus.rst b/docs/corpus.rst
@@ -75,6 +75,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_callhome_egyptian`
   * - CallHome English
     - :func:`lhotse.recipes.prepare_callhome_english`
+  * - CallHome Spanish Speech Translation
+    - :func:`lhotse.recipes.prepare_callhome_spanish_st`
   * - CHiME-6
     - :func:`lhotse.recipes.prepare_chime6`
   * - CMU Arctic
@@ -107,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_fisher_english`
   * - Fisher Spanish
     - :func:`lhotse.recipes.prepare_fisher_spanish`
+  * - Fisher Spanish Speech Translation
+    - :func:`lhotse.recipes.prepare_fisher_spanish_st`
   * - Fluent Speech Commands
     - :func:`lhotse.recipes.slu`
   * - GALE Arabic Broadcast Speech
@@ -167,6 +171,8 @@ a CLI tool that create the manifests given a corpus directory.
     - :func:`lhotse.recipes.prepare_peoples_speech`
   * - RIRs and Noises Corpus (OpenSLR 28)
     - :func:`lhotse.recipes.prepare_rir_noise`
+  * - SEAME
+    - :func:`lhotse.recipes.prepare_seame`
   * - Speech Commands
     - :func:`lhotse.recipes.prepare_speechcommands`
   * - SpeechIO

diff --git a/lhotse/bin/modes/recipes/__init__.py b/lhotse/bin/modes/recipes/__init__.py
@@ -16,6 +16,7 @@
 from .bvcc import *
 from .callhome_egyptian import *
 from .callhome_english import *
+from .callhome_spanish_st import *
 from .chime6 import *
 from .cmu_arctic import *
 from .cmu_indic import *
@@ -32,6 +33,7 @@
 from .eval2000 import *
 from .fisher_english import *
 from .fisher_spanish import *
+from .fisher_spanish_st import *
 from .gale_arabic import *
 from .gale_mandarin import *
 from .gigaspeech import *
@@ -63,6 +65,7 @@
 from .peoples_speech import *
 from .primewords import *
 from .rir_noise import *
+from .seame import *
 from .slu import *
 from .speechcommands import *
 from .speechio import *

diff --git a/lhotse/bin/modes/recipes/callhome_spanish_st.py b/lhotse/bin/modes/recipes/callhome_spanish_st.py
@@ -0,0 +1,74 @@
+from typing import Optional, Sequence, Union
+
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.callhome_spanish_st import prepare_callhome_spanish
+from lhotse.utils import Pathlike
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("audio_dir_path", type=click.Path(exists=True, dir_okay=True))
+@click.argument("transcript_dir_path", type=click.Path(exists=True, dir_okay=True))
+@click.argument("split_dir", type=click.Path())
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+@click.option(
+    "--absolute_paths",
+    default=False,
+    help=" Whether to return absolute or relative (to the corpus dir) paths for recordings.",
+)
+@click.option(
+    "--remove_punc",
+    default=False,
+    help=" Remove punctuations from the text",
+)
+@click.option(
+    "--lowercase",
+    default=False,
+    help="Lower case the text",
+)
+def callhome_spanish_st(
+    audio_dir_path: Pathlike,
+    transcript_dir_path: Pathlike,
+    split_dir: Pathlike,
+    output_dir: Pathlike,
+    absolute_paths: bool,
+    remove_punc: bool,
+    lowercase: bool,
+    num_jobs: int,
+):
+    """
+    Callhome Spanish data preparation.
+
+    \b
+    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
+    The catalog number is LDC96S35 for the audio corpus and LDC96T17 for the transcripts.
+
+    This data is not available for free - your institution needs to have an LDC subscription.
+    You should also download and prepare the pre-defined splits with:
+
+    \b
+        git clone https://github.com/joshua-decoder/fisher-callhome-corpus.git
+        cd fisher-callhome-corpus
+        make
+        cd ../
+    """
+    # click renders the docstring above as the ``--help`` text, so implementation
+    # notes live here: every CLI value is forwarded unchanged to the recipe.
+    prepare_callhome_spanish(
+        audio_dir_path=audio_dir_path,
+        transcript_dir_path=transcript_dir_path,
+        split_dir=split_dir,
+        output_dir=output_dir,
+        absolute_paths=absolute_paths,
+        remove_punc=remove_punc,
+        lowercase=lowercase,
+        num_jobs=num_jobs,
+    )
diff --git a/lhotse/bin/modes/recipes/fisher_spanish_st.py b/lhotse/bin/modes/recipes/fisher_spanish_st.py
@@ -0,0 +1,74 @@
+from typing import Optional, Sequence, Union
+
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.fisher_spanish_st import prepare_fisher_spanish
+from lhotse.utils import Pathlike
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("audio_dir_path", type=click.Path(exists=True, dir_okay=True))
+@click.argument("transcript_dir_path", type=click.Path(exists=True, dir_okay=True))
+@click.argument("split_dir", type=click.Path())
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "-j",
+    "--num-jobs",
+    type=int,
+    default=1,
+    help="How many threads to use (can give good speed-ups with slow disks).",
+)
+@click.option(
+    "--absolute_paths",
+    default=False,
+    help=" Whether to return absolute or relative (to the corpus dir) paths for recordings.",
+)
+@click.option(
+    "--remove_punc",
+    default=False,
+    help=" Remove punctuations from the text",
+)
+@click.option(
+    "--lowercase",
+    default=False,
+    help="Lower case the text",
+)
+def fisher_spanish_st(
+    audio_dir_path: Pathlike,
+    transcript_dir_path: Pathlike,
+    split_dir: Pathlike,
+    output_dir: Pathlike,
+    absolute_paths: bool,
+    remove_punc: bool,
+    lowercase: bool,
+    num_jobs: int,
+):
+    """
+    Fisher Spanish data preparation.
+
+    \b
+    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
+    The catalog number is LDC2010S01 for the audio corpus and LDC2010T04 for the transcripts.
+
+    This data is not available for free - your institution needs to have an LDC subscription.
+    You should also download and prepare the pre-defined splits with:
+
+    \b
+        git clone https://github.com/joshua-decoder/fisher-callhome-corpus.git
+        cd fisher-callhome-corpus
+        make
+        cd ../
+    """
+    # click renders the docstring above as the ``--help`` text, so implementation
+    # notes live here: every CLI value is forwarded unchanged to the recipe.
+    prepare_fisher_spanish(
+        audio_dir_path=audio_dir_path,
+        transcript_dir_path=transcript_dir_path,
+        split_dir=split_dir,
+        output_dir=output_dir,
+        absolute_paths=absolute_paths,
+        remove_punc=remove_punc,
+        lowercase=lowercase,
+        num_jobs=num_jobs,
+    )
diff --git a/lhotse/bin/modes/recipes/seame.py b/lhotse/bin/modes/recipes/seame.py
@@ -0,0 +1,45 @@
+from typing import Optional, Sequence, Union
+
+import click
+
+from lhotse.bin.modes import prepare
+from lhotse.recipes.seame import prepare_seame
+from lhotse.utils import Pathlike
+
+
+@prepare.command(context_settings=dict(show_default=True))
+@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("split_dir", type=click.Path(exists=True, dir_okay=True))
+@click.argument("output_dir", type=click.Path())
+@click.option(
+    "--clean-text",
+    default=False,
+    help="Whether to perform additional text cleaning and normalization",
+)
+@click.option(
+    "--delimiter",
+    default="",
+    help="Used to split the code switching text ",
+)
+def seame(
+    corpus_dir: Pathlike,
+    split_dir: Pathlike,
+    clean_text: bool,
+    delimiter: str,
+    output_dir: Pathlike,
+):
+    """
+    SEAME data preparation.
+
+    \b
+    This is Singaporean Codeswitched English and Mandarin data.
+    """
+    # click renders the docstring above as the ``--help`` text, so implementation
+    # notes live here: every CLI value is forwarded unchanged to the recipe.
+    prepare_seame(
+        corpus_dir=corpus_dir,
+        split_dir=split_dir,
+        output_dir=output_dir,
+        clean_text=clean_text,
+        delimiter=delimiter,
+    )
diff --git a/lhotse/dataset/speech_recognition.py b/lhotse/dataset/speech_recognition.py
@@ -64,6 +64,7 @@ def __init__(
         cut_transforms: List[Callable[[CutSet], CutSet]] = None,
         input_transforms: List[Callable[[torch.Tensor], torch.Tensor]] = None,
         input_strategy: BatchIO = PrecomputedFeatures(),
+        lid: bool = False,
     ):
         """
         k2 ASR IterableDataset constructor.
@@ -78,13 +79,15 @@ def __init__(
             Examples: normalization, SpecAugment, etc.
         :param input_strategy: Converts cuts into a collated batch of audio/features.
             By default, reads pre-computed features from disk.
+        :param lid: When ``True``, add a ``"lids"`` list with the language of each supervision to the batch.
         """
         super().__init__()
         # Initialize the fields
         self.return_cuts = return_cuts
         self.cut_transforms = ifnone(cut_transforms, [])
         self.input_transforms = ifnone(input_transforms, [])
         self.input_strategy = input_strategy
+        self.lid = lid
 
         # This attribute is a workaround to constantly growing HDF5 memory
         # throughout the epoch. It regularly closes open file handles to
@@ -132,19 +135,27 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]
         segments = torch.stack(list(supervision_intervals.values()), dim=1)
         for tnfm in self.input_transforms:
             inputs = tnfm(inputs, supervision_segments=segments)
 
         batch = {
             "inputs": inputs,
             "supervisions": default_collate(
                 [
                     {
                         "text": supervision.text,
                     }
                     for sequence_idx, cut in enumerate(cuts)
                     for supervision in cut.supervisions
                 ]
             ),
         }
+        if self.lid:
+            # Optionally expose one language label per supervision, in the
+            # same cut/supervision order as the "text" entries above.
+            batch["lids"] = [
+                supervision.language
+                for cut in cuts
+                for supervision in cut.supervisions
+            ]
         # Update the 'supervisions' field with sequence_idx and start/num frames/samples
         batch["supervisions"].update(supervision_intervals)
         if self.return_cuts: