Skip to content
6 changes: 6 additions & 0 deletions docs/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_callhome_egyptian`
* - CallHome English
- :func:`lhotse.recipes.prepare_callhome_english`
* - CallHome Spanish Speech Translation
- :func:`lhotse.recipes.callhome_spanish_st.prepare_callhome_spanish`
* - CHiME-6
- :func:`lhotse.recipes.prepare_chime6`
* - CMU Arctic
Expand Down Expand Up @@ -107,6 +109,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_fisher_english`
* - Fisher Spanish
- :func:`lhotse.recipes.prepare_fisher_spanish`
* - Fisher Spanish Speech Translation
- :func:`lhotse.recipes.fisher_spanish_st.prepare_fisher_spanish`
* - Fluent Speech Commands
- :func:`lhotse.recipes.slu`
* - GALE Arabic Broadcast Speech
Expand Down Expand Up @@ -167,6 +171,8 @@ a CLI tool that create the manifests given a corpus directory.
- :func:`lhotse.recipes.prepare_peoples_speech`
* - RIRs and Noises Corpus (OpenSLR 28)
- :func:`lhotse.recipes.prepare_rir_noise`
* - SEAME
- :func:`lhotse.recipes.prepare_seame`
* - Speech Commands
- :func:`lhotse.recipes.prepare_speechcommands`
* - SpeechIO
Expand Down
3 changes: 3 additions & 0 deletions lhotse/bin/modes/recipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .bvcc import *
from .callhome_egyptian import *
from .callhome_english import *
from .callhome_spanish_st import *
from .chime6 import *
from .cmu_arctic import *
from .cmu_indic import *
Expand All @@ -32,6 +33,7 @@
from .eval2000 import *
from .fisher_english import *
from .fisher_spanish import *
from .fisher_spanish_st import *
from .gale_arabic import *
from .gale_mandarin import *
from .gigaspeech import *
Expand Down Expand Up @@ -63,6 +65,7 @@
from .peoples_speech import *
from .primewords import *
from .rir_noise import *
from .seame import *
from .slu import *
from .speechcommands import *
from .speechio import *
Expand Down
69 changes: 69 additions & 0 deletions lhotse/bin/modes/recipes/callhome_spanish_st.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import Optional, Sequence, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.callhome_spanish_st import prepare_callhome_spanish
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("audio_dir_path", type=click.Path(exists=True, dir_okay=True))
@click.argument("transcript_dir_path", type=click.Path(exists=True, dir_okay=True))
@click.argument("split_dir", type=click.Path())
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
# NOTE: the three boolean options below are value-taking options
# (e.g. ``--lowercase true``), not on/off flags. The type is spelled out
# explicitly instead of being inferred from ``default=False``. The dashed
# spelling is listed first for consistency with ``--num-jobs``; the original
# underscore spelling is kept as an alias so existing invocations keep working.
@click.option(
    "--absolute-paths",
    "--absolute_paths",
    type=bool,
    default=False,
    help=" Whether to return absolute or relative (to the corpus dir) paths for recordings.",
)
@click.option(
    "--remove-punc",
    "--remove_punc",
    type=bool,
    default=False,
    help=" Remove punctuations from the text",
)
@click.option(
    "--lowercase",
    type=bool,
    default=False,
    help="Lower case the text",
)
def callhome_spanish_st(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    split_dir: Pathlike,
    output_dir: Pathlike,
    absolute_paths: bool,
    remove_punc: bool,
    lowercase: bool,
    num_jobs: int,
):
    """
    Callhome Spanish data preparation.

    \b
    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
    The catalog number LDC96S35 for audio corpus and LDC96T17 for transcripts.

    This data is not available for free - your institution needs to have an LDC subscription.
    You should also download and prepare the pre-defined splits with:

    \b
        git clone https://github.com/joshua-decoder/fisher-callhome-corpus.git
        cd fisher-callhome-corpus
        make
        cd ../
    """
    # The docstring above is the ``--help`` text. Click only honours the
    # ``\b`` (no-rewrap) marker when it is the first line of a paragraph,
    # hence the blank line before each marker; the second marker keeps the
    # shell commands on separate lines.
    #
    # All CLI values are forwarded unchanged, by keyword, to the recipe.
    prepare_callhome_spanish(
        audio_dir_path=audio_dir_path,
        transcript_dir_path=transcript_dir_path,
        split_dir=split_dir,
        output_dir=output_dir,
        absolute_paths=absolute_paths,
        remove_punc=remove_punc,
        lowercase=lowercase,
        num_jobs=num_jobs,
    )
69 changes: 69 additions & 0 deletions lhotse/bin/modes/recipes/fisher_spanish_st.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import Optional, Sequence, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.fisher_spanish_st import prepare_fisher_spanish
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("audio_dir_path", type=click.Path(exists=True, dir_okay=True))
@click.argument("transcript_dir_path", type=click.Path(exists=True, dir_okay=True))
@click.argument("split_dir", type=click.Path())
@click.argument("output_dir", type=click.Path())
@click.option(
    "-j",
    "--num-jobs",
    type=int,
    default=1,
    help="How many threads to use (can give good speed-ups with slow disks).",
)
# NOTE: the three boolean options below are value-taking options
# (e.g. ``--lowercase true``), not on/off flags. The type is spelled out
# explicitly instead of being inferred from ``default=False``. The dashed
# spelling is listed first for consistency with ``--num-jobs``; the original
# underscore spelling is kept as an alias so existing invocations keep working.
@click.option(
    "--absolute-paths",
    "--absolute_paths",
    type=bool,
    default=False,
    help=" Whether to return absolute or relative (to the corpus dir) paths for recordings.",
)
@click.option(
    "--remove-punc",
    "--remove_punc",
    type=bool,
    default=False,
    help=" Remove punctuations from the text",
)
@click.option(
    "--lowercase",
    type=bool,
    default=False,
    help="Lower case the text",
)
def fisher_spanish_st(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    split_dir: Pathlike,
    output_dir: Pathlike,
    absolute_paths: bool,
    remove_punc: bool,
    lowercase: bool,
    num_jobs: int,
):
    """
    Fisher Spanish data preparation.

    \b
    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
    The catalog number LDC2010S01 for audio corpus and LDC2010T04 for transcripts.

    This data is not available for free - your institution needs to have an LDC subscription.
    You also should download and prepare the pre-defined splits with:

    \b
        git clone https://github.com/joshua-decoder/fisher-callhome-corpus.git
        cd fisher-callhome-corpus
        make
        cd ../
    """
    # The docstring above is the ``--help`` text. Click only honours the
    # ``\b`` (no-rewrap) marker when it is the first line of a paragraph,
    # hence the blank line before each marker; the second marker keeps the
    # shell commands on separate lines.
    #
    # All CLI values are forwarded unchanged, by keyword, to the recipe.
    prepare_fisher_spanish(
        audio_dir_path=audio_dir_path,
        transcript_dir_path=transcript_dir_path,
        split_dir=split_dir,
        output_dir=output_dir,
        absolute_paths=absolute_paths,
        remove_punc=remove_punc,
        lowercase=lowercase,
        num_jobs=num_jobs,
    )
42 changes: 42 additions & 0 deletions lhotse/bin/modes/recipes/seame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Optional, Sequence, Union

import click

from lhotse.bin.modes import prepare
from lhotse.recipes.seame import prepare_seame
from lhotse.utils import Pathlike


@prepare.command(context_settings=dict(show_default=True))
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("split_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
# NOTE: ``--clean-text`` is a value-taking boolean option
# (e.g. ``--clean-text true``), not an on/off flag. Both option types are
# spelled out explicitly instead of being inferred from the defaults.
@click.option(
    "--clean-text",
    type=bool,
    default=False,
    help="Whether to perform additional text cleaning and normalization",
)
@click.option(
    "--delimiter",
    type=str,
    default="",
    help="Used to split the code switching text ",
)
def seame(
    corpus_dir: Pathlike,
    split_dir: Pathlike,
    clean_text: bool,
    delimiter: str,
    output_dir: Pathlike,
):
    """
    SEAME data preparation.

    \b
    This is Singaporean Codeswitched English and Mandarin data.
    """
    # The docstring above is the ``--help`` text. Click only honours the
    # ``\b`` (no-rewrap) marker when it is the first line of a paragraph,
    # hence the blank line before it.
    #
    # All CLI values are forwarded unchanged, by keyword, to the recipe.
    prepare_seame(
        corpus_dir=corpus_dir,
        split_dir=split_dir,
        output_dir=output_dir,
        clean_text=clean_text,
        delimiter=delimiter,
    )
45 changes: 33 additions & 12 deletions lhotse/dataset/speech_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
cut_transforms: List[Callable[[CutSet], CutSet]] = None,
input_transforms: List[Callable[[torch.Tensor], torch.Tensor]] = None,
input_strategy: BatchIO = PrecomputedFeatures(),
lid: bool = False,
):
"""
k2 ASR IterableDataset constructor.
Expand All @@ -78,13 +79,15 @@ def __init__(
Examples: normalization, SpecAugment, etc.
:param input_strategy: Converts cuts into a collated batch of audio/features.
By default, reads pre-computed features from disk.
:param lid: adding lid information to the batch.
"""
super().__init__()
# Initialize the fields
self.return_cuts = return_cuts
self.cut_transforms = ifnone(cut_transforms, [])
self.input_transforms = ifnone(input_transforms, [])
self.input_strategy = input_strategy
self.lid = lid

# This attribute is a workaround to constantly growing HDF5 memory
# throughout the epoch. It regularly closes open file handles to
Expand Down Expand Up @@ -132,19 +135,37 @@ def __getitem__(self, cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]
segments = torch.stack(list(supervision_intervals.values()), dim=1)
for tnfm in self.input_transforms:
inputs = tnfm(inputs, supervision_segments=segments)

batch = {
"inputs": inputs,
"supervisions": default_collate(
[
{
"text": supervision.text,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can just add "language": supervision.language on the line below and always return it; that would remove the extra option and the code duplication.

}
for sequence_idx, cut in enumerate(cuts)
if self.lid == True:
batch = {
"inputs": inputs,
"lids": [
supervision.language
for _, cut in enumerate(cuts)
for supervision in cut.supervisions
]
),
}
],
"supervisions": default_collate(
[
{
"text": supervision.text,
}
for sequence_idx, cut in enumerate(cuts)
for supervision in cut.supervisions
]
),
}
else:
batch = {
"inputs": inputs,
"supervisions": default_collate(
[
{
"text": supervision.text,
}
for sequence_idx, cut in enumerate(cuts)
for supervision in cut.supervisions
]
),
}
# Update the 'supervisions' field with sequence_idx and start/num frames/samples
batch["supervisions"].update(supervision_intervals)
if self.return_cuts:
Expand Down
Loading