diff --git a/README.md b/README.md
index 2ff0cbb..30c64a5 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,12 @@ This repository aims to centralize scripts that prepare datasets to be used with
 Currently available :
 - [AISHELL4](aishell4)
 - [MSDWild](msdwild)
+- [PodcastFillers](podcastfillers)
 
 To setup each dataset, refer to the `README.md` contained in their respective folder.
 
-Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* usage.
+Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* or *segmentation* usage.
+
 How these subsets are defined is entirely configurable.
 
 ## FAQ
diff --git a/podcastfillers/README.md b/podcastfillers/README.md
new file mode 100644
index 0000000..2324963
--- /dev/null
+++ b/podcastfillers/README.md
@@ -0,0 +1,14 @@
+# PodcastFillers
+
+Script to download the [PodcastFillers](https://zenodo.org/record/7121457#.ZCwUUxXP0qs) dataset and set it up for use with [`pyannote.audio`](https://github.com/pyannote/pyannote-audio).
+
+Protocol `PodcastFillers.Segmentation.Fillers` can be used for training a filler word (`Uh` or `Um`) detection model.
+
+## Instructions
+
+Run `setup.sh` to download and extract the files.
+
+
+## License and citation
+
+See [PodcastFillers Zenodo page](https://zenodo.org/record/7121457#.ZCwUUxXP0qs).
diff --git a/podcastfillers/database.yml b/podcastfillers/database.yml
new file mode 100644
index 0000000..1a27fb2
--- /dev/null
+++ b/podcastfillers/database.yml
@@ -0,0 +1,22 @@
+Databases:
+  PodcastFillers: symlinks/{uri}.wav
+
+Protocols:
+  PodcastFillers:
+    Segmentation:
+      Fillers:
+        classes:
+          - Uh
+          - Um
+        train:
+          uri: train.uris.lst
+          annotated: train.uem
+          annotation: consolidated.rttm
+        development:
+          uri: validation.uris.lst
+          annotated: development.uem
+          annotation: consolidated.rttm
+        test:
+          uri: test.uris.lst
+          annotated: test.uem
+          annotation: consolidated.rttm
diff --git a/podcastfillers/generate.py b/podcastfillers/generate.py
new file mode 100644
index 0000000..4d34840
--- /dev/null
+++ b/podcastfillers/generate.py
@@ -0,0 +1,71 @@
+"""Generate UEM and RTTM files for the PodcastFillers dataset.
+
+Usage: python generate.py TGT_DIR
+
+TGT_DIR must contain PodcastFillers.csv, the {train,validation,test}.uris.lst
+lists and the symlinks/ directory. It receives {train,development,test}.uem,
+full.rttm and consolidated.rttm.
+"""
+
+import wave
+import contextlib
+import sys
+import csv
+
+
+def rttm_line(uri, start_time, end_time, label):
+    """Return the RTTM line describing one labelled segment.
+
+    RTTM is whitespace-delimited with 10 fields per line: spaces in the label
+    are replaced with underscores and unused fields are set to <NA>.
+    """
+    label = label.replace(" ", "_")
+    duration = end_time - start_time
+    return f"SPEAKER {uri} 1 {start_time:.3f} {duration:.3f} <NA> <NA> {label} <NA> <NA>\n"
+
+
+tgt_dir = sys.argv[1]
+
+# subset name used in database.yml -> prefix of the matching *.uris.lst file
+SUBSET = {"train": "train", "development": "validation", "test": "test"}
+
+for subset, slug in SUBSET.items():
+
+    with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f:
+        # ignore blank lines so that they do not end up as empty URIs
+        uris = [line.strip() for line in f if line.strip()]
+
+    # one UEM line per file, spanning the whole duration of the file
+    with open(f"{tgt_dir}/{subset}.uem", "w") as f:
+        for uri in uris:
+            filename = f"{tgt_dir}/symlinks/{uri}.wav"
+            with contextlib.closing(wave.open(filename, "rb")) as g:
+                duration = g.getnframes() / float(g.getframerate())
+            f.write(f"{uri} 1 0.000 {duration:.3f}\n")
+
+# newline="" is what the csv module expects for files handed to csv.reader
+with open(f"{tgt_dir}/PodcastFillers.csv", "r", newline="") as csvfile, open(f"{tgt_dir}/full.rttm", "w") as rttm_full, open(f"{tgt_dir}/consolidated.rttm", "w") as rttm_consolidated:
+    csvreader = csv.reader(csvfile, delimiter=",")
+    next(csvreader, None)  # skip the header row
+
+    for row in csvreader:
+
+        if not row:
+            continue  # tolerate blank lines
+
+        _, _, label_full_vocab, label_consolidated_vocab, filename, start_time, end_time, *_ = row
+
+        # NOTE(review): setup.sh also transliterates non-ASCII characters when it
+        # builds URIs; if filenames in the CSV can contain such characters, these
+        # URIs will not match the ones in *.uris.lst -- confirm.
+        uri = filename.replace(" ", "_")
+        start_time = float(start_time)
+        end_time = float(end_time)
+
+        rttm_full.write(rttm_line(uri, start_time, end_time, label_full_vocab))
+
+        # events without a consolidated label are only written to full.rttm
+        if label_consolidated_vocab == "None":
+            continue
+
+        rttm_consolidated.write(rttm_line(uri, start_time, end_time, label_consolidated_vocab))
diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh
new file mode 100644
index 0000000..716da4a
--- /dev/null
+++ b/podcastfillers/setup.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Download PodcastFillers, extract it, then generate the URI lists, symlinks,
+# UEMs and RTTMs used by database.yml. Run it from the directory containing
+# generate.py.
+
+echo "Downloading ..."
+# URLs are quoted because they contain "?", which is a glob character
+wget -c "https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1" -O PodcastFillers.zip
+wget -c "https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1" -O PodcastFillers.z01
+wget -c "https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1" -O PodcastFillers.z02
+wget -c "https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1" -O PodcastFillers.z03
+wget -c "https://zenodo.org/record/7121457/files/PodcastFillers.csv?download=1" -O PodcastFillers.csv
+
+echo "Extracting ..."
+# PodcastFillers.zip is one part of a split archive (with .z01 to .z03), which
+# unzip cannot read directly: merge all parts into a single archive first
+zip -s 0 PodcastFillers.zip --out PodcastFillers-full.zip
+unzip PodcastFillers-full.zip
+
+echo "Generating URI lists ..."
+
+SYMLINK_DIR="$PWD/symlinks"
+mkdir -p "$SYMLINK_DIR"
+
+# split the output of find on newlines only, as file names may contain spaces
+OLD_IFS="$IFS"
+IFS=$'\n'
+
+for SUBSET in test train validation; do
+
+    # delete list of uris if it exists
+    rm -f "$PWD/$SUBSET.uris.lst"
+
+    # FIXME: this is probably not the right relative path to PodcastFillers
+    for file in $(find "$PWD/PodcastFillers/audio/episode_wav/$SUBSET" -name "*.wav" -type f); do
+
+        # build "uri" by removing diacritics, removing extension, and replacing spaces with underscores
+        base=$(basename "$file")
+        URI=$(printf '%s\n' "$base" | iconv -f utf8 -t ascii//TRANSLIT | sed 's/\.[^.]*$//' | tr ' ' '_')
+
+        # add "uri" to list of uris
+        printf '%s\n' "$URI" >> "$PWD/$SUBSET.uris.lst"
+
+        # create symlink (-f replaces symlinks left by a previous run)
+        ln -sf "$file" "$SYMLINK_DIR/$URI.wav"
+    done
+
+done
+
+IFS="$OLD_IFS"
+
+echo "Generating UEMs and RTTMs..."
+python generate.py "$PWD"