From 245c5becf76f2b776a183cf666a322eda7c7c582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:10:58 +0200 Subject: [PATCH 1/7] feat: add support for PodcastFillers dataset --- podcastfillers/database.yml | 22 +++++++++++++++ podcastfillers/generate.py | 43 +++++++++++++++++++++++++++++ podcastfillers/setup.sh | 55 +++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 podcastfillers/database.yml create mode 100644 podcastfillers/generate.py create mode 100644 podcastfillers/setup.sh diff --git a/podcastfillers/database.yml b/podcastfillers/database.yml new file mode 100644 index 0000000..0e2b0ac --- /dev/null +++ b/podcastfillers/database.yml @@ -0,0 +1,22 @@ +Databases: + PodcastFillers: symlinks/{uri}.wav + +Protocols: + PodcastFillers: + Segmentation: + Fillers: + classes: + - Uh + - Um + train: + uri: train.uris.lst + annotated: train.uem + annotation: consolidated.rttm + development: + uri: development.uris.lst + annotated: development.uem + annotation: consolidated.rttm + test: + uri: test.uris.lst + annotated: test.uem + annotation: consolidated.rttm diff --git a/podcastfillers/generate.py b/podcastfillers/generate.py new file mode 100644 index 0000000..b9b8d32 --- /dev/null +++ b/podcastfillers/generate.py @@ -0,0 +1,43 @@ +import wave +import contextlib +import sys +import csv + +src_dir = sys.argv[1] +tgt_dir = sys.argv[2] + +SUBSET = {"train": "train", "development": "validation", "test": "test"} + + +for subset, slug in SUBSET.items(): + + with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f: + uris = [line.strip() for line in f.readlines()] + + with open(f"{tgt_dir}/{subset}.uem", "w") as f: + for uri in uris: + filename = f"{tgt_dir}/symlinks/{uri}.wav" + with contextlib.closing(wave.open(filename,'r')) as g: + frames = g.getnframes() + rate = g.getframerate() + duration = frames / float(rate) + f.write(f"{uri} 1 0.000 {duration:.3f}\n") + +with 
open(f'{src_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated: + csvreader = csv.reader(csvfile, delimiter=',') + for r, row in enumerate(csvreader): + + if r == 0: + continue + + _, _, label_full_vocab, label_consolidated_vocab, filename, start_time, end_time, *_ = row + uri = filename.replace(' ', '_') + start_time = float(start_time) + end_time = float(end_time) + + rttm_full.write(f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} {label_full_vocab} \n") + + if label_consolidated_vocab == 'None': + continue + + rttm_consolidated.write(f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} {label_consolidated_vocab} \n") diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh new file mode 100644 index 0000000..a3016b2 --- /dev/null +++ b/podcastfillers/setup.sh @@ -0,0 +1,55 @@ +#!/bni/bash + +echo "Downloading ..." +wget -c https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1 -O PodcastFillers.zip +wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1 -O PodcastFillers.z01 +wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1 -O PodcastFillers.z02 +wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1 -O PodcastFillers.z03 + +echo "Extracting ..." +unzip PodcastFillers.zip + +echo "Generatig URI lists ..." 
+ +SRC_DIR=PodcastFillers +# change this if PodcastFillers has been downloaded and extracted elsewhere +# SRC_DIR=/gpfsdswork/dataset/PodcastFillers + +TGT_DIR=/gpfswork/rech/eie/commun/data/PodcastFillers + +SYMLINK_DIR=$TGT_DIR/symlinks +mkdir -p $SYMLINK_DIR + +OLD_IFS="$IFS" +IFS=$'\n' + +for SUBSET in test train validation; do + + # delete list of uris if it exists + if [ -f $TGT_DIR/$SUBSET.uris.lst ]; then + rm $TGT_DIR/$SUBSET.uris.lst + fi + + for file in `find $SRC_DIR/audio/episode_wav/$SUBSET -name "*wav" -type f`; do + + # build "uri" by removing extension and replacing spaces with underscores + base=`basename "$file"` + stem=`echo $base | sed 's/\.[^.]*$//'` + URI=`echo $stem | tr ' ' '_'` + + # add "uri" to list of uris + echo $URI >> $TGT_DIR/$SUBSET.uris.lst + + # create symlink + ln -s "$file" "$SYMLINK_DIR/$URI.wav" + done + +done + +IFS="$OLD_IFS" + +echo "Generating UEMs and RTTMs..." +python generate.py $SRC_DIR $TGT_DIR + + + From 3c3cdb4f9c36b2ad84e4f36d40145e60a79ab6b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:17:27 +0200 Subject: [PATCH 2/7] fix: add missing download and FIXME comments --- podcastfillers/setup.sh | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh index a3016b2..1452c38 100644 --- a/podcastfillers/setup.sh +++ b/podcastfillers/setup.sh @@ -1,23 +1,20 @@ -#!/bni/bash +#!/bin/bash + + echo "Downloading ..." 
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1 -O PodcastFillers.zip wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1 -O PodcastFillers.z01 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1 -O PodcastFillers.z02 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1 -O PodcastFillers.z03 +wget -c https://zenodo.org/record/7121457/files/PodcastFillers.csv?download=1 -O PodcastFillers.csv echo "Extracting ..." unzip PodcastFillers.zip echo "Generatig URI lists ..." -SRC_DIR=PodcastFillers -# change this if PodcastFillers has been downloaded and extracted elsewhere -# SRC_DIR=/gpfsdswork/dataset/PodcastFillers - -TGT_DIR=/gpfswork/rech/eie/commun/data/PodcastFillers - -SYMLINK_DIR=$TGT_DIR/symlinks +SYMLINK_DIR=$PWD/symlinks mkdir -p $SYMLINK_DIR OLD_IFS="$IFS" @@ -26,11 +23,12 @@ IFS=$'\n' for SUBSET in test train validation; do # delete list of uris if it exists - if [ -f $TGT_DIR/$SUBSET.uris.lst ]; then - rm $TGT_DIR/$SUBSET.uris.lst + if [ -f $PWD/$SUBSET.uris.lst ]; then + rm $PWD/$SUBSET.uris.lst fi - for file in `find $SRC_DIR/audio/episode_wav/$SUBSET -name "*wav" -type f`; do + # FIXME: this is probably not the right relative path to PodcastFillers + for file in `find $PWD/PodcastFillers/audio/episode_wav/$SUBSET -name "*wav" -type f`; do # build "uri" by removing extension and replacing spaces with underscores base=`basename "$file"` @@ -38,7 +36,7 @@ for SUBSET in test train validation; do URI=`echo $stem | tr ' ' '_'` # add "uri" to list of uris - echo $URI >> $TGT_DIR/$SUBSET.uris.lst + echo $URI >> $PWD/$SUBSET.uris.lst # create symlink ln -s "$file" "$SYMLINK_DIR/$URI.wav" @@ -49,7 +47,4 @@ done IFS="$OLD_IFS" echo "Generating UEMs and RTTMs..." 
-python generate.py $SRC_DIR $TGT_DIR - - - +python generate.py $PWD From 9e2c3d1dc6df822b07795401db16a75756372274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:18:16 +0200 Subject: [PATCH 3/7] fix: fix list of arguments --- podcastfillers/generate.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/podcastfillers/generate.py b/podcastfillers/generate.py index b9b8d32..4d34840 100644 --- a/podcastfillers/generate.py +++ b/podcastfillers/generate.py @@ -3,12 +3,10 @@ import sys import csv -src_dir = sys.argv[1] -tgt_dir = sys.argv[2] +tgt_dir = sys.argv[1] SUBSET = {"train": "train", "development": "validation", "test": "test"} - for subset, slug in SUBSET.items(): with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f: @@ -23,7 +21,7 @@ duration = frames / float(rate) f.write(f"{uri} 1 0.000 {duration:.3f}\n") -with open(f'{src_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated: +with open(f'{tgt_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated: csvreader = csv.reader(csvfile, delimiter=',') for r, row in enumerate(csvreader): From c75835c2523345fd68424b2a1156432bcfab1af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:22:56 +0200 Subject: [PATCH 4/7] feat: add README --- podcastfillers/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 podcastfillers/README.md diff --git a/podcastfillers/README.md b/podcastfillers/README.md new file mode 100644 index 0000000..2324963 --- /dev/null +++ b/podcastfillers/README.md @@ -0,0 +1,14 @@ +# PodcastFillers + +Script to download [PodcastFillers](https://zenodo.org/record/7121457#.ZCwUUxXP0qs) dataset and set it up for use with [`pyannote.audio`](https://github.com/pyannote/pyannote-audio).
+ +Protocol `PodcastFillers.Segmentation.Fillers` can be used for training a filler word (`Uh` or `Um`) detection model. + +## Instructions + +Run `setup.sh` to download and extract the files. + + +## License and citation + +See [PodcastFillers Zenodo page](https://zenodo.org/record/7121457#.ZCwUUxXP0qs). From 9829d0abede88311eafcb9734eec88621676583d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:24:21 +0200 Subject: [PATCH 5/7] feat: update main README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ff0cbb..30c64a5 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,12 @@ This repository aims to centralize scripts that prepare datasets to be used with Currently available : - [AISHELL4](aishell4) - [MSDWild](msdwild) +- [PodcastFillers](podcastfillers) To setup each dataset, refer to the `README.md` contained in their respective folder. -Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* usage. +Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* or *segmentation* usage. + How these subsets are defined is entirely configurable.
## FAQ From 9fb84705f0b9a3aa74a80bca33621429b41abb04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 14:54:23 +0200 Subject: [PATCH 6/7] fix: fix path to development list --- podcastfillers/database.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podcastfillers/database.yml b/podcastfillers/database.yml index 0e2b0ac..1a27fb2 100644 --- a/podcastfillers/database.yml +++ b/podcastfillers/database.yml @@ -13,7 +13,7 @@ Protocols: annotated: train.uem annotation: consolidated.rttm development: - uri: development.uris.lst + uri: validation.uris.lst annotated: development.uem annotation: consolidated.rttm test: From fa2e21b01419ada9f675f7a0df2a6edd5906001c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Tue, 4 Apr 2023 15:27:39 +0200 Subject: [PATCH 7/7] fix: fix diacritic handling --- podcastfillers/setup.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh index 1452c38..716da4a 100644 --- a/podcastfillers/setup.sh +++ b/podcastfillers/setup.sh @@ -29,11 +29,10 @@ for SUBSET in test train validation; do # FIXME: this is probably not the right relative path to PodcastFillers for file in `find $PWD/PodcastFillers/audio/episode_wav/$SUBSET -name "*wav" -type f`; do - - # build "uri" by removing extension and replacing spaces with underscores + + # build "uri" by removing diacritics, removing extension, and replacing spaces with underscores base=`basename "$file"` - stem=`echo $base | sed 's/\.[^.]*$//'` - URI=`echo $stem | tr ' ' '_'` + URI=`echo $base | iconv -f utf8 -t ascii//TRANSLIT | sed 's/\.[^.]*$//' | tr ' ' '_'` # add "uri" to list of uris echo $URI >> $PWD/$SUBSET.uris.lst