4 changes: 3 additions & 1 deletion README.md
@@ -5,10 +5,12 @@ This repository aims to centralize scripts that prepare datasets to be used with
Currently available :
- [AISHELL4](aishell4)
- [MSDWild](msdwild)
- [PodcastFillers](podcastfillers)

To setup each dataset, refer to the `README.md` contained in their respective folder.

Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* usage.
Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* or *segmentation* usage.

How these subsets are defined is entirely configurable.

## FAQ
14 changes: 14 additions & 0 deletions podcastfillers/README.md
@@ -0,0 +1,14 @@
# PodcastFillers

Script to download [PodcastFillers](https://zenodo.org/record/7121457#.ZCwUUxXP0qs) dataset and set it up for use with [`pyannote.audio`](https://github.com/pyannote/pyannote-audio).

The `PodcastFillers.Segmentation.Fillers` protocol can be used to train a filler word (`Uh` or `Um`) detection model.
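The filler annotations behind this protocol are stored as RTTM lines (as written by `generate.py` in this PR). A minimal, self-contained sketch of reading such lines back into `(uri, start, duration, label)` tuples, keeping only the two filler classes:

```python
# Minimal sketch: parse RTTM lines into (uri, start, duration, label)
# tuples, keeping only the filler classes used by the protocol.
def read_fillers(lines, classes=("Uh", "Um")):
    events = []
    for line in lines:
        fields = line.split()
        # SPEAKER <uri> <channel> <start> <duration> <NA> <NA> <label> <NA> <NA>
        uri = fields[1]
        start = float(fields[3])
        duration = float(fields[4])
        label = fields[7]
        if label in classes:
            events.append((uri, start, duration, label))
    return events

print(read_fillers(["SPEAKER episode_1 1 12.340 0.250 <NA> <NA> Uh <NA> <NA>"]))
# [('episode_1', 12.34, 0.25, 'Uh')]
```

In practice you would not parse RTTM by hand: pyannote-database loads these files for you once `database.yml` is registered.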

## Instructions

Run `setup.sh` to download and extract the files.


## License and citation

See [PodcastFillers Zenodo page](https://zenodo.org/record/7121457#.ZCwUUxXP0qs).
22 changes: 22 additions & 0 deletions podcastfillers/database.yml
@@ -0,0 +1,22 @@
Databases:
PodcastFillers: symlinks/{uri}.wav

Protocols:
PodcastFillers:
Segmentation:
Fillers:
classes:
- Uh
- Um
train:
uri: train.uris.lst
annotated: train.uem
annotation: consolidated.rttm
development:
uri: validation.uris.lst
annotated: development.uem
annotation: consolidated.rttm
test:
uri: test.uris.lst
annotated: test.uem
annotation: consolidated.rttm
41 changes: 41 additions & 0 deletions podcastfillers/generate.py
@@ -0,0 +1,41 @@
import contextlib
import csv
import sys
import wave

tgt_dir = sys.argv[1]

# pyannote subset name -> PodcastFillers split name
SUBSET = {"train": "train", "development": "validation", "test": "test"}

for subset, slug in SUBSET.items():

    with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f:
        uris = [line.strip() for line in f]

    # write one UEM line per file: "<uri> 1 0.000 <duration>"
    with open(f"{tgt_dir}/{subset}.uem", "w") as f:
        for uri in uris:
            filename = f"{tgt_dir}/symlinks/{uri}.wav"
            with contextlib.closing(wave.open(filename, "r")) as g:
                duration = g.getnframes() / float(g.getframerate())
            f.write(f"{uri} 1 0.000 {duration:.3f}\n")

# convert the CSV annotations to RTTM, once with the full label vocabulary
# and once with the consolidated vocabulary
with open(f"{tgt_dir}/PodcastFillers.csv", "r") as csvfile, open(
    f"{tgt_dir}/full.rttm", "w"
) as rttm_full, open(f"{tgt_dir}/consolidated.rttm", "w") as rttm_consolidated:
    csvreader = csv.reader(csvfile, delimiter=",")

    # skip the header row
    next(csvreader, None)

    for row in csvreader:
        _, _, label_full_vocab, label_consolidated_vocab, filename, start_time, end_time, *_ = row

        # replace spaces with underscores to match the URIs built by setup.sh
        uri = filename.replace(" ", "_")
        start_time = float(start_time)
        end_time = float(end_time)

        rttm_full.write(
            f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} <NA> <NA> {label_full_vocab} <NA> <NA>\n"
        )

        # events without a consolidated label only go to full.rttm
        if label_consolidated_vocab == "None":
            continue

        rttm_consolidated.write(
            f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} <NA> <NA> {label_consolidated_vocab} <NA> <NA>\n"
        )
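The UEM durations written by `generate.py` come from `frames / rate`. A self-contained check of that computation, using a short synthetic wav written to a temporary file (the parameters below are chosen purely for illustration):

```python
import contextlib
import tempfile
import wave

# write a 1-second, 16 kHz, mono, 16-bit silent wav to a temporary file
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    with contextlib.closing(wave.open(tmp, "w")) as w:
        w.setnchannels(1)
        w.setsampwidth(2)
        w.setframerate(16000)
        w.writeframes(b"\x00\x00" * 16000)
    path = tmp.name

# same duration computation as generate.py: frames / rate
with contextlib.closing(wave.open(path, "r")) as g:
    duration = g.getnframes() / float(g.getframerate())

print(f"{duration:.3f}")  # 1.000
```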
49 changes: 49 additions & 0 deletions podcastfillers/setup.sh
@@ -0,0 +1,49 @@
#!/bin/bash



echo "Downloading ..."
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1 -O PodcastFillers.zip
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1 -O PodcastFillers.z01
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1 -O PodcastFillers.z02
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1 -O PodcastFillers.z03
wget -c https://zenodo.org/record/7121457/files/PodcastFillers.csv?download=1 -O PodcastFillers.csv

echo "Extracting ..."
unzip PodcastFillers.zip

echo "Generating URI lists ..."

SYMLINK_DIR="$PWD/symlinks"
mkdir -p "$SYMLINK_DIR"

OLD_IFS="$IFS"
IFS=$'\n'

for SUBSET in test train validation; do

    # delete list of uris if it exists
    if [ -f "$PWD/$SUBSET.uris.lst" ]; then
        rm "$PWD/$SUBSET.uris.lst"
    fi

    # FIXME: this is probably not the right relative path to PodcastFillers
    for file in $(find "$PWD/PodcastFillers/audio/episode_wav/$SUBSET" -name "*.wav" -type f); do

        # build "uri" by removing diacritics, removing extension, and replacing spaces with underscores
        base=$(basename "$file")
        URI=$(echo "$base" | iconv -f utf8 -t ascii//TRANSLIT | sed 's/\.[^.]*$//' | tr ' ' '_')

        # add "uri" to list of uris
        echo "$URI" >> "$PWD/$SUBSET.uris.lst"

        # create symlink (-f so the script can be re-run)
        ln -sf "$file" "$SYMLINK_DIR/$URI.wav"
    done

done

IFS="$OLD_IFS"

echo "Generating UEMs and RTTMs..."
python generate.py $PWD
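The `iconv | sed | tr` pipeline above normalizes episode filenames into URIs. An approximate Python equivalent, using the stdlib `unicodedata` module instead of `iconv` (so transliteration of some non-ASCII characters may differ from `ascii//TRANSLIT`):

```python
import os
import unicodedata

def to_uri(filename):
    # strip diacritics (approximation of iconv's ascii//TRANSLIT)
    ascii_name = (
        unicodedata.normalize("NFKD", filename)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    # drop the extension (like sed 's/\.[^.]*$//') and
    # replace spaces with underscores (like tr ' ' '_')
    stem = os.path.splitext(ascii_name)[0]
    return stem.replace(" ", "_")

print(to_uri("Épisode spécial 01.wav"))  # Episode_special_01
```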