From 245c5becf76f2b776a183cf666a322eda7c7c582 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:10:58 +0200
Subject: [PATCH 1/7] feat: add support for PodcastFillers dataset

---
 podcastfillers/database.yml | 22 +++++++++++++++
 podcastfillers/generate.py  | 43 +++++++++++++++++++++++++++++
 podcastfillers/setup.sh     | 55 +++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+)
 create mode 100644 podcastfillers/database.yml
 create mode 100644 podcastfillers/generate.py
 create mode 100644 podcastfillers/setup.sh
diff --git a/podcastfillers/database.yml b/podcastfillers/database.yml
new file mode 100644
index 0000000..0e2b0ac
--- /dev/null
+++ b/podcastfillers/database.yml
@@ -0,0 +1,22 @@
+Databases:
+  PodcastFillers: symlinks/{uri}.wav
+
+Protocols:
+  PodcastFillers:
+    Segmentation:
+      Fillers:
+        classes:
+          - Uh
+          - Um
+        train:
+          uri: train.uris.lst
+          annotated: train.uem
+          annotation: consolidated.rttm
+        development:
+          uri: development.uris.lst
+          annotated: development.uem
+          annotation: consolidated.rttm
+        test:
+          uri: test.uris.lst
+          annotated: test.uem
+          annotation: consolidated.rttm
diff --git a/podcastfillers/generate.py b/podcastfillers/generate.py
new file mode 100644
index 0000000..b9b8d32
--- /dev/null
+++ b/podcastfillers/generate.py
@@ -0,0 +1,43 @@
+import wave
+import contextlib
+import sys
+import csv
+
+src_dir = sys.argv[1]
+tgt_dir = sys.argv[2]
+
+SUBSET = {"train": "train", "development": "validation", "test": "test"}
+
+
+for subset, slug in SUBSET.items():
+
+    with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f:
+        uris = [line.strip() for line in f.readlines()]
+
+    with open(f"{tgt_dir}/{subset}.uem", "w") as f:
+        for uri in uris:
+            filename = f"{tgt_dir}/symlinks/{uri}.wav"
+            with contextlib.closing(wave.open(filename,'r')) as g:
+                frames = g.getnframes()
+                rate = g.getframerate()
+                duration = frames / float(rate)
+                f.write(f"{uri} 1 0.000 {duration:.3f}\n")
+
+with open(f'{src_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated:
+    csvreader = csv.reader(csvfile, delimiter=',')
+    for r, row in enumerate(csvreader):
+        
+        if r == 0:
+            continue
+        
+        _, _, label_full_vocab, label_consolidated_vocab, filename, start_time, end_time, *_ = row
+        uri = filename.replace(' ', '_')
+        start_time = float(start_time)
+        end_time = float(end_time)
+
+        rttm_full.write(f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} <NA> <NA> {label_full_vocab} <NA> <NA>\n")
+
+        if label_consolidated_vocab == 'None':
+            continue
+
+        rttm_consolidated.write(f"SPEAKER {uri} 1 {start_time:.3f} {end_time - start_time:.3f} <NA> <NA> {label_consolidated_vocab} <NA> <NA>\n")
diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh
new file mode 100644
index 0000000..a3016b2
--- /dev/null
+++ b/podcastfillers/setup.sh
@@ -0,0 +1,55 @@
+#!/bni/bash
+
+echo "Downloading ..."
+wget -c https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1 -O PodcastFillers.zip
+wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1 -O PodcastFillers.z01
+wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1 -O PodcastFillers.z02
+wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1 -O PodcastFillers.z03
+
+echo "Extracting ..."
+unzip PodcastFillers.zip
+
+echo "Generatig URI lists ..."
+
+SRC_DIR=PodcastFillers
+# change this if PodcastFillers has been downloaded and extracted elsewhere
+# SRC_DIR=/gpfsdswork/dataset/PodcastFillers
+
+TGT_DIR=/gpfswork/rech/eie/commun/data/PodcastFillers
+
+SYMLINK_DIR=$TGT_DIR/symlinks
+mkdir -p $SYMLINK_DIR
+
+OLD_IFS="$IFS"
+IFS=$'\n'
+
+for SUBSET in test train validation; do
+
+    # delete list of uris if it exists
+    if [ -f $TGT_DIR/$SUBSET.uris.lst ]; then
+        rm $TGT_DIR/$SUBSET.uris.lst
+    fi
+
+    for file in `find $SRC_DIR/audio/episode_wav/$SUBSET -name "*wav" -type f`; do
+
+        # build "uri" by removing extension and replacing spaces with underscores        
+        base=`basename "$file"`
+        stem=`echo $base | sed 's/\.[^.]*$//'`
+        URI=`echo $stem | tr ' ' '_'`
+
+        # add "uri" to list of uris
+        echo $URI >> $TGT_DIR/$SUBSET.uris.lst 
+
+        # create symlink
+        ln -s "$file" "$SYMLINK_DIR/$URI.wav"
+    done
+    
+done
+
+IFS="$OLD_IFS"
+
+echo "Generating UEMs and RTTMs..."
+python generate.py $SRC_DIR $TGT_DIR
+
+
+

From 3c3cdb4f9c36b2ad84e4f36d40145e60a79ab6b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:17:27 +0200
Subject: [PATCH 2/7] fix: add missing download and FIXME comments

---
 podcastfillers/setup.sh | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh
index a3016b2..1452c38 100644
--- a/podcastfillers/setup.sh
+++ b/podcastfillers/setup.sh
@@ -1,23 +1,20 @@
-#!/bni/bash
+#!/bin/bash
+
+
 
 echo "Downloading ..."
 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.zip?download=1 -O PodcastFillers.zip
 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z01?download=1 -O PodcastFillers.z01
 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z02?download=1 -O PodcastFillers.z02
 wget -c https://zenodo.org/record/7121457/files/PodcastFillers.z03?download=1 -O PodcastFillers.z03
+wget -c https://zenodo.org/record/7121457/files/PodcastFillers.csv?download=1 -O PodcastFillers.csv
 
 echo "Extracting ..."
 unzip PodcastFillers.zip
 
 echo "Generatig URI lists ..."
 
-SRC_DIR=PodcastFillers
-# change this if PodcastFillers has been downloaded and extracted elsewhere
-# SRC_DIR=/gpfsdswork/dataset/PodcastFillers
-
-TGT_DIR=/gpfswork/rech/eie/commun/data/PodcastFillers
-
-SYMLINK_DIR=$TGT_DIR/symlinks
+SYMLINK_DIR=$PWD/symlinks
 mkdir -p $SYMLINK_DIR
 
 OLD_IFS="$IFS"
@@ -26,11 +23,12 @@ IFS=$'\n'
 for SUBSET in test train validation; do
 
     # delete list of uris if it exists
-    if [ -f $TGT_DIR/$SUBSET.uris.lst ]; then
-        rm $TGT_DIR/$SUBSET.uris.lst
+    if [ -f $PWD/$SUBSET.uris.lst ]; then
+        rm $PWD/$SUBSET.uris.lst
     fi
 
-    for file in `find $SRC_DIR/audio/episode_wav/$SUBSET -name "*wav" -type f`; do
+    # FIXME: this is probably not the right relative path to PodcastFillers
+    for file in `find $PWD/PodcastFillers/audio/episode_wav/$SUBSET -name "*wav" -type f`; do
 
         # build "uri" by removing extension and replacing spaces with underscores        
         base=`basename "$file"`
@@ -38,7 +36,7 @@ for SUBSET in test train validation; do
         URI=`echo $stem | tr ' ' '_'`
 
         # add "uri" to list of uris
-        echo $URI >> $TGT_DIR/$SUBSET.uris.lst 
+        echo $URI >> $PWD/$SUBSET.uris.lst 
 
         # create symlink
         ln -s "$file" "$SYMLINK_DIR/$URI.wav"
@@ -49,7 +47,4 @@ done
 IFS="$OLD_IFS"
 
 echo "Generating UEMs and RTTMs..."
-python generate.py $SRC_DIR $TGT_DIR
-
-
-
+python generate.py $PWD

From 9e2c3d1dc6df822b07795401db16a75756372274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:18:16 +0200
Subject: [PATCH 3/7] fix: fix list of arguments

---
 podcastfillers/generate.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/podcastfillers/generate.py b/podcastfillers/generate.py
index b9b8d32..4d34840 100644
--- a/podcastfillers/generate.py
+++ b/podcastfillers/generate.py
@@ -3,12 +3,10 @@
 import sys
 import csv
 
-src_dir = sys.argv[1]
-tgt_dir = sys.argv[2]
+tgt_dir = sys.argv[1]
 
 SUBSET = {"train": "train", "development": "validation", "test": "test"}
 
-
 for subset, slug in SUBSET.items():
 
     with open(f"{tgt_dir}/{slug}.uris.lst", "r") as f:
@@ -23,7 +21,7 @@
                 duration = frames / float(rate)
                 f.write(f"{uri} 1 0.000 {duration:.3f}\n")
 
-with open(f'{src_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated:
+with open(f'{tgt_dir}/PodcastFillers.csv', 'r') as csvfile, open(f'{tgt_dir}/full.rttm', 'w') as rttm_full, open(f'{tgt_dir}/consolidated.rttm', 'w') as rttm_consolidated:
     csvreader = csv.reader(csvfile, delimiter=',')
     for r, row in enumerate(csvreader):
         

From c75835c2523345fd68424b2a1156432bcfab1af1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:22:56 +0200
Subject: [PATCH 4/7] feat: add README

---
 podcastfillers/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 podcastfillers/README.md

diff --git a/podcastfillers/README.md b/podcastfillers/README.md
new file mode 100644
index 0000000..2324963
--- /dev/null
+++ b/podcastfillers/README.md
@@ -0,0 +1,14 @@
+# PodcastFillers
+
+Script to download [PodcastFillers](https://zenodo.org/record/7121457#.ZCwUUxXP0qs) dataset and set it up for use with [`pyannote.audio`](https://github.com/pyannote/pyannote-audio).
+
+Protocol `PodcastFillers.Segmentation.Fillers` can be used for training a filler word (`Uh` or `Um`) detection model.
+
+## Instructions
+
+Run `setup.sh` from this directory to download and extract the files, then generate the URI lists, UEM and RTTM files.
+
+
+## License and citation
+
+See [PodcastFillers Zenodo page](https://zenodo.org/record/7121457#.ZCwUUxXP0qs).

From 9829d0abede88311eafcb9734eec88621676583d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:24:21 +0200
Subject: [PATCH 5/7] feat: update main README

---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2ff0cbb..30c64a5 100644
--- a/README.md
+++ b/README.md
@@ -5,10 +5,12 @@ This repository aims to centralize scripts that prepare datasets to be used with
 Currently available : 
 - [AISHELL4](aishell4)
 - [MSDWild](msdwild)
+- [PodcastFillers](podcastfillers)
 
 To setup each dataset, refer to the `README.md` contained in their respective folder.
 
-Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* usage.
+Each dataset comes with its predefined `database.yml`, containing pyannote-database protocol(s) with already defined train+dev+test sets for out-of-the-box *speaker diarization* or *segmentation* usage.
+
 How these subsets are defined is entirely configurable.
 
 ## FAQ

From 9fb84705f0b9a3aa74a80bca33621429b41abb04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 14:54:23 +0200
Subject: [PATCH 6/7] fix: fix path to development list

---
 podcastfillers/database.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/podcastfillers/database.yml b/podcastfillers/database.yml
index 0e2b0ac..1a27fb2 100644
--- a/podcastfillers/database.yml
+++ b/podcastfillers/database.yml
@@ -13,7 +13,7 @@ Protocols:
           annotated: train.uem
           annotation: consolidated.rttm
         development:
-          uri: development.uris.lst
+          uri: validation.uris.lst
           annotated: development.uem
           annotation: consolidated.rttm
         test:

From fa2e21b01419ada9f675f7a0df2a6edd5906001c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Tue, 4 Apr 2023 15:27:39 +0200
Subject: [PATCH 7/7] fix: fix diacritic handling

---
 podcastfillers/setup.sh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/podcastfillers/setup.sh b/podcastfillers/setup.sh
index 1452c38..716da4a 100644
--- a/podcastfillers/setup.sh
+++ b/podcastfillers/setup.sh
@@ -29,11 +29,10 @@ for SUBSET in test train validation; do
 
     # FIXME: this is probably not the right relative path to PodcastFillers
     for file in `find $PWD/PodcastFillers/audio/episode_wav/$SUBSET -name "*wav" -type f`; do
-
-        # build "uri" by removing extension and replacing spaces with underscores        
+ 
+        # build "uri" by removing diacritics, removing extension, and replacing spaces with underscores
         base=`basename "$file"`
-        stem=`echo $base | sed 's/\.[^.]*$//'`
-        URI=`echo $stem | tr ' ' '_'`
+        URI=`echo $base | iconv -f utf8 -t ascii//TRANSLIT | sed 's/\.[^.]*$//' | tr ' ' '_'`
 
         # add "uri" to list of uris
         echo $URI >> $PWD/$SUBSET.uris.lst