Skip to content

Commit d5307c0

Browse files
committed
cleaner call to ffmpeg
update of README and setup.py to mention inaGVAD
1 parent ef526b6 commit d5307c0

File tree

6 files changed

+70
-43
lines changed

6 files changed

+70
-43
lines changed

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,21 @@ inaSpeechSegmenter has been presented at the IEEE International Conference on Ac
104104
}
105105
```
106106

107+
inaSpeechSegmenter was ranked #1 among 6 open-source VAD systems on a French TV and radio benchmark: LIUM_SpkDiarization, Pyannote, Rvad, Silero, Speechbrain
108+
```bibtex
109+
@inproceedings{doukhan-etal-2024-inagvad,
110+
title = "{I}na{GVAD} : A Challenging {F}rench {TV} and Radio Corpus Annotated for Speech Activity Detection and Speaker Gender Segmentation",
111+
author = "Doukhan, David and Maertens, Christine and Le Personnic, William and Speroni, Ludovic and Dehak, Reda",
112+
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
113+
month = may,
114+
year = "2024",
115+
address = "Torino, Italia",
116+
publisher = "ELRA and ICCL",
117+
url = "https://aclanthology.org/2024.lrec-main.785/",
118+
pages = "8963--8974",
119+
}
120+
```
121+
107122
inaSpeechSegmenter won [MIREX 2018 speech detection challenge](http://www.music-ir.org/mirex/wiki/2018:Music_and_or_Speech_Detection_Results)
108123
Details on the speech detection submodule can be found below:
109124

inaSpeechSegmenter/io.py

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@
2424
# THE SOFTWARE.
2525

2626
import os
27-
import tempfile
28-
from subprocess import Popen, PIPE
27+
from tempfile import TemporaryFile
28+
import subprocess
2929
import soundfile as sf
3030

3131

32-
def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmpeg='ffmpeg', dtype='float64'):
32+
def media2sig16kmono(medianame, start_sec=None, stop_sec=None, ffmpeg='ffmpeg', dtype='float64'):
3333
"""
3434
Convert media to temp wav 16k mono and return signal
3535
"""
@@ -56,26 +56,24 @@ def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmp
5656

5757
base, _ = os.path.splitext(os.path.basename(medianame))
5858

59-
with tempfile.TemporaryDirectory(dir=tmpdir) as tmpdirname:
60-
# build ffmpeg command line
61-
tmpwav = tmpdirname + '/' + base + '.wav'
62-
args = [ffmpeg, '-y', '-i', medianame, '-ar', '16000', '-ac', '1']
63-
if start_sec is None:
64-
start_sec = 0
65-
else:
66-
args += ['-ss', '%f' % start_sec]
67-
68-
if stop_sec is not None:
69-
args += ['-to', '%f' % stop_sec]
70-
args += [tmpwav]
71-
72-
# launch ffmpeg
73-
p = Popen(args, stdout=PIPE, stderr=PIPE)
74-
output, error = p.communicate()
75-
assert p.returncode == 0, error
76-
77-
# Get Mel Power Spectrogram and Energy
78-
sig, sr = sf.read(tmpwav, dtype=dtype)
79-
assert sr == 16000
80-
return sig
8159

60+
# build ffmpeg command
61+
cmd = [ffmpeg, '-i', medianame, '-f', 'wav', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1']
62+
if start_sec is None:
63+
start_sec = 0
64+
else:
65+
cmd += ['-ss', '%f' % start_sec]
66+
if stop_sec is not None:
67+
cmd += ['-to', '%f' % stop_sec]
68+
cmd += ['pipe:1']
69+
70+
with TemporaryFile() as out, TemporaryFile() as err:
71+
ret = subprocess.run(cmd, stdout=out, stderr=err)
72+
if ret.returncode != 0:
73+
err.seek(0)
74+
msg = err.read()
75+
raise Exception(msg)
76+
out.seek(0)
77+
wav_data, fs = sf.read(out, dtype=dtype)
78+
assert(fs == 16000)
79+
return wav_data

inaSpeechSegmenter/segmenter.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@
5050

5151
from .export_funcs import seg2csv, seg2textgrid
5252

53-
def _media2feats(medianame, tmpdir, start_sec, stop_sec, ffmpeg):
54-
sig = media2sig16kmono(medianame, tmpdir, start_sec, stop_sec, ffmpeg, 'float32')
53+
def _media2feats(medianame, start_sec, stop_sec, ffmpeg):
54+
sig = media2sig16kmono(medianame, start_sec, stop_sec, ffmpeg, 'float32')
5555
with warnings.catch_warnings():
5656
# ignore warnings resulting from empty signals parts
5757
warnings.filterwarnings('ignore', message='divide by zero encountered in log', category=RuntimeWarning)
@@ -276,27 +276,25 @@ def segment_feats(self, mspec, loge, difflen, start_sec):
276276
return [(lab, start_sec + start * .02, start_sec + stop * .02) for lab, start, stop in lseg]
277277

278278

279-
def __call__(self, medianame, tmpdir=None, start_sec=None, stop_sec=None):
279+
def __call__(self, medianame, start_sec=None, stop_sec=None):
280280
"""
281281
Return segmentation of a given file
282282
* convert file to wav 16k mono with ffmpeg
283283
* call NN segmentation procedures
284284
* media_name: path to the media to be processed (including remote url)
285285
may include any format supported by ffmpeg
286-
* tmpdir: allow to define a custom path for storing temporary files
287-
fast read/write HD are a good choice
288286
* start_sec (seconds): sound stream before start_sec won't be processed
289287
* stop_sec (seconds): sound stream after stop_sec won't be processed
290288
"""
291289

292-
mspec, loge, difflen = _media2feats(medianame, tmpdir, start_sec, stop_sec, self.ffmpeg)
290+
mspec, loge, difflen = _media2feats(medianame, start_sec, stop_sec, self.ffmpeg)
293291
if start_sec is None:
294292
start_sec = 0
295293
# do segmentation
296294
return self.segment_feats(mspec, loge, difflen, start_sec)
297295

298296

299-
def batch_process(self, linput, loutput, tmpdir=None, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
297+
def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
300298

301299
if verbose:
302300
print('batch_processing %d files' % len(linput))
@@ -311,7 +309,7 @@ def batch_process(self, linput, loutput, tmpdir=None, verbose=False, skipifexist
311309
t_batch_start = time.time()
312310

313311
lmsg = []
314-
fg = featGenerator(linput.copy(), loutput.copy(), tmpdir, self.ffmpeg, skipifexist, nbtry, trydelay)
312+
fg = featGenerator(linput.copy(), loutput.copy(), self.ffmpeg, skipifexist, nbtry, trydelay)
315313
i = 0
316314
for feats, msg in fg:
317315
lmsg += msg
@@ -337,7 +335,7 @@ def batch_process(self, linput, loutput, tmpdir=None, verbose=False, skipifexist
337335
return t_batch_dur, nb_processed, avg, lmsg
338336

339337

340-
def medialist2feats(lin, lout, tmpdir, ffmpeg, skipifexist, nbtry, trydelay):
338+
def medialist2feats(lin, lout, ffmpeg, skipifexist, nbtry, trydelay):
341339
"""
342340
To be used when processing batches
343341
if resulting file exists, it is skipped
@@ -362,7 +360,7 @@ def medialist2feats(lin, lout, tmpdir, ffmpeg, skipifexist, nbtry, trydelay):
362360
itry = 0
363361
while ret is None and itry < nbtry:
364362
try:
365-
ret = _media2feats(src, tmpdir, None, None, ffmpeg)
363+
ret = _media2feats(src, None, None, ffmpeg)
366364
except:
367365
itry += 1
368366
errmsg = sys.exc_info()[0]
@@ -376,14 +374,14 @@ def medialist2feats(lin, lout, tmpdir, ffmpeg, skipifexist, nbtry, trydelay):
376374
return ret, msg
377375

378376

379-
def featGenerator(ilist, olist, tmpdir=None, ffmpeg='ffmpeg', skipifexist=False, nbtry=1, trydelay=2.):
380-
thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, tmpdir, ffmpeg, skipifexist, nbtry, trydelay])
377+
def featGenerator(ilist, olist, ffmpeg='ffmpeg', skipifexist=False, nbtry=1, trydelay=2.):
378+
thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, ffmpeg, skipifexist, nbtry, trydelay])
381379
thread.start()
382380
while True:
383381
ret, msg = thread.join()
384382
if len(ilist) == 0:
385383
break
386-
thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, tmpdir, ffmpeg, skipifexist, nbtry, trydelay])
384+
thread = ThreadReturning(target = medialist2feats, args=[ilist, olist, ffmpeg, skipifexist, nbtry, trydelay])
387385
thread.start()
388386
yield ret, msg
389387
yield ret, msg

inaSpeechSegmenter/vbx_segmenter.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def apply_vad(self, xvectors, a_vad):
144144
# Add vectors with vad-overlap if too many predictions have been removed
145145
return add_needed_vectors(n_xvectors, midpoint_seg)
146146

147-
def __call__(self, fpath, tmpdir=None):
147+
def __call__(self, fpath):
148148
"""
149149
Return Voice Femininity Score of a given file with values before last sigmoid activation :
150150
* convert file to wav 16k mono with ffmpeg
@@ -154,10 +154,10 @@ def __call__(self, fpath, tmpdir=None):
154154
* apply gender detection model and compute femininity score
155155
* return score, duration of detected speech and number of retained x-vectors
156156
"""
157-
basename, ext = os.path.splitext(os.path.basename(fpath))[0], os.path.splitext(os.path.basename(fpath))[1]
157+
basename, _ = os.path.splitext(os.path.basename(fpath))[0], os.path.splitext(os.path.basename(fpath))[1]
158158

159159
# Read "wav" file
160-
signal = media2sig16kmono(fpath, tmpdir, dtype="float64")
160+
signal = media2sig16kmono(fpath, dtype="float64")
161161
duration = len(signal) / SR
162162

163163
# Applying voice activity detection
@@ -230,7 +230,7 @@ def __call__(self, basename, fea, duration):
230230
seg_end = round(start / 100.0 + WINLEN / 100.0, 3)
231231
xvectors.append((key, (seg_start, seg_end), xvector))
232232

233-
#  Last segment
233+
# Last segment
234234
if len(fea) - start - STEP >= 10:
235235
data = fea[start + STEP:len(fea)]
236236
xvector = self.get_embedding(data)

run_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def test_execution(self):
5555
def test_silence_features(self):
5656
# test empty signal do not result in warnings
5757
with warnings.catch_warnings(record=True) as w:
58-
_media2feats('./media/silence2sec.wav', None, None, None, 'ffmpeg')
58+
_media2feats('./media/silence2sec.wav', None, None, 'ffmpeg')
5959
assert len(w) == 0, [str(e) for e in w]
6060

6161

setup.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,22 @@
9292
}
9393
```
9494
95+
inaSpeechSegmenter was ranked #1 among 6 open-source VAD systems on a French TV and
96+
radio benchmark : LIUM_SpkDiarization, Pyannote, Rvad, Silero, Speechbrain
97+
```bibtex
98+
@inproceedings{doukhan-etal-2024-inagvad,
99+
title = "{I}na{GVAD} : A Challenging {F}rench {TV} and Radio Corpus Annotated for Speech Activity Detection and Speaker Gender Segmentation",
100+
author = "Doukhan, David and Maertens, Christine and Le Personnic, William and Speroni, Ludovic and Dehak, Reda",
101+
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
102+
month = may,
103+
year = "2024",
104+
address = "Torino, Italia",
105+
publisher = "ELRA and ICCL",
106+
url = "https://aclanthology.org/2024.lrec-main.785/",
107+
pages = "8963--8974",
108+
}
109+
```
110+
95111
inaSpeechSegmenter won MIREX 2018 speech detection challenge.
96112
http://www.music-ir.org/mirex/wiki/2018:Music_and_or_Speech_Detection_Results
97113
Details on the speech detection submodule can be found below:

0 commit comments

Comments
 (0)