Skip to content

Commit 2cac771

Browse files
authored
Merge pull request #89 from valentinp72/master
Added options to bypass ffmpeg usage in some cases; Pyro client now uses argparse
2 parents 69a78bc + 76df66e commit 2cac771

File tree

6 files changed

+83
-18
lines changed

6 files changed

+83
-18
lines changed

inaSpeechSegmenter/io.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,26 @@ def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmp
3434
Convert media to temp wav 16k mono and return signal
3535
"""
3636

37+
if ffmpeg is None:
38+
if start_sec is not None or stop_sec is not None:
39+
raise NotImplementedError(
40+
f'start_sec={start_sec} and stop_sec={stop_sec} cannot be set ' \
41+
f' when running inaSpeechSegmenter without ffmpeg. Please cut '\
42+
f'down your audio files beforehand or use ffmpeg.'
43+
)
44+
if medianame.startswith('http://') or medianame.startswith('https://'):
45+
raise NotImplementedError(
46+
f'Without ffmpeg you cannot process media content on http ' \
47+
f'servers. You need to download your audio files beforehand ' \
48+
f'or use ffmpeg. You gave medianame={medianame}.'
49+
)
50+
51+
sig, sr = sf.read(medianame, dtype=dtype)
52+
assert sr == 16_000, \
53+
f'Without ffmpeg, inaSpeechSegmenter can only take files sampled ' \
54+
f'at 16000 Hz. The file {medianame} is sampled at {sr} Hz.'
55+
return sig
56+
3757
base, _ = os.path.splitext(os.path.basename(medianame))
3858

3959
with tempfile.TemporaryDirectory(dir=tmpdir) as tmpdirname:
@@ -57,4 +77,5 @@ def media2sig16kmono(medianame, tmpdir=None, start_sec=None, stop_sec=None, ffmp
5777
# Get Mel Power Spectrogram and Energy
5878
sig, sr = sf.read(tmpwav, dtype=dtype)
5979
assert sr == 16000
60-
return sig
80+
return sig
81+

inaSpeechSegmenter/segmenter.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -224,9 +224,10 @@ def __init__(self, vad_engine='smn', detect_gender=True, ffmpeg='ffmpeg', batch_
224224
default value (32) is slow, but works on any hardware
225225
"""
226226

227-
# test ffmpeg installation
228-
if shutil.which(ffmpeg) is None:
229-
raise(Exception("""ffmpeg program not found"""))
227+
if ffmpeg is not None:
228+
# test ffmpeg installation
229+
if shutil.which(ffmpeg) is None:
230+
raise(Exception("""ffmpeg program not found"""))
230231
self.ffmpeg = ffmpeg
231232

232233
# set energic ratio for 1st VAD

media/musanmix.wav

2.27 MB
Binary file not shown.

run_test.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ def seg2str(iseg, tseg):
7979
self.assertEqual(curstop, nextstart,
8080
'%s VS %s' % (seg2str(i, ret[i]), seg2str(i+1, ret[i+1])))
8181

82+
seg = Segmenter(ffmpeg=None)
83+
ret = seg('./media/musanmix.wav')
84+
for i in range(len(ret) -1):
85+
curstop = ret[i][2]
86+
nextstart = ret[i+1][1]
87+
self.assertEqual(curstop, nextstart,
88+
'%s VS %s' % (seg2str(i, ret[i]), seg2str(i+1, ret[i+1])))
89+
8290
def test_processingresult(self):
8391
seg = Segmenter(vad_engine='sm')
8492
ret = seg('./media/musanmix.mp3')
@@ -87,6 +95,14 @@ def test_processingresult(self):
8795
self.assertEqual([e[0] for e in ref], [e[0] for e in ret])
8896
np.testing.assert_almost_equal([e[1] for e in ref], [e[1] for e in ret])
8997
np.testing.assert_almost_equal([e[2] for e in ref], [e[2] for e in ret])
98+
99+
seg = Segmenter(vad_engine='sm', ffmpeg=None)
100+
ret = seg('./media/musanmix.wav')
101+
df = pd.read_csv('./media/musanmix-sm-gender.csv', sep='\t')
102+
ref = [(l.labels, float(l.start), float(l.stop)) for _, l in df.iterrows()]
103+
self.assertEqual([e[0] for e in ref], [e[0] for e in ret])
104+
np.testing.assert_almost_equal([e[1] for e in ref], [e[1] for e in ret])
105+
np.testing.assert_almost_equal([e[2] for e in ref], [e[2] for e in ret])
90106

91107
def test_batch(self):
92108
seg = Segmenter(vad_engine='sm')
@@ -96,6 +112,12 @@ def test_batch(self):
96112
self.assertTrue(filecmp.cmp(lout[0], lout[1]))
97113
self.assertTrue(filecmp.cmp(lout[0], './media/musanmix-sm-gender.csv'))
98114

115+
seg = Segmenter(vad_engine='sm', ffmpeg=None)
116+
with tempfile.TemporaryDirectory() as tmpdirname:
117+
lout = [os.path.join(tmpdirname, '1.1.csv'), os.path.join(tmpdirname, '2.1.csv')]
118+
ret = seg.batch_process(['./media/musanmix.wav', './media/musanmix.wav'], lout)
119+
self.assertTrue(filecmp.cmp(lout[0], lout[1]))
120+
self.assertTrue(filecmp.cmp(lout[0], './media/musanmix-sm-gender.csv'))
99121

100122
def test_praat_export(self):
101123
seg = Segmenter()

scripts/ina_speech_segmenter.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,16 @@
4848
parser.add_argument('-s', '--batch_size', type=int, default=32, help="(default: 32 - we recommend 1024). Size of batches to be sent to the GPU. Larger values allow faster processings, but require GPU with more memories. Default 32 size is fine even with a baseline laptop GPU.")
4949
parser.add_argument('-d', '--vad_engine', choices=['sm', 'smn'], default='smn', help="Voice activity detection (VAD) engine to be used (default: 'smn'). 'smn' split signal into 'speech', 'music' and 'noise' (better). 'sm' split signal into 'speech' and 'music' and do not take noise into account, which is either classified as music or speech. Results presented in ICASSP were obtained using 'sm' option")
5050
parser.add_argument('-g', '--detect_gender', choices = ['true', 'false'], default='True', help="(default: 'true'). If set to 'true', segments detected as speech will be splitted into 'male' and 'female' segments. If set to 'false', segments corresponding to speech will be labelled as 'speech' (faster)")
51-
parser.add_argument('-b', '--ffmpeg_binary', default='ffmpeg', help='Your custom binary of ffmpeg', required=False)
51+
parser.add_argument('-b', '--ffmpeg_binary', default='ffmpeg', help='Your custom binary of ffmpeg. Set `None` to disable ffmpeg.', required=False)
5252
parser.add_argument('-e', '--export_format', choices = ['csv', 'textgrid'], default='csv', help="(default: 'csv'). If set to 'csv', result will be exported in csv. If set to 'textgrid', results will be exported to praat Textgrid")
5353
parser.add_argument('-r', '--energy_ratio', default=0.03, type=float, help="(default: 0.03). Energetic threshold used to detect activity (percentage of mean energy of the signal)")
5454

5555
args = parser.parse_args()
5656

57+
if args.ffmpeg_binary.lower() == "none" or args.ffmpeg_binary == "":
58+
print("Disabling ffmpeg. Make sure your audio files are already sampled at 16kHz.")
59+
args.ffmpeg_binary = None
60+
5761
# Preprocess arguments and check their consistency
5862
input_files = []
5963
for e in args.input:

scripts/ina_speech_segmenter_pyro_client.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,50 @@
2525

2626

2727
import Pyro4
28-
import sys
2928
import os
3029
import socket
3130

32-
from inaSpeechSegmenter import Segmenter
33-
31+
import argparse
3432

3533
if __name__ == '__main__':
36-
dname = os.path.dirname(os.path.realpath(__file__))
34+
parser = argparse.ArgumentParser(
35+
description='Start a inaSpeechSegmenter Pyro client.'
36+
)
37+
parser.add_argument(
38+
'uri', type=str,
39+
help='URI of the Pyro server to connect and get jobs from.'
40+
)
41+
parser.add_argument(
42+
'--batch_size', type=int, default=1024,
43+
help='Batch size to use. Use lower values with small GPUs.'
44+
)
45+
parser.add_argument(
46+
'--ffmpeg_binary', default='ffmpeg', type=str,
47+
help='Your custom binary of ffmpeg. Set `None` to disable ffmpeg.'
48+
)
49+
args = parser.parse_args()
3750

38-
hostname = socket.gethostname()
51+
if args.ffmpeg_binary.lower() == "none" or args.ffmpeg_binary == "":
52+
print("Disabling ffmpeg. Make sure your audio files are already sampled at 16kHz.")
53+
args.ffmpeg_binary = None
3954

40-
uri = sys.argv[1]
41-
jobserver = Pyro4.Proxy(uri)
55+
dname = os.path.dirname(os.path.realpath(__file__))
56+
hostname = socket.gethostname()
57+
jobserver = Pyro4.Proxy(args.uri)
4258

4359
ret = -1
4460
outname = 'init'
45-
61+
4662
# batch size set at 1024. Use lower values with small gpus
47-
g = Segmenter(batch_size=1024)
48-
63+
from inaSpeechSegmenter import Segmenter
64+
g = Segmenter(batch_size=args.batch_size, ffmpeg=args.ffmpeg_binary)
65+
4966
while True:
5067
lsrc, ldst = jobserver.get_njobs('%s %s' % (hostname, ret))
51-
68+
5269
print(lsrc, ldst)
5370
if len(lsrc) == 0:
5471
print('job list finished')
5572
break
56-
57-
ret = g.batch_process(lsrc, ldst, skipifexist=True, nbtry=3)
73+
74+
ret = g.batch_process(lsrc, ldst, skipifexist=True, nbtry=3)

0 commit comments

Comments
 (0)