|
17 | 17 | }, |
18 | 18 | "outputs": [], |
19 | 19 | "source": [ |
| 20 | + "#!apt install ffmpeg\n", |
20 | 21 | "!pip -q install pydub\n", |
21 | 22 | "from google.colab import output\n", |
22 | 23 | "from base64 import b64decode, b64encode\n", |
|
37 | 38 | " model='silero_vad',\n", |
38 | 39 | " force_reload=True)\n", |
39 | 40 | "\n", |
40 | | - "def int2float(sound):\n", |
41 | | - " abs_max = np.abs(sound).max()\n", |
42 | | - " sound = sound.astype('float32')\n", |
43 | | - " if abs_max > 0:\n", |
44 | | - " sound *= 1/32768\n", |
45 | | - " sound = sound.squeeze()\n", |
46 | | - " return sound\n", |
| 41 | + "def int2float(audio):\n", |
| 42 | + " samples = audio.get_array_of_samples()\n", |
| 43 | + " new_sound = audio._spawn(samples)\n", |
| 44 | + " arr = np.array(samples).astype(np.float32)\n", |
| 45 | + " arr = arr / np.abs(arr).max()\n", |
| 46 | + " return arr\n", |
47 | 47 | "\n", |
48 | 48 | "AUDIO_HTML = \"\"\"\n", |
49 | 49 | "<script>\n", |
|
68 | 68 | " //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n", |
69 | 69 | " mimeType : 'audio/webm;codecs=opus'\n", |
70 | 70 | " //mimeType : 'audio/webm;codecs=pcm'\n", |
71 | | - " }; \n", |
| 71 | + " };\n", |
72 | 72 | " //recorder = new MediaRecorder(stream, options);\n", |
73 | 73 | " recorder = new MediaRecorder(stream);\n", |
74 | | - " recorder.ondataavailable = function(e) { \n", |
| 74 | + " recorder.ondataavailable = function(e) {\n", |
75 | 75 | " var url = URL.createObjectURL(e.data);\n", |
76 | 76 | " // var preview = document.createElement('audio');\n", |
77 | 77 | " // preview.controls = true;\n", |
78 | 78 | " // preview.src = url;\n", |
79 | 79 | " // document.body.appendChild(preview);\n", |
80 | 80 | "\n", |
81 | 81 | " reader = new FileReader();\n", |
82 | | - " reader.readAsDataURL(e.data); \n", |
| 82 | + " reader.readAsDataURL(e.data);\n", |
83 | 83 | " reader.onloadend = function() {\n", |
84 | 84 | " base64data = reader.result;\n", |
85 | 85 | " //console.log(\"Inside FileReader:\" + base64data);\n", |
|
121 | 121 | "\n", |
122 | 122 | "}\n", |
123 | 123 | "});\n", |
124 | | - " \n", |
| 124 | + "\n", |
125 | 125 | "</script>\n", |
126 | 126 | "\"\"\"\n", |
127 | 127 | "\n", |
|
133 | 133 | " audio.export('test.mp3', format='mp3')\n", |
134 | 134 | " audio = audio.set_channels(1)\n", |
135 | 135 | " audio = audio.set_frame_rate(16000)\n", |
136 | | - " audio_float = int2float(np.array(audio.get_array_of_samples()))\n", |
137 | | - " audio_tens = torch.tensor(audio_float )\n", |
| 136 | + " audio_float = int2float(audio)\n", |
| 137 | + " audio_tens = torch.tensor(audio_float)\n", |
138 | 138 | " return audio_tens\n", |
139 | 139 | "\n", |
140 | 140 | "def make_animation(probs, audio_duration, interval=40):\n", |
|
154 | 154 | " def animate(i):\n", |
155 | 155 | " x = i * interval / 1000 - 0.04\n", |
156 | 156 | " y = np.linspace(0, 1.02, 2)\n", |
157 | | - " \n", |
| 157 | + "\n", |
158 | 158 | " line.set_data(x, y)\n", |
159 | 159 | " line.set_color('#990000')\n", |
160 | 160 | " return line,\n", |
| 161 | + " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n", |
161 | 162 | "\n", |
162 | | - " anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n", |
163 | | - "\n", |
164 | | - " f = r\"animation.mp4\" \n", |
165 | | - " writervideo = FFMpegWriter(fps=1000/interval) \n", |
| 163 | + " f = r\"animation.mp4\"\n", |
| 164 | + " writervideo = FFMpegWriter(fps=1000/interval)\n", |
166 | 165 | " anim.save(f, writer=writervideo)\n", |
167 | 166 | " plt.close('all')\n", |
168 | 167 | "\n", |
169 | | - "def combine_audio(vidname, audname, outname, fps=25): \n", |
| 168 | + "def combine_audio(vidname, audname, outname, fps=25):\n", |
170 | 169 | " my_clip = mpe.VideoFileClip(vidname, verbose=False)\n", |
171 | 170 | " audio_background = mpe.AudioFileClip(audname)\n", |
172 | 171 | " final_clip = my_clip.set_audio(audio_background)\n", |
173 | 172 | " final_clip.write_videofile(outname,fps=fps,verbose=False)\n", |
174 | 173 | "\n", |
175 | 174 | "def record_make_animation():\n", |
176 | 175 | " tensor = record()\n", |
177 | | - "\n", |
178 | 176 | " print('Calculating probabilities...')\n", |
179 | 177 | " speech_probs = []\n", |
180 | 178 | " window_size_samples = 512\n", |
181 | | - " for i in range(0, len(tensor), window_size_samples):\n", |
182 | | - " if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n", |
183 | | - " break\n", |
184 | | - " speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n", |
185 | | - " speech_probs.append(speech_prob)\n", |
| 179 | + " speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n", |
186 | 180 | " model.reset_states()\n", |
187 | 181 | " print('Making animation...')\n", |
188 | 182 | " make_animation(speech_probs, len(tensor) / 16000)\n", |
|
196 | 190 | " <video width=800 controls>\n", |
197 | 191 | " <source src=\"%s\" type=\"video/mp4\">\n", |
198 | 192 | " </video>\n", |
199 | | - " \"\"\" % data_url))" |
| 193 | + " \"\"\" % data_url))\n", |
| 194 | + "\n", |
| 195 | + " return speech_probs" |
200 | 196 | ] |
201 | 197 | }, |
202 | 198 | { |
|
216 | 212 | }, |
217 | 213 | "outputs": [], |
218 | 214 | "source": [ |
219 | | - "record_make_animation()" |
| 215 | + "speech_probs = record_make_animation()" |
220 | 216 | ] |
221 | 217 | } |
222 | 218 | ], |
|
0 commit comments