Merge pull request #543 from snakers4/adamnsandle

adamnsandle · web-flow · commit 1a7499607a76 · 2024-09-24T15:19:30.000+03:00
Adamnsandle
diff --git a/examples/colab_record_example.ipynb b/examples/colab_record_example.ipynb
@@ -17,6 +17,7 @@
    },
    "outputs": [],
    "source": [
+    "#!apt install ffmpeg\n",
     "!pip -q install pydub\n",
     "from google.colab import output\n",
     "from base64 import b64decode, b64encode\n",
@@ -37,13 +38,12 @@
     "                              model='silero_vad',\n",
     "                              force_reload=True)\n",
     "\n",
-    "def int2float(sound):\n",
-    "    abs_max = np.abs(sound).max()\n",
-    "    sound = sound.astype('float32')\n",
-    "    if abs_max > 0:\n",
-    "        sound *= 1/32768\n",
-    "    sound = sound.squeeze()\n",
-    "    return sound\n",
+    "def int2float(audio):\n",
+    "    # Peak-normalize the segment's samples to float32 in [-1, 1].\n",
+    "    arr = np.array(audio.get_array_of_samples()).astype(np.float32)\n",
+    "    peak = np.abs(arr).max() if arr.size else 0.0\n",
+    "    # Silent or empty input has no peak: return it unscaled, not 0/0 = NaN.\n",
+    "    return arr / peak if peak > 0 else arr\n",
     "\n",
     "AUDIO_HTML = \"\"\"\n",
     "<script>\n",
@@ -68,18 +68,18 @@
     "    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k\n",
     "    mimeType : 'audio/webm;codecs=opus'\n",
     "    //mimeType : 'audio/webm;codecs=pcm'\n",
-    "  };            \n",
+    "  };\n",
     "  //recorder = new MediaRecorder(stream, options);\n",
     "  recorder = new MediaRecorder(stream);\n",
-    "  recorder.ondataavailable = function(e) {            \n",
+    "  recorder.ondataavailable = function(e) {\n",
     "    var url = URL.createObjectURL(e.data);\n",
     "    // var preview = document.createElement('audio');\n",
     "    // preview.controls = true;\n",
     "    // preview.src = url;\n",
     "    // document.body.appendChild(preview);\n",
     "\n",
     "    reader = new FileReader();\n",
-    "    reader.readAsDataURL(e.data); \n",
+    "    reader.readAsDataURL(e.data);\n",
     "    reader.onloadend = function() {\n",
     "      base64data = reader.result;\n",
     "      //console.log(\"Inside FileReader:\" + base64data);\n",
@@ -121,7 +121,7 @@
     "\n",
     "}\n",
     "});\n",
-    "      \n",
+    "\n",
     "</script>\n",
     "\"\"\"\n",
     "\n",
@@ -133,8 +133,8 @@
     "    audio.export('test.mp3', format='mp3')\n",
     "    audio = audio.set_channels(1)\n",
     "    audio = audio.set_frame_rate(16000)\n",
-    "    audio_float = int2float(np.array(audio.get_array_of_samples()))\n",
-    "    audio_tens = torch.tensor(audio_float )\n",
+    "    audio_float = int2float(audio)\n",
+    "    audio_tens = torch.tensor(audio_float)\n",
     "    return audio_tens\n",
     "\n",
     "def make_animation(probs, audio_duration, interval=40):\n",
@@ -154,35 +154,29 @@
     "    def animate(i):\n",
     "        x = i * interval / 1000 - 0.04\n",
     "        y = np.linspace(0, 1.02, 2)\n",
-    "        \n",
+    "\n",
     "        line.set_data(x, y)\n",
     "        line.set_color('#990000')\n",
     "        return line,\n",
+    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=int(audio_duration / (interval / 1000)))\n",
     "\n",
-    "    anim = FuncAnimation(fig, animate, init_func=init, interval=interval, save_count=audio_duration / (interval / 1000))\n",
-    "\n",
-    "    f = r\"animation.mp4\" \n",
-    "    writervideo = FFMpegWriter(fps=1000/interval) \n",
+    "    f = r\"animation.mp4\"\n",
+    "    writervideo = FFMpegWriter(fps=1000/interval)\n",
     "    anim.save(f, writer=writervideo)\n",
     "    plt.close('all')\n",
     "\n",
-    "def combine_audio(vidname, audname, outname, fps=25): \n",
+    "def combine_audio(vidname, audname, outname, fps=25):\n",
     "    my_clip = mpe.VideoFileClip(vidname, verbose=False)\n",
     "    audio_background = mpe.AudioFileClip(audname)\n",
     "    final_clip = my_clip.set_audio(audio_background)\n",
     "    final_clip.write_videofile(outname,fps=fps,verbose=False)\n",
     "\n",
     "def record_make_animation():\n",
     "  tensor = record()\n",
-    "\n",
     "  print('Calculating probabilities...')\n",
     "  speech_probs = []\n",
     "  window_size_samples = 512\n",
-    "  for i in range(0, len(tensor), window_size_samples):\n",
-    "      if len(tensor[i: i+ window_size_samples]) < window_size_samples:\n",
-    "        break\n",
-    "      speech_prob = model(tensor[i: i+ window_size_samples], 16000).item()\n",
-    "      speech_probs.append(speech_prob)\n",
+    "  speech_probs = model.audio_forward(tensor, sr=16000)[0].tolist()\n",
     "  model.reset_states()\n",
     "  print('Making animation...')\n",
     "  make_animation(speech_probs, len(tensor) / 16000)\n",
@@ -196,7 +190,9 @@
     "  <video width=800 controls>\n",
     "        <source src=\"%s\" type=\"video/mp4\">\n",
     "  </video>\n",
-    "  \"\"\" % data_url))"
+    "  \"\"\" % data_url))\n",
+    "\n",
+    "  return speech_probs"
    ]
   },
   {
@@ -216,7 +212,7 @@
    },
    "outputs": [],
    "source": [
-    "record_make_animation()"
+    "speech_probs = record_make_animation()"
    ]
   }
  ],
diff --git a/examples/parallel_example.ipynb b/examples/parallel_example.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -18,17 +17,19 @@
     "SAMPLING_RATE = 16000\n",
     "import torch\n",
     "from pprint import pprint\n",
+    "import time\n",
+    "import shutil\n",
     "\n",
     "torch.set_num_threads(1)\n",
     "NUM_PROCESS=4 # set to the number of CPU cores in the machine\n",
     "NUM_COPIES=8\n",
     "# download wav files, make multiple copies\n",
-    "for idx in range(NUM_COPIES):\n",
-    "    torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', f\"en_example{idx}.wav\")\n"
+    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example0.wav')  # single network fetch\n",
+    "for idx in range(NUM_COPIES - 1):  # the remaining copies are made locally\n",
+    "    shutil.copy('en_example0.wav', f'en_example{idx + 1}.wav')"
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -54,7 +55,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -99,7 +99,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -127,7 +126,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "diarization",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -141,7 +140,20 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.10.14"
+  },
+  "toc": {
+   "base_numbering": 1,
+   "nav_menu": {},
+   "number_sections": true,
+   "sideBar": true,
+   "skip_h1_title": false,
+   "title_cell": "Table of Contents",
+   "title_sidebar": "Contents",
+   "toc_cell": false,
+   "toc_position": {},
+   "toc_section_display": true,
+   "toc_window_display": false
   }
  },
  "nbformat": 4,