From a1071b0230635976d2289dd8c58072d64fcbba67 Mon Sep 17 00:00:00 2001
From: wghezaiel <wghezaiel@linagora.com>
Date: Wed, 17 Jul 2024 15:09:31 +0200
Subject: [PATCH 1/3] Compute speaker embeddings with pyannote instead of SpeechBrain

---
 identification/speaker_identify.py | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py
index 7d85a9a..1bb8644 100644
--- a/identification/speaker_identify.py
+++ b/identification/speaker_identify.py
@@ -3,6 +3,8 @@
    from speechbrain.inference.speaker import EncoderClassifier
 else:
    from speechbrain.pretrained import EncoderClassifier
+from pyannote.audio import Model
+from pyannote.audio import Inference
 import os
 from collections import defaultdict
 import torch
@@ -129,11 +131,9 @@ def initialize_embeddings(
     global _embedding_model
     if _embedding_model is None:
         tic = time.time()
-        _embedding_model = EncoderClassifier.from_hparams(
-            source="speechbrain/spkrec-ecapa-voxceleb",
-            # savedir="pretrained_models/spkrec-ecapa-voxceleb",
-            run_opts={"device":device}
-        )
+        _embedding_model = Model.from_pretrained(
+            "pyannote/embedding",
+            use_auth_token="****")
         if log: log.info(f"Speaker identification model loaded in {time.time() - tic:.3f} seconds on {device}")
 
     os.makedirs(_FOLDER_EMBEDDINGS, exist_ok=True)
@@ -181,23 +181,17 @@ def initialize_embeddings(
 
         spk_embed = compute_embedding(audio)
         # Note: it is important to save the embeddings on the CPU (to be able to load them on the CPU later on)
-        spk_embed = spk_embed.cpu()
+        # No .cpu() move here: the pyannote embedding is handled as a numpy array (see torch.from_numpy in speaker_identify)
         with open(embedding_file, "wb") as f:
             pkl.dump(spk_embed, f)
     if log: log.info(f"Speaker identification initialized with {len(speakers)} speakers")
 
-def compute_embedding(audio, min_len = 640):
-    """
-    Compute speaker embedding from audio
-
-    Args:
-        audio (torch.Tensor): audio waveform
-    """
-    assert _embedding_model is not None, "Speaker identification model not initialized"
-    # The following is to avoid a failure on too short audio (less than 640 samples = 40ms at 16kHz)
+def compute_embedding(audio, min_len = 640):
+    """Compute a pyannote speaker embedding from a 16 kHz waveform tensor (zero-padded up to min_len samples)."""
     if audio.shape[-1] < min_len:
         audio = torch.cat([audio, torch.zeros(audio.shape[0], min_len - audio.shape[-1])], dim=-1)
-    return _embedding_model.encode_batch(audio)
+    inference = Inference(_embedding_model, window="whole", device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+    return inference({"waveform":audio, "sample_rate": 16000})
 
 def _get_db_speaker_ids(cursor=None):
     return _get_db_possible_values("id", cursor)

From ad6785800baa78a037f4d52cb85a1790f8a8f714 Mon Sep 17 00:00:00 2001
From: wghezaiel <wghezaiel@linagora.com>
Date: Thu, 18 Jul 2024 09:25:54 +0200
Subject: [PATCH 2/3] Convert pyannote numpy embeddings to torch tensors on the model device

---
 identification/speaker_identify.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py
index 1bb8644..2c98c1e 100644
--- a/identification/speaker_identify.py
+++ b/identification/speaker_identify.py
@@ -353,6 +353,8 @@ def speaker_identify(
             break
 
     embedding_audio = compute_embedding(audio_selection)
+    embedding_audio = torch.from_numpy(embedding_audio)
+    embedding_audio = embedding_audio.to(_embedding_model.device)
 
     # Loop on the target speakers
     for speaker_name in speaker_names:
@@ -362,6 +364,7 @@ def speaker_identify(
         # Get speaker embedding
         with open(_get_speaker_embedding_file(speaker_name), "rb") as f:
             embedding_speaker = pkl.load(f)
+        embedding_speaker = torch.from_numpy(embedding_speaker)
         embedding_speaker = embedding_speaker.to(_embedding_model.device)
 
         # Compute score similarity

From e74daa86577727c2a169884c63fb3eb0b3e79772 Mon Sep 17 00:00:00 2001
From: wghezaiel <wghezaiel@linagora.com>
Date: Thu, 22 Aug 2024 14:11:37 +0200
Subject: [PATCH 3/3] Add pyannote.audio dependency to simple/requirements.txt

---
 identification/speaker_identify.py | 1 -
 simple/requirements.txt            | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py
index 2c98c1e..05d1638 100644
--- a/identification/speaker_identify.py
+++ b/identification/speaker_identify.py
@@ -355,7 +355,6 @@ def speaker_identify(
     embedding_audio = compute_embedding(audio_selection)
     embedding_audio = torch.from_numpy(embedding_audio)
     embedding_audio = embedding_audio.to(_embedding_model.device)
-
     # Loop on the target speakers
     for speaker_name in speaker_names:
         if speaker_name in exclude_speakers:
diff --git a/simple/requirements.txt b/simple/requirements.txt
index a168010..c73eba4 100644
--- a/simple/requirements.txt
+++ b/simple/requirements.txt
@@ -12,4 +12,5 @@ speechbrain==1.0.0
 torchaudio==2.2.1
 onnxruntime-gpu==1.17.1
 scipy==1.8.1 # newer version can provoke segmentation faults
-numpy==1.23.5
\ No newline at end of file
+numpy==1.23.5
+pyannote.audio==3.1.1
\ No newline at end of file