From a1071b0230635976d2289dd8c58072d64fcbba67 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Wed, 17 Jul 2024 15:09:31 +0200 Subject: [PATCH 1/3] compute speaker embeddings by pyannote --- identification/speaker_identify.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py index 7d85a9a..1bb8644 100644 --- a/identification/speaker_identify.py +++ b/identification/speaker_identify.py @@ -3,6 +3,8 @@ from speechbrain.inference.speaker import EncoderClassifier else: from speechbrain.pretrained import EncoderClassifier +from pyannote.audio import Model +from pyannote.audio import Inference import os from collections import defaultdict import torch @@ -129,11 +131,9 @@ def initialize_embeddings( global _embedding_model if _embedding_model is None: tic = time.time() - _embedding_model = EncoderClassifier.from_hparams( - source="speechbrain/spkrec-ecapa-voxceleb", - # savedir="pretrained_models/spkrec-ecapa-voxceleb", - run_opts={"device":device} - ) + _embedding_model = Model.from_pretrained( + "pyannote/embedding", + use_auth_token="****") if log: log.info(f"Speaker identification model loaded in {time.time() - tic:.3f} seconds on {device}") os.makedirs(_FOLDER_EMBEDDINGS, exist_ok=True) @@ -181,23 +181,17 @@ def initialize_embeddings( spk_embed = compute_embedding(audio) # Note: it is important to save the embeddings on the CPU (to be able to load them on the CPU later on) - spk_embed = spk_embed.cpu() + #spk_embed = spk_embed.cpu() with open(embedding_file, "wb") as f: pkl.dump(spk_embed, f) if log: log.info(f"Speaker identification initialized with {len(speakers)} speakers") -def compute_embedding(audio, min_len = 640): - """ - Compute speaker embedding from audio - - Args: - audio (torch.Tensor): audio waveform - """ - assert _embedding_model is not None, "Speaker identification model not initialized" - # The following is to avoid a failure on too short audio (less than 640 samples = 40ms at 16kHz) +def compute_embedding(audio, min_len = 640): if audio.shape[-1] < min_len: audio = torch.cat([audio, torch.zeros(audio.shape[0], min_len - audio.shape[-1])], dim=-1) - return _embedding_model.encode_batch(audio) + inference = Inference(_embedding_model, window="whole") + inference.to(torch.device("cuda")) + return inference({"waveform":audio, "sample_rate": 16000}) def _get_db_speaker_ids(cursor=None): return _get_db_possible_values("id", cursor) From ad6785800baa78a037f4d52cb85a1790f8a8f714 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Thu, 18 Jul 2024 09:25:54 +0200 Subject: [PATCH 2/3] fix embedding format and device --- identification/speaker_identify.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py index 1bb8644..2c98c1e 100644 --- a/identification/speaker_identify.py +++ b/identification/speaker_identify.py @@ -353,6 +353,8 @@ def speaker_identify( break embedding_audio = compute_embedding(audio_selection) + embedding_audio = torch.from_numpy(embedding_audio) + embedding_audio = embedding_audio.to(_embedding_model.device) # Loop on the target speakers for speaker_name in speaker_names: @@ -362,6 +364,7 @@ def speaker_identify( # Get speaker embedding with open(_get_speaker_embedding_file(speaker_name), "rb") as f: embedding_speaker = pkl.load(f) + embedding_speaker = torch.from_numpy(embedding_speaker) embedding_speaker = embedding_speaker.to(_embedding_model.device) # Compute score similarity From e74daa86577727c2a169884c63fb3eb0b3e79772 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Thu, 22 Aug 2024 14:11:37 +0200 Subject: [PATCH 3/3] fix pyannote embedding for simple --- identification/speaker_identify.py | 1 - simple/requirements.txt | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/identification/speaker_identify.py b/identification/speaker_identify.py index 2c98c1e..05d1638 100644 --- a/identification/speaker_identify.py +++ b/identification/speaker_identify.py @@ -355,7 +355,6 @@ def speaker_identify( embedding_audio = compute_embedding(audio_selection) embedding_audio = torch.from_numpy(embedding_audio) embedding_audio = embedding_audio.to(_embedding_model.device) - # Loop on the target speakers for speaker_name in speaker_names: if speaker_name in exclude_speakers: diff --git a/simple/requirements.txt b/simple/requirements.txt index a168010..c73eba4 100644 --- a/simple/requirements.txt +++ b/simple/requirements.txt @@ -12,4 +12,5 @@ speechbrain==1.0.0 torchaudio==2.2.1 onnxruntime-gpu==1.17.1 scipy==1.8.1 # newer version can provoke segmentation faults -numpy==1.23.5 \ No newline at end of file +numpy==1.23.5 +pyannote.audio==3.1.1 \ No newline at end of file