improve(pyannoteAI): update on-premise wrapper to return both regular and exclusive diarization (#1953)

hbredin · web-flow · commit cc059d66a3f2 · 2025-11-19T21:39:59.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 - feat(pipeline): add `preload` option to base `Pipeline.__call__` to force preloading audio in memory ([@antoinelaurent](https://github.com/antoinelaurent/))
 - feat(pipeline): add `Pipeline.cuda()` convenience method [@tkanarsky](https://github.com/tkanarsky/)
 - improve(util): make `permutate` faster thanks to vectorized cost function
+- improve(pyannoteAI): update on-premise pyannoteAI wrapper to return both regular and exclusive diarization
 
 ## Version 4.0.1 (2025-10-10)
 
diff --git a/src/pyannote/audio/pipelines/pyannoteai/local.py b/src/pyannote/audio/pipelines/pyannoteai/local.py
@@ -21,11 +21,13 @@
 # SOFTWARE.
 
 import os
-from pathlib import Path
 
 from pyannote.audio import Pipeline
+from pyannote.audio.core.io import AudioFile
 from pyannote.core import Annotation, Segment
 
+from ..speaker_diarization import DiarizeOutput
+
 
 class Local(Pipeline):
     """Wrapper around official pyannoteAI on-premise package
@@ -51,29 +53,24 @@ def __init__(self, token: str | None = None, **kwargs):
         self.token = token or os.environ.get("PYANNOTEAI_API_KEY", None)
         self._pipeline = _LocalPipeline(self.token)
 
-    def _to_annotation(self, completed_job: dict) -> Annotation:
-        """Deserialize job output into pyannote.core Annotation"""
-
-        output = completed_job["output"]["diarization"]
-        job_id = completed_job["jobId"]
-
-        annotation = Annotation(uri=job_id)
-        for t, turn in enumerate(output):
+    def _deserialize(self, diarization: list[dict]) -> Annotation:
+        # deserialize the output into a good-old Annotation instance
+        annotation = Annotation()
+        for t, turn in enumerate(diarization):
             segment = Segment(start=turn["start"], end=turn["end"])
             speaker = turn["speaker"]
             annotation[segment, t] = speaker
-
         return annotation.rename_tracks("string")
 
     def apply(
         self,
-        file: Path,
+        file: AudioFile,
         num_speakers: int | None = None,
         min_speakers: int | None = None,
         max_speakers: int | None = None,
-        exclusive: bool = False,
-    ) -> Annotation:
-        """Speaker diarization using pyannoteAI on-premise package
+        **kwargs,
+    ) -> DiarizeOutput:
+        """Speaker diarization using on-premise pyannoteAI package.
 
         Parameters
         ----------
@@ -86,41 +83,45 @@ def apply(
             Not supported yet. Minimum number of speakers. Has no effect when `num_speakers` is provided.
         max_speakers : int, optional
             Not supported yet. Maximum number of speakers. Has no effect when `num_speakers` is provided.
-        exclusive : bool, optional
-            Enable exclusive diarization.
 
         Returns
         -------
-        speaker_diarization : Annotation
-            Speaker diarization result (when successful)
-
-        Raises
-        ------
-        PyannoteAIFailedJob
-            If the job failed
-        PyannoteAICanceledJob
-            If the job was canceled
-        HTTPError
-            If something else went wrong
+        output : DiarizeOutput
+            DiarizeOutput object containing both regular and exclusive speaker diarization results.
         """
 
-        predictions = self._pipeline.diarize(
-            file["audio"],
-            num_speakers=num_speakers,
-            min_speakers=min_speakers,
-            max_speakers=max_speakers,
-        )
-
-        # use exclusive diarization whenever requested
-        if exclusive:
-            diarization = predictions["exclusive_diarization"]
+        # if file provides "audio" path
+        if "audio" in file:
+            predictions = self._pipeline.diarize(
+                file["audio"],
+                num_speakers=num_speakers,
+                min_speakers=min_speakers,
+                max_speakers=max_speakers,
+                **kwargs,
+            )
+
+        # if file provides "waveform", make sure it is numpy (and not torch) array
+        elif "waveform" in file:
+            waveform = file["waveform"]
+            if hasattr(waveform, "numpy"):
+                waveform = waveform.numpy(force=True)
+
+            predictions = self._pipeline.diarize(
+                {"waveform": waveform, "sample_rate": file["sample_rate"]},
+                num_speakers=num_speakers,
+                min_speakers=min_speakers,
+                max_speakers=max_speakers,
+                **kwargs,
+            )
         else:
-            diarization = predictions["diarization"]
+            raise ValueError("AudioFile must provide either 'audio' or 'waveform' key")
 
-        # deserialize the output into a good-old Annotation instance
-        annotation = Annotation()
-        for t, turn in enumerate(diarization):
-            segment = Segment(start=turn["start"], end=turn["end"])
-            speaker = turn["speaker"]
-            annotation[segment, t] = speaker
-        return annotation.rename_tracks("string")
+        speaker_diarization: Annotation = self._deserialize(predictions["diarization"])
+        exclusive_speaker_diarization: Annotation = self._deserialize(
+            predictions["exclusive_diarization"]
+        )
+
+        return DiarizeOutput(
+            speaker_diarization=speaker_diarization,
+            exclusive_speaker_diarization=exclusive_speaker_diarization,
+        )
diff --git a/src/pyannote/audio/pipelines/pyannoteai/sdk.py b/src/pyannote/audio/pipelines/pyannoteai/sdk.py
@@ -21,9 +21,9 @@
 # SOFTWARE.
 
 import os
-from pathlib import Path
 
 from pyannote.audio import Pipeline
+from pyannote.audio.core.io import AudioFile
 from pyannote.core import Annotation, Segment
 
 from pyannoteai.sdk import Client
@@ -68,7 +68,7 @@ def _deserialize(self, diarization: list[dict]) -> Annotation:
 
     def apply(
         self,
-        file: Path,
+        file: AudioFile,
         num_speakers: int | None = None,
         min_speakers: int | None = None,
         max_speakers: int | None = None,