Skip to content

Commit aa8b423

Browse files
authored
Miscellaneous Fixes (#21024)
* fix wording in reference config * spacing tweaks * make live view settings drawer scrollable * clarify audio transcription docs * change audio transcription icon to activity indicator when transcription is in progress the backend doesn't implement any kind of queueing for speech event transcription * tracking details tweaks - Add attribute box overlay and area - Add score - Throttle swr revalidation during video component rerendering * add mse codecs to console debug on errors * add camera name
1 parent 2d8b6c8 commit aa8b423

25 files changed

+587
-385
lines changed

docs/docs/configuration/audio_detectors.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,10 @@ In order to use transcription and translation for past events, you must enable a
144144

145145
The transcribed/translated speech will appear in the description box in the Tracked Object Details pane. If Semantic Search is enabled, embeddings are generated for the transcription text and are fully searchable using the description search type.
146146

147-
Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.
147+
:::note
148+
149+
Only one `speech` event may be transcribed at a time. Frigate does not automatically transcribe `speech` events or implement a queue for long-running transcription model inference.
150+
151+
:::
152+
153+
Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a supported Nvidia GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.

docs/docs/configuration/reference.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -700,11 +700,11 @@ genai:
700700
# Optional: Configuration for audio transcription
701701
# NOTE: only the enabled option can be overridden at the camera level
702702
audio_transcription:
703-
# Optional: Enable license plate recognition (default: shown below)
703+
# Optional: Enable live and speech event audio transcription (default: shown below)
704704
enabled: False
705-
# Optional: The device to run the models on (default: shown below)
705+
# Optional: The device to run the models on for live transcription. (default: shown below)
706706
device: CPU
707-
# Optional: Set the model size used for transcription. (default: shown below)
707+
# Optional: Set the model size used for live transcription. (default: shown below)
708708
model_size: small
709709
# Optional: Set the language used for transcription translation. (default: shown below)
710710
# List of language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10

frigate/api/classification.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ def transcribe_audio(request: Request, body: AudioTranscriptionBody):
542542
status_code=409, # 409 Conflict
543543
)
544544
else:
545+
logger.debug(f"Failed to transcribe audio, response: {response}")
545546
return JSONResponse(
546547
content={
547548
"success": False,

frigate/comms/dispatcher.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
NOTIFICATION_TEST,
2424
REQUEST_REGION_GRID,
2525
UPDATE_AUDIO_ACTIVITY,
26+
UPDATE_AUDIO_TRANSCRIPTION_STATE,
2627
UPDATE_BIRDSEYE_LAYOUT,
2728
UPDATE_CAMERA_ACTIVITY,
2829
UPDATE_EMBEDDINGS_REINDEX_PROGRESS,
@@ -61,6 +62,7 @@ def __init__(
6162
self.model_state: dict[str, ModelStatusTypesEnum] = {}
6263
self.embeddings_reindex: dict[str, Any] = {}
6364
self.birdseye_layout: dict[str, Any] = {}
65+
self.audio_transcription_state: str = "idle"
6466
self._camera_settings_handlers: dict[str, Callable] = {
6567
"audio": self._on_audio_command,
6668
"audio_transcription": self._on_audio_transcription_command,
@@ -178,6 +180,19 @@ def handle_update_model_state() -> None:
178180
def handle_model_state() -> None:
179181
self.publish("model_state", json.dumps(self.model_state.copy()))
180182

183+
def handle_update_audio_transcription_state() -> None:
184+
if payload:
185+
self.audio_transcription_state = payload
186+
self.publish(
187+
"audio_transcription_state",
188+
json.dumps(self.audio_transcription_state),
189+
)
190+
191+
def handle_audio_transcription_state() -> None:
192+
self.publish(
193+
"audio_transcription_state", json.dumps(self.audio_transcription_state)
194+
)
195+
181196
def handle_update_embeddings_reindex_progress() -> None:
182197
self.embeddings_reindex = payload
183198
self.publish(
@@ -264,10 +279,12 @@ def handle_notification_test() -> None:
264279
UPDATE_MODEL_STATE: handle_update_model_state,
265280
UPDATE_EMBEDDINGS_REINDEX_PROGRESS: handle_update_embeddings_reindex_progress,
266281
UPDATE_BIRDSEYE_LAYOUT: handle_update_birdseye_layout,
282+
UPDATE_AUDIO_TRANSCRIPTION_STATE: handle_update_audio_transcription_state,
267283
NOTIFICATION_TEST: handle_notification_test,
268284
"restart": handle_restart,
269285
"embeddingsReindexProgress": handle_embeddings_reindex_progress,
270286
"modelState": handle_model_state,
287+
"audioTranscriptionState": handle_audio_transcription_state,
271288
"birdseyeLayout": handle_birdseye_layout,
272289
"onConnect": handle_on_connect,
273290
}

frigate/const.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
UPDATE_CAMERA_ACTIVITY = "update_camera_activity"
114114
UPDATE_AUDIO_ACTIVITY = "update_audio_activity"
115115
EXPIRE_AUDIO_ACTIVITY = "expire_audio_activity"
116+
UPDATE_AUDIO_TRANSCRIPTION_STATE = "update_audio_transcription_state"
116117
UPDATE_EVENT_DESCRIPTION = "update_event_description"
117118
UPDATE_REVIEW_DESCRIPTION = "update_review_description"
118119
UPDATE_MODEL_STATE = "update_model_state"

frigate/data_processing/post/audio_transcription.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from frigate.const import (
1414
CACHE_DIR,
1515
MODEL_CACHE_DIR,
16+
UPDATE_AUDIO_TRANSCRIPTION_STATE,
1617
UPDATE_EVENT_DESCRIPTION,
1718
)
1819
from frigate.data_processing.types import PostProcessDataEnum
@@ -190,6 +191,8 @@ def _transcription_wrapper(self, event: dict[str, any]) -> None:
190191
self.transcription_running = False
191192
self.transcription_thread = None
192193

194+
self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "idle")
195+
193196
def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None:
194197
if topic == "transcribe_audio":
195198
event = request_data["event"]
@@ -203,6 +206,8 @@ def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None
203206

204207
# Mark as running and start the thread
205208
self.transcription_running = True
209+
self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "processing")
210+
206211
self.transcription_thread = threading.Thread(
207212
target=self._transcription_wrapper, args=(event,), daemon=True
208213
)

frigate/timeline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def handle_object_detection(
109109
event_data["region"],
110110
),
111111
"attribute": "",
112+
"score": event_data["score"],
112113
},
113114
}
114115

web/src/api/ws.tsx

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,40 @@ export function useEmbeddingsReindexProgress(
461461
return { payload: data };
462462
}
463463

464+
export function useAudioTranscriptionProcessState(
465+
revalidateOnFocus: boolean = true,
466+
): { payload: string } {
467+
const {
468+
value: { payload },
469+
send: sendCommand,
470+
} = useWs("audio_transcription_state", "audioTranscriptionState");
471+
472+
const data = useDeepMemo(
473+
payload ? (JSON.parse(payload as string) as string) : "idle",
474+
);
475+
476+
useEffect(() => {
477+
let listener = undefined;
478+
if (revalidateOnFocus) {
479+
sendCommand("audioTranscriptionState");
480+
listener = () => {
481+
if (document.visibilityState == "visible") {
482+
sendCommand("audioTranscriptionState");
483+
}
484+
};
485+
addEventListener("visibilitychange", listener);
486+
}
487+
return () => {
488+
if (listener) {
489+
removeEventListener("visibilitychange", listener);
490+
}
491+
};
492+
// eslint-disable-next-line react-hooks/exhaustive-deps
493+
}, [revalidateOnFocus]);
494+
495+
return { payload: data || "idle" };
496+
}
497+
464498
export function useBirdseyeLayout(revalidateOnFocus: boolean = true): {
465499
payload: string;
466500
} {

web/src/components/overlay/ObjectTrackOverlay.tsx

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type ObjectData = {
4242
pathPoints: PathPoint[];
4343
currentZones: string[];
4444
currentBox?: number[];
45+
currentAttributeBox?: number[];
4546
};
4647

4748
export default function ObjectTrackOverlay({
@@ -105,14 +106,25 @@ export default function ObjectTrackOverlay({
105106
selectedObjectIds.length > 0
106107
? ["event_ids", { ids: selectedObjectIds.join(",") }]
107108
: null,
109+
null,
110+
{
111+
revalidateOnFocus: false,
112+
revalidateOnReconnect: false,
113+
dedupingInterval: 30000,
114+
},
108115
);
109116

110117
// Fetch timeline data for each object ID using fixed number of hooks
111118
const { data: timelineData } = useSWR<TrackingDetailsSequence[]>(
112119
selectedObjectIds.length > 0
113120
? `timeline?source_id=${selectedObjectIds.join(",")}&limit=1000`
114121
: null,
115-
{ revalidateOnFocus: false },
122+
null,
123+
{
124+
revalidateOnFocus: false,
125+
revalidateOnReconnect: false,
126+
dedupingInterval: 30000,
127+
},
116128
);
117129

118130
const getZonesFriendlyNames = (zones: string[], config: FrigateConfig) => {
@@ -270,6 +282,7 @@ export default function ObjectTrackOverlay({
270282
);
271283

272284
const currentBox = nearbyTimelineEvent?.data?.box;
285+
const currentAttributeBox = nearbyTimelineEvent?.data?.attribute_box;
273286

274287
return {
275288
objectId,
@@ -278,6 +291,7 @@ export default function ObjectTrackOverlay({
278291
pathPoints: combinedPoints,
279292
currentZones,
280293
currentBox,
294+
currentAttributeBox,
281295
};
282296
})
283297
.filter((obj: ObjectData) => obj.pathPoints.length > 0); // Only include objects with path data
@@ -482,6 +496,20 @@ export default function ObjectTrackOverlay({
482496
/>
483497
</g>
484498
)}
499+
{objData.currentAttributeBox && showBoundingBoxes && (
500+
<g>
501+
<rect
502+
x={objData.currentAttributeBox[0] * videoWidth}
503+
y={objData.currentAttributeBox[1] * videoHeight}
504+
width={objData.currentAttributeBox[2] * videoWidth}
505+
height={objData.currentAttributeBox[3] * videoHeight}
506+
fill="none"
507+
stroke={objData.color}
508+
strokeWidth={boxStroke}
509+
opacity="0.9"
510+
/>
511+
</g>
512+
)}
485513
</g>
486514
);
487515
})}

web/src/components/overlay/detail/SearchDetailDialog.tsx

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ import { DialogPortal } from "@radix-ui/react-dialog";
9292
import { useDetailStream } from "@/context/detail-stream-context";
9393
import { PiSlidersHorizontalBold } from "react-icons/pi";
9494
import { HiSparkles } from "react-icons/hi";
95+
import { useAudioTranscriptionProcessState } from "@/api/ws";
9596

9697
const SEARCH_TABS = ["snapshot", "tracking_details"] as const;
9798
export type SearchTab = (typeof SEARCH_TABS)[number];
@@ -1076,6 +1077,11 @@ function ObjectDetailsTab({
10761077
});
10771078
}, [search, t]);
10781079

1080+
// audio transcription processing state
1081+
1082+
const { payload: audioTranscriptionProcessState } =
1083+
useAudioTranscriptionProcessState();
1084+
10791085
// frigate+ submission
10801086

10811087
type SubmissionState = "reviewing" | "uploading" | "submitted";
@@ -1431,10 +1437,20 @@ function ObjectDetailsTab({
14311437
<TooltipTrigger asChild>
14321438
<button
14331439
aria-label={t("itemMenu.audioTranscription.label")}
1434-
className="text-primary/40 hover:text-primary/80"
1440+
className={cn(
1441+
"text-primary/40",
1442+
audioTranscriptionProcessState === "processing"
1443+
? "cursor-not-allowed"
1444+
: "hover:text-primary/80",
1445+
)}
14351446
onClick={onTranscribe}
1447+
disabled={audioTranscriptionProcessState === "processing"}
14361448
>
1437-
<FaMicrophone className="size-4" />
1449+
{audioTranscriptionProcessState === "processing" ? (
1450+
<ActivityIndicator className="size-4" />
1451+
) : (
1452+
<FaMicrophone className="size-4" />
1453+
)}
14381454
</button>
14391455
</TooltipTrigger>
14401456
<TooltipContent>

Comments (0)