Skip to content

Commit aa8b423

Browse files
authored
Miscellaneous Fixes (#21024)
* fix wording in reference config * spacing tweaks * make live view settings drawer scrollable * clarify audio transcription docs * change audio transcription icon to activity indicator when transcription is in progress the backend doesn't implement any kind of queueing for speech event transcription * tracking details tweaks - Add attribute box overlay and area - Add score - Throttle swr revalidation during video component rerendering * add mse codecs to console debug on errors * add camera name
1 parent 2d8b6c8 commit aa8b423

25 files changed

+587
-385
lines changed

docs/docs/configuration/audio_detectors.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,4 +144,10 @@ In order to use transcription and translation for past events, you must enable a
144144

145145
The transcribed/translated speech will appear in the description box in the Tracked Object Details pane. If Semantic Search is enabled, embeddings are generated for the transcription text and are fully searchable using the description search type.
146146

147-
Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.
147+
:::note
148+
149+
Only one `speech` event may be transcribed at a time. Frigate does not automatically transcribe `speech` events or implement a queue for long-running transcription model inference.
150+
151+
:::
152+
153+
Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a supported Nvidia GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.

docs/docs/configuration/reference.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -700,11 +700,11 @@ genai:
700700
# Optional: Configuration for audio transcription
701701
# NOTE: only the enabled option can be overridden at the camera level
702702
audio_transcription:
703-
# Optional: Enable license plate recognition (default: shown below)
703+
# Optional: Enable live and speech event audio transcription (default: shown below)
704704
enabled: False
705-
# Optional: The device to run the models on (default: shown below)
705+
# Optional: The device to run the models on for live transcription. (default: shown below)
706706
device: CPU
707-
# Optional: Set the model size used for transcription. (default: shown below)
707+
# Optional: Set the model size used for live transcription. (default: shown below)
708708
model_size: small
709709
# Optional: Set the language used for transcription translation. (default: shown below)
710710
# List of language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10

frigate/api/classification.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,7 @@ def transcribe_audio(request: Request, body: AudioTranscriptionBody):
542542
status_code=409, # 409 Conflict
543543
)
544544
else:
545+
logger.debug(f"Failed to transcribe audio, response: {response}")
545546
return JSONResponse(
546547
content={
547548
"success": False,

frigate/comms/dispatcher.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
NOTIFICATION_TEST,
2424
REQUEST_REGION_GRID,
2525
UPDATE_AUDIO_ACTIVITY,
26+
UPDATE_AUDIO_TRANSCRIPTION_STATE,
2627
UPDATE_BIRDSEYE_LAYOUT,
2728
UPDATE_CAMERA_ACTIVITY,
2829
UPDATE_EMBEDDINGS_REINDEX_PROGRESS,
@@ -61,6 +62,7 @@ def __init__(
6162
self.model_state: dict[str, ModelStatusTypesEnum] = {}
6263
self.embeddings_reindex: dict[str, Any] = {}
6364
self.birdseye_layout: dict[str, Any] = {}
65+
self.audio_transcription_state: str = "idle"
6466
self._camera_settings_handlers: dict[str, Callable] = {
6567
"audio": self._on_audio_command,
6668
"audio_transcription": self._on_audio_transcription_command,
@@ -178,6 +180,19 @@ def handle_update_model_state() -> None:
178180
def handle_model_state() -> None:
179181
self.publish("model_state", json.dumps(self.model_state.copy()))
180182

183+
def handle_update_audio_transcription_state() -> None:
184+
if payload:
185+
self.audio_transcription_state = payload
186+
self.publish(
187+
"audio_transcription_state",
188+
json.dumps(self.audio_transcription_state),
189+
)
190+
191+
def handle_audio_transcription_state() -> None:
192+
self.publish(
193+
"audio_transcription_state", json.dumps(self.audio_transcription_state)
194+
)
195+
181196
def handle_update_embeddings_reindex_progress() -> None:
182197
self.embeddings_reindex = payload
183198
self.publish(
@@ -264,10 +279,12 @@ def handle_notification_test() -> None:
264279
UPDATE_MODEL_STATE: handle_update_model_state,
265280
UPDATE_EMBEDDINGS_REINDEX_PROGRESS: handle_update_embeddings_reindex_progress,
266281
UPDATE_BIRDSEYE_LAYOUT: handle_update_birdseye_layout,
282+
UPDATE_AUDIO_TRANSCRIPTION_STATE: handle_update_audio_transcription_state,
267283
NOTIFICATION_TEST: handle_notification_test,
268284
"restart": handle_restart,
269285
"embeddingsReindexProgress": handle_embeddings_reindex_progress,
270286
"modelState": handle_model_state,
287+
"audioTranscriptionState": handle_audio_transcription_state,
271288
"birdseyeLayout": handle_birdseye_layout,
272289
"onConnect": handle_on_connect,
273290
}

frigate/const.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
UPDATE_CAMERA_ACTIVITY = "update_camera_activity"
114114
UPDATE_AUDIO_ACTIVITY = "update_audio_activity"
115115
EXPIRE_AUDIO_ACTIVITY = "expire_audio_activity"
116+
UPDATE_AUDIO_TRANSCRIPTION_STATE = "update_audio_transcription_state"
116117
UPDATE_EVENT_DESCRIPTION = "update_event_description"
117118
UPDATE_REVIEW_DESCRIPTION = "update_review_description"
118119
UPDATE_MODEL_STATE = "update_model_state"

frigate/data_processing/post/audio_transcription.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from frigate.const import (
1414
CACHE_DIR,
1515
MODEL_CACHE_DIR,
16+
UPDATE_AUDIO_TRANSCRIPTION_STATE,
1617
UPDATE_EVENT_DESCRIPTION,
1718
)
1819
from frigate.data_processing.types import PostProcessDataEnum
@@ -190,6 +191,8 @@ def _transcription_wrapper(self, event: dict[str, any]) -> None:
190191
self.transcription_running = False
191192
self.transcription_thread = None
192193

194+
self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "idle")
195+
193196
def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None:
194197
if topic == "transcribe_audio":
195198
event = request_data["event"]
@@ -203,6 +206,8 @@ def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None
203206

204207
# Mark as running and start the thread
205208
self.transcription_running = True
209+
self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "processing")
210+
206211
self.transcription_thread = threading.Thread(
207212
target=self._transcription_wrapper, args=(event,), daemon=True
208213
)

frigate/timeline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def handle_object_detection(
109109
event_data["region"],
110110
),
111111
"attribute": "",
112+
"score": event_data["score"],
112113
},
113114
}
114115

web/src/api/ws.tsx

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,40 @@ export function useEmbeddingsReindexProgress(
461461
return { payload: data };
462462
}
463463

464+
export function useAudioTranscriptionProcessState(
465+
revalidateOnFocus: boolean = true,
466+
): { payload: string } {
467+
const {
468+
value: { payload },
469+
send: sendCommand,
470+
} = useWs("audio_transcription_state", "audioTranscriptionState");
471+
472+
const data = useDeepMemo(
473+
payload ? (JSON.parse(payload as string) as string) : "idle",
474+
);
475+
476+
useEffect(() => {
477+
let listener = undefined;
478+
if (revalidateOnFocus) {
479+
sendCommand("audioTranscriptionState");
480+
listener = () => {
481+
if (document.visibilityState == "visible") {
482+
sendCommand("audioTranscriptionState");
483+
}
484+
};
485+
addEventListener("visibilitychange", listener);
486+
}
487+
return () => {
488+
if (listener) {
489+
removeEventListener("visibilitychange", listener);
490+
}
491+
};
492+
// eslint-disable-next-line react-hooks/exhaustive-deps
493+
}, [revalidateOnFocus]);
494+
495+
return { payload: data || "idle" };
496+
}
497+
464498
export function useBirdseyeLayout(revalidateOnFocus: boolean = true): {
465499
payload: string;
466500
} {

web/src/components/overlay/ObjectTrackOverlay.tsx

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ type ObjectData = {
4242
pathPoints: PathPoint[];
4343
currentZones: string[];
4444
currentBox?: number[];
45+
currentAttributeBox?: number[];
4546
};
4647

4748
export default function ObjectTrackOverlay({
@@ -105,14 +106,25 @@ export default function ObjectTrackOverlay({
105106
selectedObjectIds.length > 0
106107
? ["event_ids", { ids: selectedObjectIds.join(",") }]
107108
: null,
109+
null,
110+
{
111+
revalidateOnFocus: false,
112+
revalidateOnReconnect: false,
113+
dedupingInterval: 30000,
114+
},
108115
);
109116

110117
// Fetch timeline data for each object ID using fixed number of hooks
111118
const { data: timelineData } = useSWR<TrackingDetailsSequence[]>(
112119
selectedObjectIds.length > 0
113120
? `timeline?source_id=${selectedObjectIds.join(",")}&limit=1000`
114121
: null,
115-
{ revalidateOnFocus: false },
122+
null,
123+
{
124+
revalidateOnFocus: false,
125+
revalidateOnReconnect: false,
126+
dedupingInterval: 30000,
127+
},
116128
);
117129

118130
const getZonesFriendlyNames = (zones: string[], config: FrigateConfig) => {
@@ -270,6 +282,7 @@ export default function ObjectTrackOverlay({
270282
);
271283

272284
const currentBox = nearbyTimelineEvent?.data?.box;
285+
const currentAttributeBox = nearbyTimelineEvent?.data?.attribute_box;
273286

274287
return {
275288
objectId,
@@ -278,6 +291,7 @@ export default function ObjectTrackOverlay({
278291
pathPoints: combinedPoints,
279292
currentZones,
280293
currentBox,
294+
currentAttributeBox,
281295
};
282296
})
283297
.filter((obj: ObjectData) => obj.pathPoints.length > 0); // Only include objects with path data
@@ -482,6 +496,20 @@ export default function ObjectTrackOverlay({
482496
/>
483497
</g>
484498
)}
499+
{objData.currentAttributeBox && showBoundingBoxes && (
500+
<g>
501+
<rect
502+
x={objData.currentAttributeBox[0] * videoWidth}
503+
y={objData.currentAttributeBox[1] * videoHeight}
504+
width={objData.currentAttributeBox[2] * videoWidth}
505+
height={objData.currentAttributeBox[3] * videoHeight}
506+
fill="none"
507+
stroke={objData.color}
508+
strokeWidth={boxStroke}
509+
opacity="0.9"
510+
/>
511+
</g>
512+
)}
485513
</g>
486514
);
487515
})}

web/src/components/overlay/detail/SearchDetailDialog.tsx

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ import { DialogPortal } from "@radix-ui/react-dialog";
9292
import { useDetailStream } from "@/context/detail-stream-context";
9393
import { PiSlidersHorizontalBold } from "react-icons/pi";
9494
import { HiSparkles } from "react-icons/hi";
95+
import { useAudioTranscriptionProcessState } from "@/api/ws";
9596

9697
const SEARCH_TABS = ["snapshot", "tracking_details"] as const;
9798
export type SearchTab = (typeof SEARCH_TABS)[number];
@@ -1076,6 +1077,11 @@ function ObjectDetailsTab({
10761077
});
10771078
}, [search, t]);
10781079

1080+
// audio transcription processing state
1081+
1082+
const { payload: audioTranscriptionProcessState } =
1083+
useAudioTranscriptionProcessState();
1084+
10791085
// frigate+ submission
10801086

10811087
type SubmissionState = "reviewing" | "uploading" | "submitted";
@@ -1431,10 +1437,20 @@ function ObjectDetailsTab({
14311437
<TooltipTrigger asChild>
14321438
<button
14331439
aria-label={t("itemMenu.audioTranscription.label")}
1434-
className="text-primary/40 hover:text-primary/80"
1440+
className={cn(
1441+
"text-primary/40",
1442+
audioTranscriptionProcessState === "processing"
1443+
? "cursor-not-allowed"
1444+
: "hover:text-primary/80",
1445+
)}
14351446
onClick={onTranscribe}
1447+
disabled={audioTranscriptionProcessState === "processing"}
14361448
>
1437-
<FaMicrophone className="size-4" />
1449+
{audioTranscriptionProcessState === "processing" ? (
1450+
<ActivityIndicator className="size-4" />
1451+
) : (
1452+
<FaMicrophone className="size-4" />
1453+
)}
14381454
</button>
14391455
</TooltipTrigger>
14401456
<TooltipContent>

Comments (0)