8 changes: 7 additions & 1 deletion docs/docs/configuration/audio_detectors.md
@@ -144,4 +144,10 @@ In order to use transcription and translation for past events, you must enable a

The transcribed/translated speech will appear in the description box in the Tracked Object Details pane. If Semantic Search is enabled, embeddings are generated for the transcription text and are fully searchable using the description search type.

Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.
:::note

Only one `speech` event may be transcribed at a time. Frigate does not automatically transcribe `speech` events or implement a queue for long-running transcription model inference.

:::

Recorded `speech` events will always use a `whisper` model, regardless of the `model_size` config setting. Without a supported Nvidia GPU, generating transcriptions for longer `speech` events may take a fair amount of time, so be patient.
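
Because there is no server-side queue, an external script that wants to transcribe several `speech` events has to serialize its own requests and retry when Frigate reports that a transcription is already running (the API responds with `409 Conflict` in that case). A rough client-side sketch of that pattern; `request_transcription` is a hypothetical placeholder for whatever call you use to reach Frigate's transcription endpoint, not an actual Frigate API:

```python
import time


def request_transcription(event_id: str) -> int:
    """Hypothetical placeholder: POST the event to Frigate's transcription
    endpoint and return the HTTP status code (409 means a transcription is
    already running)."""
    raise NotImplementedError


def transcribe_all(event_ids: list[str], wait_seconds: int = 15) -> None:
    # Serialize the requests ourselves, since Frigate transcribes one speech
    # event at a time and does not queue additional requests.
    for event_id in event_ids:
        while True:
            status = request_transcription(event_id)
            if status == 409:  # another transcription is still in progress
                time.sleep(wait_seconds)
                continue
            break
```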
6 changes: 3 additions & 3 deletions docs/docs/configuration/reference.md
@@ -700,11 +700,11 @@ genai:
# Optional: Configuration for audio transcription
# NOTE: only the enabled option can be overridden at the camera level
audio_transcription:
# Optional: Enable license plate recognition (default: shown below)
# Optional: Enable live and speech event audio transcription (default: shown below)
enabled: False
# Optional: The device to run the models on (default: shown below)
# Optional: The device to run the models on for live transcription. (default: shown below)
device: CPU
# Optional: Set the model size used for transcription. (default: shown below)
# Optional: Set the model size used for live transcription. (default: shown below)
model_size: small
# Optional: Set the language used for transcription translation. (default: shown below)
# List of language codes: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L10
1 change: 1 addition & 0 deletions frigate/api/classification.py
@@ -542,6 +542,7 @@ def transcribe_audio(request: Request, body: AudioTranscriptionBody):
status_code=409, # 409 Conflict
)
else:
logger.debug(f"Failed to transcribe audio, response: {response}")
return JSONResponse(
content={
"success": False,
17 changes: 17 additions & 0 deletions frigate/comms/dispatcher.py
@@ -23,6 +23,7 @@
NOTIFICATION_TEST,
REQUEST_REGION_GRID,
UPDATE_AUDIO_ACTIVITY,
UPDATE_AUDIO_TRANSCRIPTION_STATE,
UPDATE_BIRDSEYE_LAYOUT,
UPDATE_CAMERA_ACTIVITY,
UPDATE_EMBEDDINGS_REINDEX_PROGRESS,
@@ -61,6 +62,7 @@ def __init__(
self.model_state: dict[str, ModelStatusTypesEnum] = {}
self.embeddings_reindex: dict[str, Any] = {}
self.birdseye_layout: dict[str, Any] = {}
self.audio_transcription_state: str = "idle"
self._camera_settings_handlers: dict[str, Callable] = {
"audio": self._on_audio_command,
"audio_transcription": self._on_audio_transcription_command,
@@ -178,6 +180,19 @@ def handle_update_model_state() -> None:
def handle_model_state() -> None:
self.publish("model_state", json.dumps(self.model_state.copy()))

def handle_update_audio_transcription_state() -> None:
if payload:
self.audio_transcription_state = payload
self.publish(
"audio_transcription_state",
json.dumps(self.audio_transcription_state),
)

def handle_audio_transcription_state() -> None:
self.publish(
"audio_transcription_state", json.dumps(self.audio_transcription_state)
)

def handle_update_embeddings_reindex_progress() -> None:
self.embeddings_reindex = payload
self.publish(
@@ -264,10 +279,12 @@ def handle_notification_test() -> None:
UPDATE_MODEL_STATE: handle_update_model_state,
UPDATE_EMBEDDINGS_REINDEX_PROGRESS: handle_update_embeddings_reindex_progress,
UPDATE_BIRDSEYE_LAYOUT: handle_update_birdseye_layout,
UPDATE_AUDIO_TRANSCRIPTION_STATE: handle_update_audio_transcription_state,
NOTIFICATION_TEST: handle_notification_test,
"restart": handle_restart,
"embeddingsReindexProgress": handle_embeddings_reindex_progress,
"modelState": handle_model_state,
"audioTranscriptionState": handle_audio_transcription_state,
"birdseyeLayout": handle_birdseye_layout,
"onConnect": handle_on_connect,
}
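
The new dispatcher handlers follow the same cache-and-broadcast convention as `modelState` and `birdseyeLayout`: the update topic stores the most recent state and publishes it, and the on-connect topic republishes the cached value so late-joining websocket clients immediately see the current state. A minimal, self-contained sketch of that pattern (the class and `publish` callback here are illustrative stand-ins, not the real `Dispatcher` API):

```python
import json
from typing import Any, Callable


class StateBroadcaster:
    """Illustrative stand-in for the dispatcher's cache-and-broadcast pattern."""

    def __init__(self, publish: Callable[[str, str], None]) -> None:
        self.publish = publish
        self.audio_transcription_state: str = "idle"  # default state, as in this PR

    def handle_update(self, payload: Any) -> None:
        # update topic: cache the newest state, then broadcast it to subscribers
        if payload:
            self.audio_transcription_state = payload
            self.publish(
                "audio_transcription_state",
                json.dumps(self.audio_transcription_state),
            )

    def handle_on_connect(self) -> None:
        # on-connect topic: replay the cached state for newly connected clients
        self.publish(
            "audio_transcription_state",
            json.dumps(self.audio_transcription_state),
        )


if __name__ == "__main__":
    broadcaster = StateBroadcaster(lambda topic, data: print(f"{topic}: {data}"))
    broadcaster.handle_update("processing")  # subscribers see "processing"
    broadcaster.handle_on_connect()          # a reconnecting client gets the cached value
```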
1 change: 1 addition & 0 deletions frigate/const.py
@@ -113,6 +113,7 @@
UPDATE_CAMERA_ACTIVITY = "update_camera_activity"
UPDATE_AUDIO_ACTIVITY = "update_audio_activity"
EXPIRE_AUDIO_ACTIVITY = "expire_audio_activity"
UPDATE_AUDIO_TRANSCRIPTION_STATE = "update_audio_transcription_state"
UPDATE_EVENT_DESCRIPTION = "update_event_description"
UPDATE_REVIEW_DESCRIPTION = "update_review_description"
UPDATE_MODEL_STATE = "update_model_state"
5 changes: 5 additions & 0 deletions frigate/data_processing/post/audio_transcription.py
@@ -13,6 +13,7 @@
from frigate.const import (
CACHE_DIR,
MODEL_CACHE_DIR,
UPDATE_AUDIO_TRANSCRIPTION_STATE,
UPDATE_EVENT_DESCRIPTION,
)
from frigate.data_processing.types import PostProcessDataEnum
@@ -190,6 +191,8 @@ def _transcription_wrapper(self, event: dict[str, any]) -> None:
self.transcription_running = False
self.transcription_thread = None

self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "idle")

def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None:
if topic == "transcribe_audio":
event = request_data["event"]
@@ -203,6 +206,8 @@ def handle_request(self, topic: str, request_data: dict[str, any]) -> str | None

# Mark as running and start the thread
self.transcription_running = True
self.requestor.send_data(UPDATE_AUDIO_TRANSCRIPTION_STATE, "processing")

self.transcription_thread = threading.Thread(
target=self._transcription_wrapper, args=(event,), daemon=True
)
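
These changes pair the existing single-transcription guard with state reporting: `handle_request` publishes `"processing"` when it accepts a job, and `_transcription_wrapper` publishes `"idle"` once the thread finishes, whether or not transcription succeeded. A rough sketch of that single-flight pattern (placeholder names stand in for the real processor and requestor):

```python
import threading
import time


class SingleFlightTranscriber:
    """Illustrative sketch: at most one speech transcription runs at a time."""

    def __init__(self, send_state) -> None:
        self.send_state = send_state  # stand-in for requestor.send_data(...)
        self.transcription_running = False
        self.transcription_thread = None

    def _transcription_wrapper(self, event: dict) -> None:
        try:
            time.sleep(0.1)  # stand-in for the actual whisper inference
        finally:
            # always clear the flag and report idle, even if transcription failed
            self.transcription_running = False
            self.transcription_thread = None
            self.send_state("idle")

    def handle_request(self, event: dict) -> str:
        if self.transcription_running:
            # mirrors the API's 409 Conflict: reject rather than queue
            return "busy"
        self.transcription_running = True
        self.send_state("processing")
        self.transcription_thread = threading.Thread(
            target=self._transcription_wrapper, args=(event,), daemon=True
        )
        self.transcription_thread.start()
        return "started"


if __name__ == "__main__":
    t = SingleFlightTranscriber(lambda state: print("state:", state))
    print(t.handle_request({"id": "event-1"}))  # started
    print(t.handle_request({"id": "event-2"}))  # busy, only one at a time
    time.sleep(0.3)                             # let the worker finish and report idle
```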
1 change: 1 addition & 0 deletions frigate/timeline.py
@@ -109,6 +109,7 @@ def handle_object_detection(
event_data["region"],
),
"attribute": "",
"score": event_data["score"],
},
}

34 changes: 34 additions & 0 deletions web/src/api/ws.tsx
@@ -461,6 +461,40 @@ export function useEmbeddingsReindexProgress(
return { payload: data };
}

export function useAudioTranscriptionProcessState(
revalidateOnFocus: boolean = true,
): { payload: string } {
const {
value: { payload },
send: sendCommand,
} = useWs("audio_transcription_state", "audioTranscriptionState");

const data = useDeepMemo(
payload ? (JSON.parse(payload as string) as string) : "idle",
);

useEffect(() => {
let listener = undefined;
if (revalidateOnFocus) {
sendCommand("audioTranscriptionState");
listener = () => {
if (document.visibilityState == "visible") {
sendCommand("audioTranscriptionState");
}
};
addEventListener("visibilitychange", listener);
}
return () => {
if (listener) {
removeEventListener("visibilitychange", listener);
}
};
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [revalidateOnFocus]);

return { payload: data || "idle" };
}

export function useBirdseyeLayout(revalidateOnFocus: boolean = true): {
payload: string;
} {
30 changes: 29 additions & 1 deletion web/src/components/overlay/ObjectTrackOverlay.tsx
@@ -42,6 +42,7 @@ type ObjectData = {
pathPoints: PathPoint[];
currentZones: string[];
currentBox?: number[];
currentAttributeBox?: number[];
};

export default function ObjectTrackOverlay({
@@ -105,14 +106,25 @@ export default function ObjectTrackOverlay({
selectedObjectIds.length > 0
? ["event_ids", { ids: selectedObjectIds.join(",") }]
: null,
null,
{
revalidateOnFocus: false,
revalidateOnReconnect: false,
dedupingInterval: 30000,
},
);

// Fetch timeline data for each object ID using fixed number of hooks
const { data: timelineData } = useSWR<TrackingDetailsSequence[]>(
selectedObjectIds.length > 0
? `timeline?source_id=${selectedObjectIds.join(",")}&limit=1000`
: null,
{ revalidateOnFocus: false },
null,
{
revalidateOnFocus: false,
revalidateOnReconnect: false,
dedupingInterval: 30000,
},
);

const getZonesFriendlyNames = (zones: string[], config: FrigateConfig) => {
@@ -270,6 +282,7 @@ export default function ObjectTrackOverlay({
);

const currentBox = nearbyTimelineEvent?.data?.box;
const currentAttributeBox = nearbyTimelineEvent?.data?.attribute_box;

return {
objectId,
@@ -278,6 +291,7 @@ export default function ObjectTrackOverlay({
pathPoints: combinedPoints,
currentZones,
currentBox,
currentAttributeBox,
};
})
.filter((obj: ObjectData) => obj.pathPoints.length > 0); // Only include objects with path data
@@ -482,6 +496,20 @@
/>
</g>
)}
{objData.currentAttributeBox && showBoundingBoxes && (
<g>
<rect
x={objData.currentAttributeBox[0] * videoWidth}
y={objData.currentAttributeBox[1] * videoHeight}
width={objData.currentAttributeBox[2] * videoWidth}
height={objData.currentAttributeBox[3] * videoHeight}
fill="none"
stroke={objData.color}
strokeWidth={boxStroke}
opacity="0.9"
/>
</g>
)}
</g>
);
})}
20 changes: 18 additions & 2 deletions web/src/components/overlay/detail/SearchDetailDialog.tsx
@@ -92,6 +92,7 @@ import { DialogPortal } from "@radix-ui/react-dialog";
import { useDetailStream } from "@/context/detail-stream-context";
import { PiSlidersHorizontalBold } from "react-icons/pi";
import { HiSparkles } from "react-icons/hi";
import { useAudioTranscriptionProcessState } from "@/api/ws";

const SEARCH_TABS = ["snapshot", "tracking_details"] as const;
export type SearchTab = (typeof SEARCH_TABS)[number];
@@ -1076,6 +1077,11 @@ function ObjectDetailsTab({
});
}, [search, t]);

// audio transcription processing state

const { payload: audioTranscriptionProcessState } =
useAudioTranscriptionProcessState();

// frigate+ submission

type SubmissionState = "reviewing" | "uploading" | "submitted";
@@ -1431,10 +1437,20 @@
<TooltipTrigger asChild>
<button
aria-label={t("itemMenu.audioTranscription.label")}
className="text-primary/40 hover:text-primary/80"
className={cn(
"text-primary/40",
audioTranscriptionProcessState === "processing"
? "cursor-not-allowed"
: "hover:text-primary/80",
)}
onClick={onTranscribe}
disabled={audioTranscriptionProcessState === "processing"}
>
<FaMicrophone className="size-4" />
{audioTranscriptionProcessState === "processing" ? (
<ActivityIndicator className="size-4" />
) : (
<FaMicrophone className="size-4" />
)}
</button>
</TooltipTrigger>
<TooltipContent>