diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 241a703c..6d70587e 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.5 +current_version = 0.5.0 commit = True tag = True tag_name = v{new_version} diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 520482e8..5d6ec098 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -29,11 +29,14 @@ jobs: run: | cd backend python build_binary.py + python build_binary.py --shim PLATFORM=$(rustc --print host-tuple) mkdir -p ../tauri/src-tauri/binaries cp dist/voicebox-server.exe ../tauri/src-tauri/binaries/voicebox-server-${PLATFORM}.exe + cp dist/voicebox-mcp.exe ../tauri/src-tauri/binaries/voicebox-mcp-${PLATFORM}.exe echo "Built voicebox-server-${PLATFORM}.exe" + echo "Built voicebox-mcp-${PLATFORM}.exe" - name: Setup Bun uses: oven-sh/setup-bun@v2 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4f864c55..e94bcd70 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -114,6 +114,7 @@ jobs: run: | cd backend python build_binary.py + python build_binary.py --shim # Get platform tuple PLATFORM=$(rustc --print host-tuple) @@ -123,7 +124,9 @@ jobs: # Copy with platform suffix cp dist/voicebox-server.exe ../tauri/src-tauri/binaries/voicebox-server-${PLATFORM}.exe + cp dist/voicebox-mcp.exe ../tauri/src-tauri/binaries/voicebox-mcp-${PLATFORM}.exe echo "Built voicebox-server-${PLATFORM}.exe" + echo "Built voicebox-mcp-${PLATFORM}.exe" - name: Setup Bun uses: oven-sh/setup-bun@v2 diff --git a/.mcp.json b/.mcp.json new file mode 100644 index 00000000..de6d6caf --- /dev/null +++ b/.mcp.json @@ -0,0 +1,11 @@ +{ + "mcpServers": { + "voicebox": { + "type": "http", + "url": "http://127.0.0.1:17493/mcp", + "headers": { + "X-Voicebox-Client-Id": "claude-code" + } + } + } +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 1559d692..2c6412ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,90 @@ # Changelog -## [Unreleased] +## [0.5.0] - 2026-04-22 + +**The Capture release.** Voicebox stops being just a voice-cloning studio and becomes a full AI voice studio. Hold a key anywhere on your machine, speak, release — the transcript lands in the focused text field. Flip the primitive around and any MCP-aware agent — Claude Code, Cursor, Spacebot — speaks back through an on-screen pill in one of your cloned voices. A local LLM sits between the two, so transcripts come out clean and voice profiles can carry a personality that reshapes what the agent says before it gets spoken. + +### Dictation — speak anywhere, paste anywhere + +- **Global hotkey capture.** Hold a customizable chord anywhere on your machine (defaults: right-Cmd + right-Option on macOS, right-Ctrl + right-Shift on Windows), speak, release. A floating on-screen pill walks through recording → transcribing → refining → done with a live elapsed timer. The transcript lands as clean text. +- **Push-to-talk and toggle modes, each with its own chord.** The default toggle chord adds Space to the push-to-talk chord. Holding PTT and tapping Space mid-hold upgrades a hold into a hands-free session without a gap in the recording. +- **Auto-paste into the focused app.** Once transcription finishes, Voicebox synthesizes a paste into whatever text field had focus when you started the chord — not wherever focus drifted while you were talking. Works across Dvorak / AZERTY layouts. 
Your clipboard is saved before and restored after. +- **Chord picker UI.** Customize either chord from Settings → Captures by holding the keys you want. Left/right modifier badges show whether a key is the left or right variant. +- **Defaults stay out of your way.** macOS defaults avoid left-hand Cmd+Option chords so the system shortcuts they collide with stay yours. Windows defaults route around AltGr collisions on German / French / Spanish layouts. +- **Accessibility permission is scoped.** If macOS Accessibility isn't granted, dictation still runs and transcripts still land in the Captures tab — only synthetic paste is disabled. The permission prompt lives inline next to the auto-paste toggle, not as a global banner. + +### Personality — voice profiles that speak for themselves + +Voice profiles now carry an optional **personality** — a free-form description of who this voice is, up to 2000 characters. When set, two new controls appear next to the generate button, each powered by a new Qwen3 LLM running entirely locally: + +- **Compose** — the shuffle button drops a fresh in-character line into the textarea. Click again for variety, edit before speaking. +- **Speak in character** — the wand toggle runs your input through the personality LLM before TTS, preserving every idea but delivering it in the character's voice. + +The same LLM doubles as the refinement model, so there's one local LLM in the app, not two. + +**API surface.** `POST /generate`, `POST /speak`, and the MCP `voicebox.speak` tool accept `personality: bool`. `POST /profiles/{id}/compose` powers the shuffle button. MCP client bindings carry a `default_personality: bool` that applies when `personality` isn't passed explicitly. + +### Agents — any MCP-aware agent gets a voice + +Voicebox ships a built-in **Model Context Protocol** server at `http://127.0.0.1:17493/mcp` so Claude Code, Cursor, Windsurf, Cline, VS Code MCP extensions — any MCP-aware agent — can call into your local Voicebox install. Four tools ship with dotted names: + +- **`voicebox.speak`** — speak text in any voice profile, with optional `personality: true` to run through the profile's personality LLM first +- **`voicebox.transcribe`** — Whisper transcription of a base64 blob or an absolute local path. Path mode is restricted to loopback callers so a Voicebox bound on `0.0.0.0` doesn't double as an unauthenticated arbitrary-local-file read primitive. +- **`voicebox.list_captures`** — recent captures with their transcripts +- **`voicebox.list_profiles`** — available voice profiles (cloned + preset) + +- **Streamable HTTP as primary transport.** Cursor / Windsurf / VS Code / Claude Code all support it out of the box — drop a `mcpServers` block with the URL and an `X-Voicebox-Client-Id` header. +- **Stdio shim for clients that don't speak HTTP MCP.** A `voicebox-mcp` binary ships inside the app bundle as a Tauri sidecar. The Settings page renders the install snippet with the right absolute path pre-filled. +- **Per-client voice binding.** Pin Claude Code to Morgan, Cursor to Scarlett, Cline to its own voice — the `X-Voicebox-Client-Id` header resolves to a bound voice whenever `speak` is called without an explicit `profile`. Managed in **Settings → MCP**. +- **Profile resolution precedence.** Explicit `profile` arg (name or id, case-insensitive) → per-client binding → global default from `capture_settings.default_playback_voice_id` → error with a pointer to Settings. 
+- **Speaking pill.** Agent-initiated speech surfaces the same on-screen pill as dictation, in a `speaking` state with the profile name and an elapsed timer. Silent background TTS is a trust hazard — the pill always shows what's coming out of your machine. +- **`POST /speak` REST wrapper.** Same code path and voice resolution for shell scripts, ACP, A2A, GitHub Actions, or anything else that isn't MCP-native. + +**Claude Code one-liner:** + +``` +claude mcp add voicebox --transport http --url http://127.0.0.1:17493/mcp --header "X-Voicebox-Client-Id: claude-code" +``` + +### Refinement + +A clean transcript needs more than Whisper. Each capture flows through a small Qwen3 LLM that strips fillers, fixes punctuation, and optionally rewrites self-corrections — all on-device. + +- **Loop-stripping before the LLM sees the transcript.** Whisper's "thanks for watching thanks for watching thanks for watching…" hallucination loops are collapsed at a six-identical-tokens threshold (case-insensitive) so a small refinement model can't echo them back. Coverage spans single-word runs, multi-word phrases, CJK character runs, and Japanese emphasis patterns; legitimate repetition ("no, no, no, no, no") doesn't cross the threshold. +- **Per-capture flag snapshot.** `smart_cleanup`, `self_correction`, and `preserve_technical` are stored on each capture, so refinement can be re-run later with different flags without losing the raw transcript. +- **Model picker** — Qwen3 0.6B (400 MB, very fast), 1.7B (1.1 GB, fast), 4B (2.5 GB, full quality). 0.6B is the default; 1.7B is the sweet spot for transcripts with code identifiers. + +### Captures tab + settings + +Settings → Captures is now the home for the whole dictation flow: + +- **Dictation**: global shortcut toggle, push-to-talk chord picker, toggle chord picker, live pill preview, auto-paste into focused field (with inline accessibility prompt). +- **Transcription**: model picker (Whisper Base / Small / Medium / Large / Turbo), language lock. +- **Refinement**: auto-refine toggle, model picker, smart cleanup, remove self-corrections, preserve technical terms. +- **Playback**: default voice for the Captures tab's "Play as" action — picking a voice from the split-button persists the choice across tab switches and restarts. +- **Storage**: captures folder quick-open. + +### Stories — timeline editor + +The Stories tab graduates from a TTS sequencer into a real timeline editor. Same generation-row backing, but clips now compose with imported audio, per-clip levels, and a flexible track stack. + +- **Import external audio.** Drag a music file onto the story content area or pick one from the new "Import audio" entry in the add-clip popover. Accepted formats: wav / mp3 / flac / ogg / m4a / aac / webm, capped at 200 MB. Imported clips show their filename instead of a profile name and skip the regenerate / version-picker controls — there's nothing to regenerate. +- **Per-clip volume.** A `Volume2` icon in the clip-edit toolbar opens a 0–200% slider. Adjustments apply live and to exports. Split and duplicate carry the volume forward into the new clips. +- **Regenerate** from both the clip's chat-list dropdown and the track-editor toolbar. Re-runs the underlying generation through the same path the History tab uses, with completion tracked in the global pending set. +- **Add empty tracks above or below the timeline** via tiny `+` strips at the top of the topmost label cell and the bottom of the bottommost. Sticky in the label column so they follow horizontal scroll. 
+- **Zoom bar tracks the project.** Min scope is 10 seconds visible (zoomed-in cap), max is the entire project (zoomed-out cap), default lands on 60 s. Both the +/− buttons and the scrollbar edge-drag handles clamp to those dynamic bounds.
+
+### Interface
+
+- **Theme selector.** Light / dark / system in **Settings → General**, persisted across sessions. System mode listens for OS-level appearance changes and flips live without a restart.
+- **Scrubbable waveform player on captures.** The capture detail card now embeds a WaveSurfer waveform with click-to-seek and a current / total timestamp pair, replacing the static duration label.
+- **Capture pill light mode.** The on-screen pill gets a dedicated light palette so it stays legible against bright windows.
+- **Readiness checklist in the Captures settings sidebar.** The same six-gate checklist from the Captures empty state is mirrored into Settings → Captures so a red gate can't hide behind a green toggle. Hidden once every gate is green. macOS-only rows (Input Monitoring, Accessibility) hide entirely on Windows and Linux.
+
+### Windows parity
+
+Same dictation flow on Windows. The right-hand default chord (Ctrl+Shift) avoids AltGr collisions on layouts where Ctrl+Alt is the compose key. Focus is captured at chord-start so paste lands in the original field even if focus drifts during transcribe/refine.

 ## [0.4.5] - 2026-04-22

@@ -657,7 +740,7 @@ The first public release of Voicebox — an open-source voice synthesis studio p

 Tauri v2, React, TypeScript, Tailwind CSS, FastAPI, Qwen3-TTS, Whisper, SQLite

-[Unreleased]: https://github.com/jamiepine/voicebox/compare/v0.4.5...HEAD
+[0.5.0]: https://github.com/jamiepine/voicebox/compare/v0.4.5...v0.5.0
 [0.4.5]: https://github.com/jamiepine/voicebox/compare/v0.4.4...v0.4.5
 [0.4.4]: https://github.com/jamiepine/voicebox/compare/v0.4.3...v0.4.4
 [0.4.3]: https://github.com/jamiepine/voicebox/compare/v0.4.2...v0.4.3
diff --git a/README.md b/README.md
index 8d220202..c201751f 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,9 @@

Voicebox

- The open-source voice synthesis studio.
- Clone voices. Generate speech. Apply effects. Build voice-powered apps.
- All running locally on your machine. + The open-source AI voice studio.
+ Clone any voice. Generate speech. Dictate into any app. Talk to agents in voices you own.
+ The full voice I/O stack, running locally on your machine.

@@ -63,17 +63,22 @@ ## What is Voicebox? -Voicebox is a **local-first voice cloning studio** — a free and open-source alternative to ElevenLabs. Clone voices from a few seconds of audio or pick from 50+ preset voices, generate speech in 23 languages across 7 TTS engines, apply post-processing effects, and compose multi-voice projects with a timeline editor. +Voicebox is a **local-first AI voice studio** — a free and open-source alternative to **ElevenLabs** and **WisprFlow** in one app. Clone voices from a few seconds of audio, generate speech in 23 languages across 7 TTS engines, dictate into any text field with a global hotkey, and give any MCP-aware AI agent a voice of your choosing. -- **Complete privacy** — models and voice data stay on your machine +The two cloud incumbents sit on opposite halves of the voice I/O loop — ElevenLabs on output, WisprFlow on input. Voicebox does both, bridges them with a bundled local LLM for refinement and per-profile personas, and runs the whole thing on your machine. + +- **Complete privacy** — models, voice data, and captures never leave your machine - **7 TTS engines** — Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox Multilingual, Chatterbox Turbo, HumeAI TADA, and Kokoro -- **Cloning and preset voices** — zero-shot cloning from a reference sample, or curated preset voices via Kokoro (50 voices) and Qwen CustomVoice (9 voices) +- **Voice cloning and preset voices** — zero-shot cloning from a reference sample, or 50+ curated preset voices via Kokoro and Qwen CustomVoice - **23 languages** — from English to Arabic, Japanese, Hindi, Swahili, and more - **Post-processing effects** — pitch shift, reverb, delay, chorus, compression, and filters - **Expressive speech** — paralinguistic tags like `[laugh]`, `[sigh]`, `[gasp]` via Chatterbox Turbo; natural-language delivery control via Qwen CustomVoice - **Unlimited length** — auto-chunking with crossfade for scripts, articles, and chapters - **Stories editor** — multi-track timeline for conversations, podcasts, and narratives -- **API-first** — REST API for integrating voice synthesis into your own projects +- **Voice input** — global dictation hotkey with push-to-talk and toggle modes, accessibility-verified auto-paste on macOS, in-app mic on every text field, Whisper-based STT +- **Agent voice output** — one tool call (`voicebox.speak`) and any MCP-aware agent (Claude Code, Cursor, Cline) speaks to you in a voice you've cloned +- **Voice personalities** — attach a free-form persona to any voice profile, then Compose, Rewrite, or Respond via a bundled local LLM — agents can invoke the same modes over MCP +- **API-first** — REST API plus a built-in MCP server for integrating voice I/O into your own apps and agents - **Native performance** — built with Tauri (Rust), not Electron - **Runs everywhere** — macOS (MLX/Metal), Windows (CUDA), Linux, AMD ROCm, Intel Arc, Docker @@ -185,12 +190,69 @@ Multi-voice timeline editor for conversations, podcasts, and narratives. - Auto-playback with synchronized playhead - Version pinning per track clip -### Recording & Transcription +### Global Dictation & Voice Input + +The other half of the voice I/O loop. Hold a hotkey anywhere on your system, speak, release — on macOS the transcript pastes straight into the focused text field. Or hit the mic on any Voicebox text input and dictate directly into the app. + +- **Configurable chord bindings** — hold-to-speak and tap-to-toggle chords, each rebindable in the in-app chord picker. 
Holding push-to-talk and tapping `Space` mid-hold upgrades into a toggle session without a gap in audio +- **Target-aware paste (macOS)** — accessibility-verified injection into the focused text field, with atomic clipboard save/restore so your clipboard isn't clobbered +- **First-run permissions UX** — in-app gates walk you through the macOS Accessibility and Input Monitoring grants with deep-links to System Settings +- **In-app mic button** on every Voicebox text field — generation form, profile descriptions, story titles, anywhere you'd type +- **LLM refinement** — optional cleanup of ums, stutters, and false starts before paste +- **On-screen pill** — floating overlay surfacing `recording`, `transcribing`, `refining`, and `speaking` states. Same pill agents use when they speak to you, so there's one mental model for both directions of the loop + +### Speech-to-Text + +Voicebox runs OpenAI Whisper for transcription — the same model that backs dictation, the Captures tab, and the `/transcribe` API. Running on MLX (Apple Silicon) or PyTorch (CUDA / ROCm / DirectML / CPU) depending on your platform. + +| Size | Notes | +| ----------------------------- | -------------------------------------------------- | +| Base / Small / Medium / Large | Standard Whisper quality ladder | +| Turbo | ~8x faster than Whisper Large, minimal quality loss | + +More engines (Parakeet v3, Qwen3-ASR) are planned — see [Roadmap](#roadmap). + +### Captures + +Every dictation, in-app recording, and uploaded audio file lands in the Captures tab — original audio paired with transcript, always preserved. + +- **Replay, re-transcribe, refine** — rerun STT with any Whisper size, or re-run the raw transcript through the local LLM with different flags (filler cleanup, self-correction removal, technical-term preservation) +- **Edit inline** — tweak the transcript and save on blur +- **Play as voice profile** — turn any capture into speech with a cloned voice, one click +- **Promote to voice sample** — use a capture's audio + transcript as a reference sample on any voice profile +- **Local capture storage** — original audio and transcript stay in your Voicebox data directory, with a folder shortcut in Settings + +### Agent Voice Output + +Every agent gets a voice. One tool call and any MCP-aware agent can speak to you in a voice you've cloned — task completions, questions, notifications. The same pill that surfaces during dictation surfaces during agent speech, so you always see what's coming out of your machine. + +```ts +// In any MCP-aware agent: +await voicebox.speak({ + text: "Deploy complete.", + profile: "Morgan", +}); +``` + +Also exposed as `POST /speak` for anything that doesn't speak MCP — ACP, A2A, shell scripts, custom harnesses. + +- **Bidirectional pill** — `recording`, `transcribing`, `refining`, and `speaking` are all states of the same OS-level overlay, so dictation and agent speech share one surface +- **Per-agent voice binding** — in **Settings → MCP**, pin Claude Code to Morgan and Cursor to Scarlett so you can tell which agent is talking without looking. 
Each client's `last_seen_at` timestamp confirms the install actually took +- **Always visible** — no silent background TTS; every agent-initiated speak surfaces the pill with the voice profile name for the full duration +- **HTTP + stdio transports** — install as a URL in Claude Code / Cursor / Windsurf / VS Code MCP, or point stdio-only clients at the bundled `voicebox-mcp` binary + +### Voice Personalities + +Attach a free-form personality to any voice profile — who this voice is, how they speak, what they care about. Two actions appear on the generate box when a personality is set, powered by a bundled Qwen3 LLM running entirely locally. + +- **Compose** — a shuffle button that drops a fresh in-character line into the textarea; edit and speak, or click again for a different take +- **Speak in character** — a toggle that routes your input text through the personality LLM to be rewritten in their voice before TTS + +Agents can reach the same rewrite path over MCP by passing `personality: true` to `voicebox.speak`, turning the tool into a text-in → personality-LLM → TTS pipeline. The same LLM backs dictation's refinement step — one LLM in the app, one model cache, one GPU-memory footprint. -- In-app recording with waveform visualization -- System audio capture (macOS and Windows) -- Automatic transcription powered by Whisper (including Whisper Turbo) -- Export recordings in multiple formats +**Local LLM options:** Qwen3 0.6B / 1.7B / 4B, sharing the TTS runtime (MLX on Apple Silicon, PyTorch elsewhere). + +Use cases: agent dev loops (dictate a question, hear the answer in a cloned voice), interactive characters for games and narrative tools, speech assistance for people who can't speak in their original voice. ### Model Management @@ -214,55 +276,121 @@ Multi-voice timeline editor for conversations, podcasts, and narratives. ## API -Voicebox exposes a full REST API for integrating voice synthesis into your own apps. +Voicebox exposes a REST API for integrating voice I/O into your own apps and agents. ```bash # Generate speech -curl -X POST http://localhost:17493/generate \ +curl -X POST http://127.0.0.1:17493/generate \ -H "Content-Type: application/json" \ -d '{"text": "Hello world", "profile_id": "abc123", "language": "en"}' +# Agent voice output — any app or script can speak in a cloned voice +curl -X POST http://127.0.0.1:17493/speak \ + -H "Content-Type: application/json" \ + -H "X-Voicebox-Client-Id: my-script" \ + -d '{"text": "Deploy complete.", "profile": "Morgan"}' + +# Transcribe an audio file +curl -X POST http://127.0.0.1:17493/transcribe \ + -F "audio=@recording.wav" \ + -F "model=whisper-turbo" + # List voice profiles -curl http://localhost:17493/profiles +curl http://127.0.0.1:17493/profiles +``` + +`POST /speak` accepts `profile` as a name (case-insensitive) or id, and resolves via the same precedence as the MCP tool: explicit arg → per-client binding → `capture_settings.default_playback_voice_id`. + +### MCP server + +Voicebox ships a built-in **Model Context Protocol** server so any MCP-aware agent (Claude Code, Cursor, Windsurf, Cline, VS Code MCP extensions) can speak, transcribe, and browse captures and profiles. 
+ +**Claude Code one-liner:** -# Create a profile -curl -X POST http://localhost:17493/profiles \ - -H "Content-Type: application/json" \ - -d '{"name": "My Voice", "language": "en"}' +``` +claude mcp add voicebox \ + --transport http \ + --url http://127.0.0.1:17493/mcp \ + --header "X-Voicebox-Client-Id: claude-code" ``` -**Use cases:** game dialogue, podcast production, accessibility tools, voice assistants, content automation. +**Any HTTP MCP client** (Cursor, Windsurf, VS Code, etc.): + +```json +{ + "mcpServers": { + "voicebox": { + "url": "http://127.0.0.1:17493/mcp", + "headers": { "X-Voicebox-Client-Id": "cursor" } + } + } +} +``` -Full API documentation available at `http://localhost:17493/docs`. +**Stdio fallback** for clients that don't speak HTTP MCP — point at the bundled `voicebox-mcp` binary inside the app: + +```json +{ + "mcpServers": { + "voicebox": { + "command": "/Applications/Voicebox.app/Contents/MacOS/voicebox-mcp", + "env": { "VOICEBOX_CLIENT_ID": "claude-desktop" } + } + } +} +``` + +Four tools ship: `voicebox.speak`, `voicebox.transcribe`, `voicebox.list_captures`, `voicebox.list_profiles`. Per-client voice bindings are managed in **Voicebox → Settings → MCP**. See the [full MCP guide](docs/content/docs/overview/mcp-server.mdx) for tool signatures, resolution precedence, the speaking-pill contract, and security notes. + +```ts +// In any MCP-aware agent: +await voicebox.speak({ + text: "Tests passing. Ready to merge.", + profile: "Morgan", // optional — falls back to the per-client binding + personality: true, // optional — rewrites text through the profile's personality LLM first +}); +``` + +**Use cases:** agent dev loops (voice in, voice out), game dialogue, podcast production, accessibility tools, voice assistants, content automation. + +Full API documentation available at `http://127.0.0.1:17493/docs`. 
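To make the resolution order concrete, here is a minimal TypeScript sketch of the documented precedence. The function and parameter names are hypothetical (the real logic lives in the Python backend); only the ordering and the case-insensitive name-or-id matching rule come from the docs above:

```ts
// Hypothetical sketch of /speak and voicebox.speak voice resolution.
// Ordering and case-insensitive name-or-id matching follow the docs;
// everything else here is illustrative.
interface Profile {
  id: string;
  name: string;
}

function resolveVoice(
  explicitProfile: string | undefined, // `profile` arg: name or id
  clientBinding: string | null, // binding for X-Voicebox-Client-Id
  globalDefault: string | null, // capture_settings.default_playback_voice_id
  profiles: Profile[],
): Profile {
  const match = (key: string) =>
    profiles.find(
      (p) => p.id === key || p.name.toLowerCase() === key.toLowerCase(),
    );
  // Walk the chain in precedence order; first hit wins.
  for (const candidate of [explicitProfile, clientBinding, globalDefault]) {
    const resolved = candidate ? match(candidate) : undefined;
    if (resolved) return resolved;
  }
  throw new Error('No voice profile resolved. Set a binding in Settings → MCP.');
}
```

Note that this sketch silently falls through when an explicit `profile` matches nothing; whether the real server errors immediately on a bad explicit name isn't specified here.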
--- ## Tech Stack -| Layer | Technology | -| ------------- | ------------------------------------------------- | -| Desktop App | Tauri (Rust) | -| Frontend | React, TypeScript, Tailwind CSS | -| State | Zustand, React Query | -| Backend | FastAPI (Python) | +| Layer | Technology | +| ------------- | ------------------------------------------------------------------------------- | +| Desktop App | Tauri (Rust) | +| Frontend | React, TypeScript, Tailwind CSS | +| State | Zustand, React Query | +| Backend | FastAPI (Python) | | TTS Engines | Qwen3-TTS, Qwen CustomVoice, LuxTTS, Chatterbox, Chatterbox Turbo, TADA, Kokoro | -| Effects | Pedalboard (Spotify) | -| Transcription | Whisper / Whisper Turbo (PyTorch or MLX) | -| Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | -| Database | SQLite | -| Audio | WaveSurfer.js, librosa | +| STT | Whisper / Whisper Turbo (PyTorch or MLX) | +| Local LLM | Qwen3 (0.6B / 1.7B / 4B), shared runtime with TTS / STT | +| MCP Server | FastMCP mounted at `/mcp` (Streamable HTTP) + bundled stdio shim binary | +| Native Shim | Rust (inside Tauri) for global hotkey, paste injection, focus introspection | +| Effects | Pedalboard (Spotify) | +| Inference | MLX (Apple Silicon) / PyTorch (CUDA/ROCm/XPU/CPU) | +| Database | SQLite | +| Audio | WaveSurfer.js, librosa | --- ## Roadmap -| Feature | Description | -| ----------------------- | ---------------------------------------------- | -| **Real-time Streaming** | Stream audio as it generates, word by word | -| **Voice Design** | Create new voices from text descriptions | -| **More Models** | XTTS, Bark, and other open-source voice models | -| **Plugin Architecture** | Extend with custom models and effects | -| **Mobile Companion** | Control Voicebox from your phone | +| Feature | Description | +| ---------------------------------- | ------------------------------------------------------------------------ | +| **Windows / Linux auto-paste** | Dictation paste parity — `SendInput` on Windows, `uinput` / AT-SPI on Linux | +| **STT engine expansion** | Parakeet v3 and Qwen3-ASR joining Whisper — 50+ languages, better non-English quality | +| **Pipeline routing** | Configurable source → transform → sink chains with webhook + MCP sinks and a preset editor | +| **Streaming transcription** | WebSocket `/transcribe/stream` for partial transcripts as you speak | +| **End-to-end speech LLMs** | Moshi, GLM-4-Voice, Qwen2.5 Omni — real voice-to-voice, no text between | +| **Voice Design** | Create new voices from text descriptions | +| **Long-form capture** | Dual-stream recorder (mic + system audio) with summary LLM transform | +| **Platform sinks** | Apple Notes, Obsidian, and other opt-in integrations | +| **Plugin architecture** | Extend with custom models, transforms, and sinks | +| **Mobile companion** | Control Voicebox from your phone | For the **full engineering status, open-issue triage, and prioritized work queue**, see [`docs/PROJECT_STATUS.md`](docs/PROJECT_STATUS.md) — a living document that tracks what's shipped, what's in-flight, candidate TTS engines under evaluation, and why we've accepted or backlogged specific integrations. @@ -286,6 +414,8 @@ Install [just](https://github.com/casey/just): `brew install just` or `cargo ins **Prerequisites:** [Bun](https://bun.sh), [Rust](https://rustup.rs), [Python 3.11+](https://python.org), [Tauri Prerequisites](https://v2.tauri.app/start/prerequisites/), and [Xcode](https://developer.apple.com/xcode/) on macOS. 
+The repo ships a pre-wired `.mcp.json` at the root — running Claude Code inside this checkout picks up the Voicebox MCP tools automatically once the dev app is running. + ### Building Locally ```bash diff --git a/app/index.html b/app/index.html index c7a4be9f..2a155139 100644 --- a/app/index.html +++ b/app/index.html @@ -1,10 +1,26 @@ - + voicebox +

diff --git a/app/package.json b/app/package.json
index a94b30b6..56bf162a 100644
--- a/app/package.json
+++ b/app/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@voicebox/app",
-  "version": "0.4.5",
+  "version": "0.5.0",
   "private": true,
   "type": "module",
   "scripts": {
diff --git a/app/src/App.tsx b/app/src/App.tsx
index ba07b7c1..177f97e8 100644
--- a/app/src/App.tsx
+++ b/app/src/App.tsx
@@ -1,11 +1,14 @@
 import { RouterProvider } from '@tanstack/react-router';
 import { useEffect, useRef, useState } from 'react';
 import voiceboxLogo from '@/assets/voicebox-logo.png';
+import { DictateWindow } from '@/components/DictateWindow/DictateWindow';
 import ShinyText from '@/components/ShinyText';
 import { TitleBarDragRegion } from '@/components/TitleBarDragRegion';
 import { useAutoUpdater } from '@/hooks/useAutoUpdater';
+import { useThemeSync } from '@/hooks/useThemeSync';
 import { apiClient } from '@/lib/api/client';
 import type { HealthResponse } from '@/lib/api/types';
+import { useChordSync } from '@/lib/hooks/useChordSync';
 import { TOP_SAFE_AREA_PADDING } from '@/lib/constants/ui';
 import { cn } from '@/lib/utils/cn';
 import { usePlatform } from '@/platform/PlatformContext';
@@ -17,6 +20,11 @@ import {
   useServerStore,
 } from '@/stores/serverStore';

+function isDictateView(): boolean {
+  if (typeof window === 'undefined') return false;
+  return new URLSearchParams(window.location.search).get('view') === 'dictate';
+}
+
 /**
  * Validate that a health response has the expected Voicebox-specific shape.
  * Prevents misidentifying an unrelated service on the same port.
  */
@@ -68,6 +76,19 @@ const LOADING_MESSAGES = [
 ];

 function App() {
+  useThemeSync();
+
+  // The dictate window runs in a separate Tauri webview that must skip
+  // server bootstrap (the main window owns that lifecycle) and render only
+  // the floating recording surface. Split into a sibling component so the
+  // main app's hooks are not called on the dictate path.
+  if (isDictateView()) {
+    return <DictateWindow />;
+  }
+  return <MainApp />;
+}
+
+function MainApp() {
   const platform = usePlatform();
   const [serverReady, setServerReady] = useState(false);
   const [startupError, setStartupError] = useState<string | null>(null);
@@ -77,6 +98,10 @@ function App() {
   // Automatically check for app updates on startup and show toast notifications
   useAutoUpdater({ checkOnMount: true, showToast: true });

+  // Replay the saved chord into the Rust hotkey listener every time
+  // capture_settings resolves or the user edits the chord.
+  useChordSync();
+
   // Sync stored setting to Rust on startup
   useEffect(() => {
     if (platform.metadata.isTauri) {
diff --git a/app/src/assets/sponsors/openai.svg b/app/src/assets/sponsors/openai.svg
new file mode 100644
index 00000000..859d7af3
--- /dev/null
+++ b/app/src/assets/sponsors/openai.svg
@@ -0,0 +1 @@
+ 
\ No newline at end of file
diff --git a/app/src/components/AccessibilityGate/AccessibilityGate.tsx b/app/src/components/AccessibilityGate/AccessibilityGate.tsx
new file mode 100644
index 00000000..45307aa7
--- /dev/null
+++ b/app/src/components/AccessibilityGate/AccessibilityGate.tsx
@@ -0,0 +1,126 @@
+import { invoke } from '@tauri-apps/api/core';
+import { listen, type UnlistenFn } from '@tauri-apps/api/event';
+import { AlertTriangle, ExternalLink } from 'lucide-react';
+import { useCallback, useEffect, useState } from 'react';
+import { Trans, useTranslation } from 'react-i18next';
+import { Button } from '@/components/ui/button';
+import { usePlatform } from '@/platform/PlatformContext';
+
+/**
+ * Tracks macOS Accessibility permission state.
Without this permission the + * global chord can still record, but the synthetic-⌘V paste silently drops — + * so callers can surface an inline prompt instead of relying on the + * system-level permission dialog (which only fires once, the first time the + * app tries to post a keystroke). + * + * Triggered on three signals: + * - app mount in Tauri + * - `system:accessibility-missing` event from the dictate window's paste + * failure handler + * - window focus (cheap way to re-check after the user flips the toggle in + * System Settings and alt-tabs back) + */ +export function useAccessibilityPermission() { + const platform = usePlatform(); + const [needsPermission, setNeedsPermission] = useState(false); + const [checking, setChecking] = useState(false); + + const recheck = useCallback(async (): Promise => { + if (!platform.metadata.isTauri) return true; + setChecking(true); + try { + const trusted = await invoke('check_accessibility_permission'); + setNeedsPermission(!trusted); + return trusted; + } catch (err) { + console.warn('[accessibility] check failed:', err); + return false; + } finally { + setChecking(false); + } + }, [platform.metadata.isTauri]); + + useEffect(() => { + if (!platform.metadata.isTauri) return; + recheck(); + const onFocus = () => { + recheck(); + }; + window.addEventListener('focus', onFocus); + return () => window.removeEventListener('focus', onFocus); + }, [platform.metadata.isTauri, recheck]); + + useEffect(() => { + if (!platform.metadata.isTauri) return; + let unlisten: UnlistenFn | null = null; + listen('system:accessibility-missing', () => { + setNeedsPermission(true); + }) + .then((fn) => { + unlisten = fn; + }) + .catch(() => {}); + return () => { + if (unlisten) unlisten(); + }; + }, [platform.metadata.isTauri]); + + const openSettings = useCallback(async () => { + try { + await invoke('open_accessibility_settings'); + } catch (err) { + console.warn('[accessibility] open settings failed:', err); + } + }, []); + + return { needsPermission, checking, recheck, openSettings }; +} + +/** + * Inline notice rendered next to the auto-paste setting when macOS + * Accessibility permission is missing. Returns null when the permission is + * already granted. + */ +export function AccessibilityNotice() { + const { t } = useTranslation(); + const { needsPermission, checking, recheck, openSettings } = useAccessibilityPermission(); + const [stillMissing, setStillMissing] = useState(false); + + const handleRecheck = useCallback(async () => { + setStillMissing(false); + const trusted = await recheck(); + if (!trusted) setStillMissing(true); + }, [recheck]); + + if (!needsPermission) return null; + + return ( +
+
+ +
+

+ {t('captures.permissions.accessibility.title')} +

+

+ }} /> +

+
+ + +
+ {stillMissing && !checking && ( +

+ {t('captures.permissions.accessibility.stillMissing')} +

+ )} +
+
+
+ ); +} diff --git a/app/src/components/AudioBars.tsx b/app/src/components/AudioBars.tsx new file mode 100644 index 00000000..69601ccc --- /dev/null +++ b/app/src/components/AudioBars.tsx @@ -0,0 +1,39 @@ +import { motion } from 'framer-motion'; + +import { cn } from '@/lib/utils/cn'; + +export type AudioBarsMode = 'idle' | 'generating' | 'playing'; + +interface AudioBarsProps { + mode: AudioBarsMode; + className?: string; + barClassName?: string; +} + +export function AudioBars({ mode, className, barClassName }: AudioBarsProps) { + const activeColor = mode !== 'idle' ? 'bg-accent' : 'bg-muted-foreground/40'; + return ( +
+ {[0, 1, 2, 3, 4].map((i) => ( + + ))} +
+ ); +} diff --git a/app/src/components/CapturePill/CapturePill.tsx b/app/src/components/CapturePill/CapturePill.tsx new file mode 100644 index 00000000..d8609415 --- /dev/null +++ b/app/src/components/CapturePill/CapturePill.tsx @@ -0,0 +1,198 @@ +import { motion } from 'framer-motion'; +import { AlertCircle } from 'lucide-react'; +import { useTranslation } from 'react-i18next'; +import { cn } from '@/lib/utils/cn'; + +/** + * Pill state machine shared between the settings preview and the live + * recording pill in the Captures tab. + */ +export type PillState = + | 'recording' + | 'transcribing' + | 'refining' + | 'speaking' + | 'completed' + | 'rest' + | 'error'; + +const PILL_LABEL_KEYS: Record, string> = { + recording: 'captures.pill.recording', + transcribing: 'captures.pill.transcribing', + refining: 'captures.pill.refining', + speaking: 'captures.pill.speaking', + completed: 'captures.pill.completed', +}; + +function barModeFor( + state: Exclude, +): 'generating' | 'playing' | 'idle' { + if (state === 'recording' || state === 'speaking') return 'playing'; + if (state === 'completed' || state === 'rest') return 'idle'; + return 'generating'; +} + +export function PillAudioBars({ mode }: { mode: 'generating' | 'playing' | 'idle' }) { + return ( +
+ {[0, 1, 2, 3, 4].map((i) => ( + + ))} +
+ ); +} + +function formatElapsed(ms: number): string { + const total = Math.max(0, Math.floor(ms / 1000)); + const m = Math.floor(total / 60); + const s = total % 60; + return `${m}:${String(s).padStart(2, '0')}`; +} + +/** + * Floating pill shown during capture. `state` drives the label, dot animation, + * and bar motion; `elapsedMs` freezes at whatever the caller last passed in + * (recording advances the timer, transcribing/refining hold the final value). + * The ``error`` state renders a destructive variant — a clickable pill that + * copies its message to the clipboard on press and calls ``onDismiss``. + */ +export function CapturePill({ + state, + elapsedMs, + onStop, + errorMessage, + onDismiss, + className, +}: { + state: PillState; + elapsedMs: number; + onStop?: () => void; + errorMessage?: string | null; + onDismiss?: () => void; + className?: string; +}) { + const { t } = useTranslation(); + + if (state === 'error') { + return ( + + ); + } + + const visible = state !== 'rest'; + const labelText = t(state === 'rest' ? PILL_LABEL_KEYS.recording : PILL_LABEL_KEYS[state]); + const barMode = barModeFor(state); + + const dot = ( + + {state === 'recording' && ( + + )} + + + ); + + const stopButton = onStop && state === 'recording' ? ( + + ) : dot; + + // Completed gets an inset accent stroke (via box-shadow, not Tailwind's + // ring — ring utility doesn't compose with arbitrary shadow-[…]) to mark + // the success moment without changing the pill's dimensions. + const completedStroke = + state === 'completed' + ? 'shadow-[inset_0_0_0_2px_hsl(var(--accent)/0.6)]' + : null; + + return ( +
+ {stopButton} + + {labelText} + + + + {formatElapsed(elapsedMs)} + +
+ ); +} + +function ErrorPill({ + message, + onDismiss, + className, +}: { + message: string; + onDismiss?: () => void; + className?: string; +}) { + const { t } = useTranslation(); + const handleClick = async () => { + try { + await navigator.clipboard.writeText(message); + } catch { + // Clipboard access can be denied in rare webview configs — ignore, + // we still want the dismiss to land. + } + onDismiss?.(); + }; + + return ( + + ); +} + diff --git a/app/src/components/CapturesTab/CaptureInlinePlayer.tsx b/app/src/components/CapturesTab/CaptureInlinePlayer.tsx new file mode 100644 index 00000000..2796c004 --- /dev/null +++ b/app/src/components/CapturesTab/CaptureInlinePlayer.tsx @@ -0,0 +1,156 @@ +import { Loader2, Pause, Play } from 'lucide-react'; +import { useEffect, useRef, useState } from 'react'; +import WaveSurfer from 'wavesurfer.js'; +import { Button } from '@/components/ui/button'; +import { cn } from '@/lib/utils/cn'; +import { debug } from '@/lib/utils/debug'; + +function formatDuration(ms?: number | null): string { + if (!ms || ms < 0) return '0:00'; + const total = Math.round(ms / 1000); + const m = Math.floor(total / 60); + const s = total % 60; + return `${m}:${String(s).padStart(2, '0')}`; +} + +export function CaptureInlinePlayer({ + audioUrl, + fallbackDurationMs, + className, +}: { + audioUrl: string; + fallbackDurationMs?: number | null; + className?: string; +}) { + const waveformRef = useRef(null); + const wavesurferRef = useRef(null); + const [isPlaying, setIsPlaying] = useState(false); + const [isLoading, setIsLoading] = useState(true); + const [duration, setDuration] = useState(0); + const [currentTime, setCurrentTime] = useState(0); + const [error, setError] = useState(null); + + useEffect(() => { + const container = waveformRef.current; + if (!container) return; + + const root = document.documentElement; + const cssHsla = (varName: string, alpha: number) => { + const value = getComputedStyle(root).getPropertyValue(varName).trim(); + if (!value) return ''; + const [h, s, l] = value.split(/\s+/); + if (!h || !s || !l) return ''; + return `hsla(${h}, ${s}, ${l}, ${alpha})`; + }; + + const ws = WaveSurfer.create({ + container, + waveColor: cssHsla('--muted-foreground', 1), + progressColor: cssHsla('--accent', 1), + cursorColor: 'transparent', + barWidth: 2, + barRadius: 2, + barGap: 2, + height: 40, + normalize: true, + interact: true, + dragToSeek: { debounceTime: 0 }, + mediaControls: false, + backend: 'WebAudio', + }); + + ws.on('ready', () => { + setDuration(ws.getDuration()); + setIsLoading(false); + setError(null); + }); + ws.on('play', () => setIsPlaying(true)); + ws.on('pause', () => setIsPlaying(false)); + ws.on('finish', () => { + setIsPlaying(false); + setCurrentTime(ws.getDuration()); + }); + ws.on('timeupdate', (t) => setCurrentTime(t)); + ws.on('seeking', (t) => setCurrentTime(t)); + ws.on('error', (err) => { + debug.error('Inline waveform error', err); + setError(err instanceof Error ? 
err.message : String(err)); + setIsLoading(false); + }); + + wavesurferRef.current = ws; + + return () => { + try { + ws.destroy(); + } catch (err) { + debug.error('Failed to destroy inline waveform', err); + } + wavesurferRef.current = null; + }; + }, []); + + useEffect(() => { + const ws = wavesurferRef.current; + if (!ws) return; + setIsLoading(true); + setError(null); + setCurrentTime(0); + setDuration(0); + setIsPlaying(false); + try { + if (ws.isPlaying()) ws.pause(); + ws.seekTo(0); + } catch (err) { + debug.error('Failed to reset inline waveform before load', err); + } + ws.load(audioUrl).catch((err) => { + debug.error('Inline waveform load failed', err); + setError(err instanceof Error ? err.message : String(err)); + setIsLoading(false); + }); + }, [audioUrl]); + + const handlePlayPause = () => { + const ws = wavesurferRef.current; + if (!ws || isLoading) return; + if (ws.isPlaying()) { + ws.pause(); + } else { + ws.play().catch((err) => { + debug.error('Inline play failed', err); + setError(err instanceof Error ? err.message : String(err)); + }); + } + }; + + const displayMs = + duration > 0 + ? Math.round((isPlaying || currentTime > 0 ? currentTime : duration) * 1000) + : (fallbackDurationMs ?? 0); + + return ( +
+ +
+ + {error ? '—' : formatDuration(displayMs)} + +
+ ); +} diff --git a/app/src/components/CapturesTab/CapturesTab.tsx b/app/src/components/CapturesTab/CapturesTab.tsx new file mode 100644 index 00000000..7492d320 --- /dev/null +++ b/app/src/components/CapturesTab/CapturesTab.tsx @@ -0,0 +1,909 @@ +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { Link } from '@tanstack/react-router'; +import { listen, type UnlistenFn } from '@tauri-apps/api/event'; +import { save } from '@tauri-apps/plugin-dialog'; +import { writeFile, writeTextFile } from '@tauri-apps/plugin-fs'; +import { + Captions, + Check, + ChevronDown, + CircleDot, + Copy, + Download, + FileAudio, + FileText, + Loader2, + Mic, + Settings2, + Sparkles, + Square, + Trash2, + Upload, + Volume2, +} from 'lucide-react'; +import { useEffect, useMemo, useRef, useState } from 'react'; +import { useTranslation } from 'react-i18next'; +import { AudioBars } from '@/components/AudioBars'; +import { CapturePill } from '@/components/CapturePill/CapturePill'; +import { CaptureInlinePlayer } from '@/components/CapturesTab/CaptureInlinePlayer'; +import { DictationReadinessChecklist } from '@/components/CapturesTab/DictationReadinessChecklist'; +import { + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from '@/components/ui/alert-dialog'; +import { Badge } from '@/components/ui/badge'; +import { Button } from '@/components/ui/button'; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuLabel, + DropdownMenuSeparator, + DropdownMenuTrigger, +} from '@/components/ui/dropdown-menu'; +import { Textarea } from '@/components/ui/textarea'; +import { + ListPane, + ListPaneHeader, + ListPaneScroll, + ListPaneSearch, + ListPaneTitle, + ListPaneTitleRow, +} from '@/components/ListPane'; +import { useToast } from '@/components/ui/use-toast'; +import { apiClient } from '@/lib/api/client'; +import type { + CaptureListResponse, + CaptureResponse, + CaptureSource, + VoiceProfileResponse, +} from '@/lib/api/types'; +import type { LanguageCode } from '@/lib/constants/languages'; +import { BOTTOM_SAFE_AREA_PADDING } from '@/lib/constants/ui'; +import { useCaptureRecordingSession } from '@/lib/hooks/useCaptureRecordingSession'; +import { useDictationReadiness } from '@/lib/hooks/useDictationReadiness'; +import { useCaptureSettings } from '@/lib/hooks/useSettings'; +import { cn } from '@/lib/utils/cn'; +import { formatAbsoluteDate, formatDate } from '@/lib/utils/format'; +import { displayLabelForKey, modifierSideHint } from '@/lib/utils/keyCodes'; +import { useGenerationStore } from '@/stores/generationStore'; +import { usePlayerStore } from '@/stores/playerStore'; + +const CAPTURE_AUDIO_MIME = 'audio/*,.wav,.mp3,.m4a,.flac,.ogg,.webm'; + +function formatDuration(ms?: number | null): string { + if (!ms || ms < 0) return '0:00'; + const total = Math.round(ms / 1000); + const m = Math.floor(total / 60); + const s = total % 60; + return `${m}:${String(s).padStart(2, '0')}`; +} + +function ChordKeys({ keys }: { keys: string[] }) { + if (keys.length === 0) return null; + return ( +
+ {keys.map((k) => { + const side = modifierSideHint(k); + return ( + + {displayLabelForKey(k)} + {side ? ( + + {side} + + ) : null} + + ); + })} +
+ ); +} + +function SourceBadge({ source }: { source: CaptureSource }) { + const { t } = useTranslation(); + const Icon = source === 'dictation' ? Mic : source === 'recording' ? CircleDot : FileAudio; + const label = + source === 'dictation' + ? t('captures.source.dictation') + : source === 'recording' + ? t('captures.source.recording') + : t('captures.source.file'); + return ( + + + {label} + + ); +} + +type PlaybackState = 'idle' | 'generating' | 'playing'; + +export function CapturesTab() { + const { t } = useTranslation(); + const queryClient = useQueryClient(); + const { toast } = useToast(); + const fileInputRef = useRef(null); + const uploadInputRef = useRef(null); + + const snippetOf = (capture: CaptureResponse): string => { + const source = capture.transcript_refined || capture.transcript_raw || ''; + return source.trim() || t('captures.snippetEmpty'); + }; + + const [selectedId, setSelectedId] = useState(null); + const [search, setSearch] = useState(''); + const [showRefined, setShowRefined] = useState(true); + const [launchedPlayAsId, setLaunchedPlayAsId] = useState(null); + const [deleteDialogOpen, setDeleteDialogOpen] = useState(false); + + const audioUrl = usePlayerStore((s) => s.audioUrl); + const playerAudioId = usePlayerStore((s) => s.audioId); + const playerIsPlaying = usePlayerStore((s) => s.isPlaying); + const isPlayerVisible = !!audioUrl; + + const setIsPlaying = usePlayerStore((s) => s.setIsPlaying); + + const addPendingGeneration = useGenerationStore((s) => s.addPendingGeneration); + const pendingGenerationIds = useGenerationStore((s) => s.pendingGenerationIds); + + const { settings: captureSettings, update: updateCaptureSettings } = useCaptureSettings(); + const sttModel = captureSettings?.stt_model ?? 'turbo'; + const llmModel = captureSettings?.llm_model ?? '0.6B'; + const hotkeyEnabled = captureSettings?.hotkey_enabled ?? false; + const pushToTalkKeys = captureSettings?.chord_push_to_talk_keys ?? []; + const toggleToTalkKeys = captureSettings?.chord_toggle_to_talk_keys ?? []; + const readiness = useDictationReadiness(); + + const session = useCaptureRecordingSession({ + onCaptureCreated: (capture) => setSelectedId(capture.id), + }); + + const { data: capturesData, isLoading: capturesLoading } = useQuery({ + queryKey: ['captures'], + queryFn: () => apiClient.listCaptures(200, 0), + }); + + const { data: profiles } = useQuery({ + queryKey: ['profiles'], + queryFn: () => apiClient.listProfiles(), + }); + + const captures = capturesData?.items ?? []; + + // Keep a selection. If the current selection disappears (e.g. deletion), + // fall through to the first capture, then to null. + useEffect(() => { + if (!captures.length) { + if (selectedId !== null) setSelectedId(null); + return; + } + if (!selectedId || !captures.find((c) => c.id === selectedId)) { + setSelectedId(captures[0].id); + } + }, [captures, selectedId]); + + // Live sync from sibling Tauri webviews (the floating dictate window). + // ``capture:created`` carries the full row so we can seed the cache before + // the refetch lands and focus the new capture in one shot — without the + // seed, the selection-guard effect would snap back to ``captures[0]`` in + // the race window between ``setSelectedId(new)`` and the refetched list + // actually containing the new row. 
+ useEffect(() => { + const unlistens: Promise[] = []; + unlistens.push( + listen<{ capture: CaptureResponse }>('capture:created', (event) => { + const capture = event.payload?.capture; + if (capture) { + queryClient.setQueryData(['captures'], (prev) => { + if (!prev) return prev; + if (prev.items.some((c) => c.id === capture.id)) return prev; + return { ...prev, items: [capture, ...prev.items], total: prev.total + 1 }; + }); + setSelectedId(capture.id); + } + queryClient.invalidateQueries({ queryKey: ['captures'] }); + }), + ); + unlistens.push( + listen('capture:updated', () => { + queryClient.invalidateQueries({ queryKey: ['captures'] }); + }), + ); + return () => { + for (const p of unlistens) p.then((fn) => fn()).catch(() => {}); + }; + }, [queryClient]); + + const filtered = useMemo(() => { + const q = search.trim().toLowerCase(); + if (!q) return captures; + return captures.filter((c) => { + const raw = (c.transcript_raw || '').toLowerCase(); + const refined = (c.transcript_refined || '').toLowerCase(); + return raw.includes(q) || refined.includes(q); + }); + }, [search, captures]); + + const selected = captures.find((c) => c.id === selectedId) ?? null; + // Source of truth is capture_settings.default_playback_voice_id, shared + // with Settings → Captures and the MCP global default. Stale ids (e.g. + // referenced profile was deleted) fall through to the first profile. + const storedVoiceId = captureSettings?.default_playback_voice_id ?? null; + const playAsVoice = + (storedVoiceId && profiles?.find((p) => p.id === storedVoiceId)) || + profiles?.[0] || + null; + const playAsVoiceId = playAsVoice?.id ?? null; + + const deleteMutation = useMutation({ + mutationFn: async (captureId: string) => apiClient.deleteCapture(captureId), + onSuccess: () => { + setDeleteDialogOpen(false); + queryClient.invalidateQueries({ queryKey: ['captures'] }); + }, + onError: (err: Error) => { + toast({ title: t('captures.toast.deleteFailed'), description: err.message, variant: 'destructive' }); + }, + }); + + const playAsMutation = useMutation({ + mutationFn: async ({ capture, voice }: { capture: CaptureResponse; voice: VoiceProfileResponse }) => { + const text = capture.transcript_refined || capture.transcript_raw; + if (!text.trim()) throw new Error(t('captures.noTranscriptError')); + const language = (capture.language || voice.language) as LanguageCode; + // Preset profiles (Kokoro etc.) reject the qwen default — honor the + // profile's stored engine preference. Cloned profiles without an + // override fall through to whatever the backend picks. + const engine = voice.default_engine as + | 'qwen' | 'qwen_custom_voice' | 'luxtts' | 'chatterbox' + | 'chatterbox_turbo' | 'tada' | 'kokoro' + | undefined; + return apiClient.generateSpeech({ + profile_id: voice.id, + text, + language, + engine, + }); + }, + onSuccess: (result) => { + // /generate is queue-based — it returns a generating row with an empty + // audio_path. Hand the id to the global SSE handler which polls + // /generation/{id}/status and triggers autoplay on completion. + setLaunchedPlayAsId(result.id); + addPendingGeneration(result.id); + }, + onError: (err: Error) => { + toast({ title: t('captures.toast.playAsFailed'), description: err.message, variant: 'destructive' }); + }, + }); + + const playbackState: PlaybackState = playAsMutation.isPending + ? 'generating' + : launchedPlayAsId && pendingGenerationIds.has(launchedPlayAsId) + ? 'generating' + : launchedPlayAsId && playerAudioId === launchedPlayAsId && playerIsPlaying + ? 
'playing' + : 'idle'; + + const handleUploadClick = () => uploadInputRef.current?.click(); + + const handleUploadFile = (e: React.ChangeEvent, source: CaptureSource) => { + const file = e.target.files?.[0]; + e.target.value = ''; + if (!file) return; + session.uploadFile(file, source); + }; + + const handleCopy = async () => { + if (!selected) return; + const text = showRefined + ? selected.transcript_refined || selected.transcript_raw + : selected.transcript_raw; + try { + await navigator.clipboard.writeText(text || ''); + toast({ title: t('captures.toast.transcriptCopied') }); + } catch { + toast({ title: t('captures.toast.copyFailed'), variant: 'destructive' }); + } + }; + + const exportToastSuccess = (path: string) => { + const name = path.split(/[\\/]/).pop() ?? path; + toast({ title: t('captures.toast.exportSuccess', { path: name }) }); + }; + + const exportToastError = (err: unknown) => { + toast({ + title: t('captures.toast.exportFailed'), + description: err instanceof Error ? err.message : String(err), + variant: 'destructive', + }); + }; + + const handleExportAudio = async () => { + if (!selected) return; + try { + const dest = await save({ + defaultPath: `capture_${selected.id.slice(0, 8)}.wav`, + filters: [{ name: 'Audio', extensions: ['wav'] }], + }); + if (!dest) return; + const res = await fetch(apiClient.getCaptureAudioUrl(selected.id)); + if (!res.ok) throw new Error(`HTTP ${res.status}`); + const buf = new Uint8Array(await res.arrayBuffer()); + await writeFile(dest, buf); + exportToastSuccess(dest); + } catch (err) { + exportToastError(err); + } + }; + + const handleExportTranscript = async () => { + if (!selected) return; + const text = (selected.transcript_refined || selected.transcript_raw || '').trim(); + if (!text) { + toast({ title: t('captures.toast.exportEmpty'), variant: 'destructive' }); + return; + } + try { + const dest = await save({ + defaultPath: `capture_${selected.id.slice(0, 8)}.txt`, + filters: [{ name: 'Text', extensions: ['txt'] }], + }); + if (!dest) return; + await writeTextFile(dest, text); + exportToastSuccess(dest); + } catch (err) { + exportToastError(err); + } + }; + + const buildCaptureMarkdown = (capture: CaptureResponse): string => { + const lines: string[] = []; + lines.push(`# Capture ${capture.id}`, ''); + lines.push(`- **Source:** ${capture.source}`); + lines.push(`- **Created:** ${capture.created_at}`); + if (capture.duration_ms != null) lines.push(`- **Duration:** ${formatDuration(capture.duration_ms)}`); + if (capture.language) lines.push(`- **Language:** ${capture.language}`); + if (capture.stt_model) lines.push(`- **STT model:** ${capture.stt_model}`); + if (capture.llm_model) lines.push(`- **LLM model:** ${capture.llm_model}`); + lines.push(''); + if (capture.transcript_refined?.trim()) { + lines.push('## Refined transcript', '', capture.transcript_refined.trim(), ''); + } + if (capture.transcript_raw?.trim()) { + lines.push('## Raw transcript', '', capture.transcript_raw.trim(), ''); + } + return lines.join('\n'); + }; + + const handleExportMarkdown = async () => { + if (!selected) return; + const hasContent = (selected.transcript_refined || selected.transcript_raw || '').trim(); + if (!hasContent) { + toast({ title: t('captures.toast.exportEmpty'), variant: 'destructive' }); + return; + } + try { + const dest = await save({ + defaultPath: `capture_${selected.id.slice(0, 8)}.md`, + filters: [{ name: 'Markdown', extensions: ['md'] }], + }); + if (!dest) return; + await writeTextFile(dest, buildCaptureMarkdown(selected)); + 
exportToastSuccess(dest); + } catch (err) { + exportToastError(err); + } + }; + + const handlePlayAs = (voice?: VoiceProfileResponse) => { + if (!selected) return; + // Stop the current playback when the button is in its 'playing' state + // and the user clicked the main button without picking a new voice. + if (!voice && playbackState === 'playing') { + setIsPlaying(false); + return; + } + const target = voice ?? playAsVoice; + if (!target) { + toast({ + title: t('captures.toast.noVoice'), + description: t('captures.toast.noVoiceDescription'), + variant: 'destructive', + }); + return; + } + if (voice && voice.id !== playAsVoiceId) { + updateCaptureSettings({ default_playback_voice_id: voice.id }); + } + playAsMutation.mutate({ capture: selected, voice: target }); + }; + + return ( +
+ handleUploadFile(e, 'file')} + className="hidden" + /> + handleUploadFile(e, 'file')} + className="hidden" + /> + + {/* Left: capture list */} +
+ + + + {t('captures.title')} + + {t('captures.beta')} + + + + + + +
+ {capturesLoading ? ( +
+ +
+ ) : filtered.length === 0 ? ( +
+ {search ? ( +

{t('captures.empty.noMatches', { query: search })}

+ ) : ( +

{t('captures.empty.none')}

+ )} +
+ ) : ( + filtered.map((capture) => { + const isActive = selectedId === capture.id; + const refined = !!capture.transcript_refined; + return ( + + ); + }) + )} +
+
+
+
+ + {/* Right: capture detail */} +
+
+ + {/* Top action bar */} +
+
+
+ + + {t('captures.header.modelSummary', { + stt: sttModel.charAt(0).toUpperCase() + sttModel.slice(1), + llm: llmModel, + })} + +
+
+ {session.pillState !== 'hidden' && ( + + )} + {session.pillState === 'hidden' && ( + <> + + {readiness.canRecord && ( + + )} + + )} + {/* Hide Dictate when recording readiness fails so the user can't kick off + a capture that has nowhere to land. Stop stays visible if a + recording is somehow already in flight (e.g. a model was + uninstalled mid-record) so the user can always cancel. */} + {(readiness.canRecord || session.isRecording) && ( + + )} +
+
+ + {selected ? ( +
+ {/* Meta row */} +
+ {formatAbsoluteDate(selected.created_at)} + {selected.language && ( + <> + · + {selected.language.toUpperCase()} + + )} + · + +
+ + {/* Audio player card */} +
+ +
+ + {/* Transcript header */} +
+
+ + +
+
+ + {showRefined && selected.transcript_refined + ? t('captures.transcript.refinedHint', { model: selected.llm_model ?? llmModel }) + : selected.stt_model + ? t('captures.transcript.rawHint', { model: selected.stt_model }) + : null} + +
+ + {/* Transcript body */} +
+