diff --git a/README.md b/README.md index 12dc9d9..713f588 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,34 @@ If `OPENAI_API_KEY` is missing (or `persist=false`), clean/summary/sentiment are When `persist=true`, `POST /api/get_transcription` updates `transcripts.state` as it runs: `progress` → (`summarizing` →) `done`, or `failed` on errors. +#### `POST /api/get_speech` + +Accepts text input and returns Deepgram text-to-speech (TTS) audio as an MP3 stream. + +Request requirements: +- Provide `text` (or `input`) either as a query string parameter or as a form field (`application/x-www-form-urlencoded` or multipart form). + +Deepgram TTS parameters: +- `model`: Deepgram TTS voice model. See [Deepgram TTS docs](https://developers.deepgram.com/docs/tts-models) for available models. The language is inferred from the model, so choose a model that matches the language of your text. + +- The following parameters are passed through to Deepgram `/v1/speak` when provided (see the [Deepgram TTS API reference](https://developers.deepgram.com/reference/text-to-speech/speak-request)): `encoding`, `container`, `sample_rate`, `bit_rate`, `mip_opt_out`, `tag`, `callback`, `callback_method`. + +Response: +- `Content-Type: audio/mpeg` +- `Content-Disposition: attachment; filename="speech-<id>.mp3"`, where `<id>` is a random hexadecimal identifier + +Example: +``` +curl -X POST http://127.0.0.1:8000/api/get_speech \ + -H 'Authorization: Bearer YOUR_TOKEN' \ + -d 'text=Hello from Satellite' \ + --output speech.mp3 +``` + +Notes: +- Text is split into chunks of at most 2000 characters (the Deepgram input limit), and the chunks are synthesized sequentially; the resulting MP3 parts are concatenated. +- Errors: `400` for missing text, `401` if `API_TOKEN` is set and auth is missing/invalid, `504` on Deepgram timeout, `502` if Deepgram is unreachable; other Deepgram HTTP errors are returned with Deepgram's own status code. 
from fastapi import APIRouter, Depends, FastAPI, HTTPException, UploadFile, File, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import StreamingResponse
import re
import uuid
import json
import httpx
import os
import logging
import subprocess
import sys
from langchain_text_splitters import RecursiveCharacterTextSplitter

import db


_DEEPGRAM_SPEAK_URL = "https://api.deepgram.com/v1/speak"

# Deepgram TTS accepts at most 2000 characters of input text per request:
# https://developers.deepgram.com/docs/text-to-speech#input-text-limit
_TTS_MAX_CHARS_PER_REQUEST = 2000

# Longest slice of an upstream error body that is logged or echoed to the client.
_TTS_ERROR_PREVIEW_CHARS = 500

# Query parameters forwarded to Deepgram `/v1/speak`, mapped to the value used
# when the caller does not supply one. An empty default means "omit the parameter".
# https://developers.deepgram.com/reference/text-to-speech/speak-request
_TTS_PARAM_DEFAULTS = {
    "callback": "",
    "callback_method": "",
    "mip_opt_out": "false",
    "tag": "",
    "bit_rate": "",
    "container": "",
    "encoding": "mp3",
    "model": "",
    "sample_rate": "",
}


def _iter_bytes(data: bytes, *, chunk_size: int):
    """Return an iterator over consecutive slices of ``data``.

    Every slice is ``chunk_size`` bytes long except possibly the last one.
    Empty ``data`` produces no slices.

    Raises:
        ValueError: if ``chunk_size`` is not positive. The check runs eagerly
            (before iteration starts) so that a bad size can neither silently
            drop the whole payload (negative step) nor surface later as an
            opaque ``range()`` error (zero step).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return (data[start : start + chunk_size] for start in range(0, len(data), chunk_size))


async def _collect_speech_params(request: Request) -> dict[str, str]:
    """Merge query-string and form parameters; form fields win on conflicts."""
    try:
        form = await request.form()
    except Exception:
        # Deliberate best effort: if the body cannot be read as a form, fall
        # back to the query string alone instead of failing the request.
        logger.debug("Could not parse request body as form data", exc_info=True)
        form = {}

    # Non-string form values are stringified, as only text parameters are used.
    form_params = {k: v if isinstance(v, str) else str(v) for k, v in form.items()}
    return {**dict(request.query_params), **form_params}


def _split_text_for_tts(text: str) -> list[str]:
    """Split ``text`` into non-empty, stripped chunks within Deepgram's input limit."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=_TTS_MAX_CHARS_PER_REQUEST,
        chunk_overlap=0,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )
    stripped = (piece.strip() for piece in splitter.split_text(text))
    # Drop pieces that are empty after stripping so no empty request is sent upstream.
    return [piece for piece in stripped if piece]


def _build_tts_query_params(input_params: dict[str, str]) -> dict[str, str]:
    """Pick the known Deepgram parameters from ``input_params``, applying defaults.

    Unknown caller parameters are ignored; blank values fall back to the default,
    and parameters whose effective value is empty are omitted.
    """
    params: dict[str, str] = {}
    for name, default in _TTS_PARAM_DEFAULTS.items():
        value = str(input_params.get(name, "")).strip() or default
        if value:
            params[name] = value
    return params


async def _synthesize_chunks(chunks: list[str], params: dict[str, str]) -> list[bytes]:
    """Send each chunk to Deepgram in order and return the audio payloads.

    httpx exceptions (including ``HTTPStatusError`` from ``raise_for_status``)
    propagate to the caller.
    """
    headers = {
        "Authorization": f"Token {DEEPGRAM_API_KEY}",
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    io_timeout_seconds = _get_deepgram_timeout_seconds()
    timeout = httpx.Timeout(
        connect=10.0,
        read=io_timeout_seconds,
        write=io_timeout_seconds,
        pool=10.0,
    )

    audio_parts: list[bytes] = []
    async with httpx.AsyncClient(timeout=timeout) as client:
        for idx, chunk in enumerate(chunks, start=1):
            response = await client.post(
                _DEEPGRAM_SPEAK_URL,
                headers=headers,
                params=params,
                json={"text": chunk},
            )
            # Fail first: error responses are logged by the caller's handler,
            # so the success log below only ever sees a real audio payload.
            response.raise_for_status()
            audio = response.content
            logger.debug(
                "Deepgram TTS response: chunk=%s/%s status=%s content_type=%s bytes=%s",
                idx,
                len(chunks),
                response.status_code,
                response.headers.get("Content-Type"),
                len(audio),
            )
            audio_parts.append(audio)
    return audio_parts


@api_router.post("/get_speech")
async def get_speech(request: Request):
    """Convert text to speech via Deepgram and return the audio as an MP3 download.

    The text is read from ``text`` (or ``input``), supplied as a query parameter
    or form field. It is split into chunks within Deepgram's per-request limit,
    each chunk is synthesized sequentially, and the audio parts are concatenated.

    Raises:
        HTTPException: 400 if no text is provided; Deepgram's own status code if
            Deepgram answers with an HTTP error; 504 on a Deepgram timeout; 502 if
            Deepgram cannot be reached; 500 on any other failure or if Deepgram
            returns no audio.
    """
    # NOTE(review): no auth check is visible in this handler; confirm that the
    # API token requirement is enforced by a router-level dependency.
    input_params = await _collect_speech_params(request)
    logger.debug("Params: %s", input_params)

    text = (input_params.get("text") or input_params.get("input") or "").strip()
    if not text:
        raise HTTPException(status_code=400, detail="Missing required field: text")

    chunks = _split_text_for_tts(text)
    if not chunks:
        raise HTTPException(status_code=400, detail="Text is empty")

    params = _build_tts_query_params(input_params)
    logger.debug("Deepgram TTS params: %s", params)

    try:
        audio_parts = await _synthesize_chunks(chunks, params)
    except httpx.HTTPStatusError as e:
        status = e.response.status_code
        # Bound the upstream body both in the log and in what is echoed to the client.
        body_preview = (e.response.text or "")[:_TTS_ERROR_PREVIEW_CHARS]
        logger.error("Deepgram TTS API error: status=%s body_preview=%s", status, body_preview)
        raise HTTPException(
            status_code=status,
            detail=f"Deepgram API error: {body_preview}",
        ) from e
    except httpx.TimeoutException as e:
        # Must precede RequestError: timeouts are a subclass of it.
        logger.warning("Deepgram TTS request timed out")
        raise HTTPException(status_code=504, detail="Deepgram request timed out") from e
    except httpx.RequestError as e:
        logger.error("Deepgram TTS request failed: %s", str(e))
        raise HTTPException(status_code=502, detail="Failed to reach Deepgram") from e
    except Exception as e:
        logger.exception("Unexpected error while calling Deepgram TTS")
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}") from e

    audio_bytes = b"".join(audio_parts)
    if not audio_bytes:
        raise HTTPException(status_code=500, detail="Deepgram returned empty audio")

    # NOTE(review): callers may override `encoding`/`container`, yet the response
    # is always labelled as MP3 and the parts are byte-concatenated; presumably
    # only MP3 output is intended here - confirm before relying on other formats.
    filename = f"speech-{uuid.uuid4().hex}.mp3"
    headers_out = {
        "Content-Disposition": f'attachment; filename="{filename}"',
        "Cache-Control": "no-store",
        "X-Content-Type-Options": "nosniff",
    }
    return StreamingResponse(
        _iter_bytes(audio_bytes, chunk_size=65536),
        media_type="audio/mpeg",
        headers=headers_out,
    )
class TestGetSpeech:
    """Tests for the /api/get_speech endpoint."""

    @staticmethod
    def _audio_response(content: bytes) -> Mock:
        """Build a stub of a successful Deepgram response carrying ``content``."""
        response = Mock()
        response.content = content
        response.status_code = 200
        response.headers = {"Content-Type": "audio/mpeg"}
        response.raise_for_status = Mock()
        return response

    @staticmethod
    def _install_client(mock_client_class, **post_behaviour) -> AsyncMock:
        """Wire the patched ``httpx.AsyncClient`` class to an async-context-manager stub.

        ``post_behaviour`` is passed to the ``AsyncMock`` standing in for
        ``client.post`` (e.g. ``return_value=...`` or ``side_effect=...``).
        Returns the stub client so tests can inspect the recorded calls.
        """
        mock_client = AsyncMock()
        mock_client.post = AsyncMock(**post_behaviour)
        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
        mock_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_class.return_value = mock_client
        return mock_client

    @patch("httpx.AsyncClient")
    def test_get_speech_returns_mp3_and_filename(self, mock_client_class, client):
        self._install_client(mock_client_class, return_value=self._audio_response(b"MP3DATA"))

        response = client.post("/api/get_speech", data={"text": "hello"})

        assert response.status_code == 200
        assert response.headers["content-type"].startswith("audio/mpeg")
        assert response.content == b"MP3DATA"
        assert "content-disposition" in response.headers
        assert "speech-" in response.headers["content-disposition"]
        assert ".mp3" in response.headers["content-disposition"]

    @patch("httpx.AsyncClient")
    def test_get_speech_splits_and_concatenates(self, mock_client_class, client):
        mock_client = self._install_client(
            mock_client_class,
            side_effect=[self._audio_response(b"AAA"), self._audio_response(b"BBB")],
        )

        # api.py uses a fixed 2000-char chunk size; use a 2001-char input to force 2 requests.
        text = "a" * 2001
        response = client.post("/api/get_speech", data={"text": text})

        assert response.status_code == 200
        assert response.content == b"AAABBB"
        assert mock_client.post.call_count == 2

        # Every request stays within the limit and no input text is lost.
        sent_chunks = [call.kwargs["json"]["text"] for call in mock_client.post.call_args_list]
        assert all(0 < len(chunk) <= 2000 for chunk in sent_chunks)
        assert "".join(sent_chunks) == text

    @patch("httpx.AsyncClient")
    def test_get_speech_ignores_unknown_params(self, mock_client_class, client):
        mock_client = self._install_client(
            mock_client_class, return_value=self._audio_response(b"MP3DATA")
        )

        response = client.post("/api/get_speech?unknown_param=1", data={"text": "hello"})

        assert response.status_code == 200

        _, kwargs = mock_client.post.call_args
        assert "params" in kwargs
        assert "unknown_param" not in kwargs["params"]
        # The default output encoding is still forwarded.
        assert kwargs["params"]["encoding"] == "mp3"

    def test_get_speech_missing_text_returns_400(self, client):
        response = client.post("/api/get_speech", data={})
        assert response.status_code == 400

    def test_get_speech_blank_text_returns_400(self, client):
        # Whitespace-only text is stripped to nothing and must be rejected up front.
        response = client.post("/api/get_speech", data={"text": "   "})
        assert response.status_code == 400

    @patch("httpx.AsyncClient")
    def test_get_speech_deepgram_timeout_returns_504(self, mock_client_class, client):
        self._install_client(
            mock_client_class,
            side_effect=httpx.ReadTimeout("Timed out", request=Mock()),
        )

        response = client.post("/api/get_speech", data={"text": "hello"})
        assert response.status_code == 504

    @patch("httpx.AsyncClient")
    def test_get_speech_deepgram_http_error_propagates_status(self, mock_client_class, client):
        mock_response = Mock()
        mock_response.status_code = 401
        mock_response.text = "Invalid API key"
        mock_response.raise_for_status = Mock(
            side_effect=httpx.HTTPStatusError(
                "Unauthorized", request=Mock(), response=mock_response
            )
        )
        self._install_client(mock_client_class, return_value=mock_response)

        response = client.post("/api/get_speech", data={"text": "hello"})
        assert response.status_code == 401
        assert "Deepgram API error" in response.json()["detail"]