nethesis · Stell0 · Jan 27, 2026
diff --git a/README.md b/README.md
@@ -159,6 +159,34 @@ If `OPENAI_API_KEY` is missing (or `persist=false`), clean/summary/sentiment are
 
 When `persist=true`, `POST /api/get_transcription` updates `transcripts.state` as it runs: `progress` → (`summarizing` →) `done`, or `failed` on errors.
 
+#### `POST /api/get_speech`
+
+Accepts text input and returns a Deepgram text-to-speech (TTS) MP3 stream.
+
+Request requirements:
+- Provide `text` (or `input`) either as a query string parameter or as a form field (`application/x-www-form-urlencoded` or `multipart/form-data`).
+
+Deepgram TTS parameters:
+- `model`: Deepgram TTS voice model. See [Deepgram TTS docs](https://developers.deepgram.com/docs/tts-models) for available models. Note that the language is inferred from the model, so choose a model that matches the language of your text.
+
+- The following parameters are passed through to Deepgram `/v1/speak` when provided (see the [Deepgram TTS docs](https://developers.deepgram.com/reference/text-to-speech/speak-request)): `encoding`, `container`, `sample_rate`, `bit_rate`, `mip_opt_out`, `tag`, `callback`, `callback_method`. If omitted, `encoding` defaults to `mp3` and `mip_opt_out` to `false`.
+
+Response:
+- `Content-Type: audio/mpeg`
+- `Content-Disposition: attachment; filename="speech-<uuid>.mp3"`
+
+Example:
+```
+curl -X POST http://127.0.0.1:8000/api/get_speech \
+    -H 'Authorization: Bearer YOUR_TOKEN' \
+    -d 'text=Hello from Satellite' \
+    --output speech.mp3
+```
+
+Notes:
+- Text is split into chunks of at most 2000 characters (Deepgram input limit) and each chunk is synthesized sequentially; the resulting MP3 parts are concatenated.
+- Errors: `400` for missing text, `401` if `API_TOKEN` is set and auth is missing/invalid, `504` on Deepgram timeout, `502` if Deepgram is unreachable; error statuses returned by Deepgram itself are passed through unchanged.
+
 ## Architecture
 
 Satellite consists of several key components:

diff --git a/api.py b/api.py
@@ -1,11 +1,16 @@
 from fastapi import APIRouter, Depends, FastAPI, HTTPException, UploadFile, File, Request
 from fastapi.concurrency import run_in_threadpool
+from fastapi.responses import StreamingResponse
+import re
+import uuid
 import json
 import httpx
 import os
 import logging
 import subprocess
 import sys
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 
 import db
 
@@ -14,6 +19,10 @@
 
 DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")  # Ensure this environment variable is set
 
+def _iter_bytes(data: bytes, *, chunk_size: int):
+    for i in range(0, len(data), chunk_size):
+        yield data[i : i + chunk_size]
+
 
 def _require_api_token_if_configured(request: Request) -> None:
     configured_token = (os.getenv("API_TOKEN") or "").strip()
@@ -83,6 +92,138 @@ def _get_deepgram_timeout_seconds() -> float:
         logger.warning("Invalid DEEPGRAM_TIMEOUT_SECONDS=%r; defaulting to 300", raw)
         return 300.0
 
+
+@api_router.post("/get_speech")
+async def get_speech(request: Request):
+    # Collect parameters from query string and multipart/x-www-form-urlencoded form fields
+    try:
+        form = await request.form()
+    except Exception:
+        form = {}
+
+    form_params = {}
+    if hasattr(form, "items"):
+        for k, v in form.items():
+            form_params[k] = v if isinstance(v, str) else str(v)
+
+    input_params = {**dict(request.query_params), **form_params}
+    logger.debug("Params: %s", input_params)
+
+    text = (input_params.get("text") or input_params.get("input") or "").strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="Missing required field: text")
+
+    # Use the LangChain text splitter to split the text into smaller chunks.
+    # Deepgram TTS can handle 2000 characters per request https://developers.deepgram.com/docs/text-to-speech#input-text-limit
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=2000,
+        chunk_overlap=0,
+        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
+    )
+    chunks = [c.strip() for c in splitter.split_text(text or "")]
+    if not chunks:
+        raise HTTPException(status_code=400, detail="Text is empty")
+
+    # https://developers.deepgram.com/reference/text-to-speech/speak-request
+    deepgram_params = {
+        "callback": "",
+        "callback_method": "",
+        "mip_opt_out": "false",
+        "tag": "",
+        "bit_rate": "",
+        "container": "",
+        "encoding": "mp3",
+        "model": "",
+        "sample_rate": ""
+    }
+
+    params: dict[str, str] = {}
+    for k, v in deepgram_params.items():
+        if k in input_params and str(input_params[k]).strip():
+            params[k] = str(input_params[k]).strip()
+        elif str(v).strip():
+            params[k] = str(v).strip()
+
+        logger.debug("Deepgram TTS params: %s", params)
+
+    url = "https://api.deepgram.com/v1/speak"
+    headers = {
+        "Authorization": f"Token {DEEPGRAM_API_KEY}",
+        "Content-Type": "application/json",
+        "Accept": "audio/mpeg",
+    }
+
+    audio_parts: list[bytes] = []
+    try:
+        deepgram_timeout_seconds = _get_deepgram_timeout_seconds()
+        timeout = httpx.Timeout(
+            connect=10.0,
+            read=deepgram_timeout_seconds,
+            write=deepgram_timeout_seconds,
+            pool=10.0,
+        )
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            for idx, chunk in enumerate(chunks, start=1):
+                response = await client.post(
+                    url,
+                    headers=headers,
+                    params=params,
+                    json={"text": chunk},
+                )
+                try:
+                    logger.debug(
+                        "Deepgram TTS response: chunk=%s/%s status=%s content_type=%s bytes=%s",
+                        idx,
+                        len(chunks),
+                        response.status_code,
+                        response.headers.get("Content-Type"),
+                        len(response.content or b""),
+                    )
+                except Exception:
+                    logger.debug("Failed to log Deepgram TTS response meta")
+                response.raise_for_status()
+                audio_parts.append(response.content)
+    except httpx.HTTPStatusError as e:
+        try:
+            status = e.response.status_code if e.response is not None else 502
+            body_preview = (
+                e.response.text[:500]
+                if e.response is not None and hasattr(e.response, "text") and e.response.text
+                else ""
+            )
+            logger.error("Deepgram TTS API error: status=%s body_preview=%s", status, body_preview)
+        except Exception:
+            logger.error("Deepgram TTS API error (logging failed)")
+        raise HTTPException(
+            status_code=e.response.status_code if e.response is not None else 502,
+            detail=f"Deepgram API error: {e.response.text if e.response is not None else ''}",
+        )
+    except httpx.TimeoutException:
+        logger.warning("Deepgram TTS request timed out")
+        raise HTTPException(status_code=504, detail="Deepgram request timed out")
+    except httpx.RequestError as e:
+        logger.error("Deepgram TTS request failed: %s", str(e))
+        raise HTTPException(status_code=502, detail="Failed to reach Deepgram")
+    except Exception as e:
+        logger.exception("Unexpected error while calling Deepgram TTS")
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+
+    audio_bytes = b"".join(audio_parts)
+    if not audio_bytes:
+        raise HTTPException(status_code=500, detail="Deepgram returned empty audio")
+
+    filename = f"speech-{uuid.uuid4().hex}.mp3"
+    headers_out = {
+        "Content-Disposition": f'attachment; filename="{filename}"',
+        "Cache-Control": "no-store",
+        "X-Content-Type-Options": "nosniff",
+    }
+    return StreamingResponse(
+        _iter_bytes(audio_bytes, chunk_size=65536),
+        media_type="audio/mpeg",
+        headers=headers_out,
+    )
+
 @api_router.post('/get_transcription')
 async def get_transcription(
     request: Request,

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -322,3 +322,112 @@ def test_missing_paragraphs_transcript_is_error(self, mock_client_class, client,
         assert response.status_code == 500
         assert "Failed to parse transcription response" in response.json()["detail"]
 
+
+class TestGetSpeech:
+    """Tests for the /api/get_speech endpoint."""
+
+    @patch("httpx.AsyncClient")
+    def test_get_speech_returns_mp3_and_filename(self, mock_client_class, client):
+        mock_response = Mock()
+        mock_response.content = b"MP3DATA"
+        mock_response.status_code = 200
+        mock_response.headers = {"Content-Type": "audio/mpeg"}
+        mock_response.raise_for_status = Mock()
+
+        mock_client = AsyncMock()
+        mock_client.post = AsyncMock(return_value=mock_response)
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=False)
+        mock_client_class.return_value = mock_client
+
+        response = client.post("/api/get_speech", data={"text": "hello"})
+
+        assert response.status_code == 200
+        assert response.headers["content-type"].startswith("audio/mpeg")
+        assert response.content == b"MP3DATA"
+        assert "content-disposition" in response.headers
+        assert "speech-" in response.headers["content-disposition"]
+
+    @patch("httpx.AsyncClient")
+    def test_get_speech_splits_and_concatenates(self, mock_client_class, client):
+        resp1 = Mock()
+        resp1.content = b"AAA"
+        resp1.status_code = 200
+        resp1.headers = {"Content-Type": "audio/mpeg"}
+        resp1.raise_for_status = Mock()
+
+        resp2 = Mock()
+        resp2.content = b"BBB"
+        resp2.status_code = 200
+        resp2.headers = {"Content-Type": "audio/mpeg"}
+        resp2.raise_for_status = Mock()
+
+        mock_client = AsyncMock()
+        mock_client.post = AsyncMock(side_effect=[resp1, resp2])
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=False)
+        mock_client_class.return_value = mock_client
+
+        # api.py uses a fixed 2000-char chunk size; use a 2001-char input to force 2 requests.
+        response = client.post("/api/get_speech", data={"text": "a" * 2001})
+
+        assert response.status_code == 200
+        assert response.content == b"AAABBB"
+        assert mock_client.post.call_count == 2
+
+    @patch("httpx.AsyncClient")
+    def test_get_speech_ignores_unknown_params(self, mock_client_class, client):
+        mock_response = Mock()
+        mock_response.content = b"MP3DATA"
+        mock_response.status_code = 200
+        mock_response.headers = {"Content-Type": "audio/mpeg"}
+        mock_response.raise_for_status = Mock()
+
+        mock_client = AsyncMock()
+        mock_client.post = AsyncMock(return_value=mock_response)
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=False)
+        mock_client_class.return_value = mock_client
+
+        response = client.post("/api/get_speech?unknown_param=1", data={"text": "hello"})
+
+        assert response.status_code == 200
+
+        _, kwargs = mock_client.post.call_args
+        assert "params" in kwargs
+        assert "unknown_param" not in kwargs["params"]
+
+    def test_get_speech_missing_text_returns_400(self, client):
+        response = client.post("/api/get_speech", data={})
+        assert response.status_code == 400
+
+    @patch("httpx.AsyncClient")
+    def test_get_speech_deepgram_timeout_returns_504(self, mock_client_class, client):
+        mock_client = AsyncMock()
+        mock_client.post = AsyncMock(side_effect=httpx.ReadTimeout("Timed out", request=Mock()))
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=False)
+        mock_client_class.return_value = mock_client
+
+        response = client.post("/api/get_speech", data={"text": "hello"})
+        assert response.status_code == 504
+
+    @patch("httpx.AsyncClient")
+    def test_get_speech_deepgram_http_error_propagates_status(self, mock_client_class, client):
+        mock_response = Mock()
+        mock_response.status_code = 401
+        mock_response.text = "Invalid API key"
+
+        error = httpx.HTTPStatusError("Unauthorized", request=Mock(), response=mock_response)
+
+        mock_client = AsyncMock()
+        mock_client.post = AsyncMock(return_value=mock_response)
+        mock_response.raise_for_status = Mock(side_effect=error)
+        mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+        mock_client.__aexit__ = AsyncMock(return_value=False)
+        mock_client_class.return_value = mock_client
+
+        response = client.post("/api/get_speech", data={"text": "hello"})
+        assert response.status_code == 401
+        assert "Deepgram API error" in response.json()["detail"]
+