Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,34 @@ If `OPENAI_API_KEY` is missing (or `persist=false`), clean/summary/sentiment are

When `persist=true`, `POST /api/get_transcription` updates `transcripts.state` as it runs: `progress` → (`summarizing` →) `done`, or `failed` on errors.

#### `POST /api/get_speech`

Accepts text input and returns a Deepgram text-to-speech (TTS) MP3 stream.

Request requirements:
- Provide `text` (or `input`) as either query string parameters or as form fields (`application/x-www-form-urlencoded` or multipart form)

Deepgram TTS parameters:
- `model`: Deepgram TTS voice model. See [Deepgram TTS docs](https://developers.deepgram.com/docs/tts-models) for available models. Note that the language is inferred from the model, so choose a model that matches the language of your text.

- The following parameters are passed through to Deepgram `/v1/speak` when provided: `encoding`, `container`, `sample_rate`, `bit_rate`, `mip_opt_out`, `tag`, `callback`, `callback_method`. See [Deepgram TTS docs](https://developers.deepgram.com/reference/text-to-speech/speak-request) for their meaning.

Response:
- `Content-Type: audio/mpeg`
- `Content-Disposition: attachment; filename="speech-<uuid>.mp3"`

Example:
```
curl -X POST http://127.0.0.1:8000/api/get_speech \
-H 'Authorization: Bearer YOUR_TOKEN' \
-d 'text=Hello from Satellite' \
--output speech.mp3
```

Notes:
- Text is split into chunks of at most 2000 characters (Deepgram input limit), preferring paragraph, line, and sentence boundaries. Each chunk is synthesized sequentially, and the resulting MP3 parts are concatenated.
- Errors: `400` for missing text, `401` if `API_TOKEN` is set and auth is missing/invalid, `504` on Deepgram timeout, `502` if Deepgram is unreachable.

## Architecture

Satellite consists of several key components:
Expand Down
141 changes: 141 additions & 0 deletions api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from fastapi import APIRouter, Depends, FastAPI, HTTPException, UploadFile, File, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import StreamingResponse
import re
import uuid
import json
import httpx
import os
import logging
import subprocess
import sys
from langchain_text_splitters import RecursiveCharacterTextSplitter


import db

Expand All @@ -14,6 +19,10 @@

DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY") # Ensure this environment variable is set

def _iter_bytes(data: bytes, *, chunk_size: int):
for i in range(0, len(data), chunk_size):
yield data[i : i + chunk_size]


def _require_api_token_if_configured(request: Request) -> None:
configured_token = (os.getenv("API_TOKEN") or "").strip()
Expand Down Expand Up @@ -83,6 +92,138 @@ def _get_deepgram_timeout_seconds() -> float:
logger.warning("Invalid DEEPGRAM_TIMEOUT_SECONDS=%r; defaulting to 300", raw)
return 300.0


async def _collect_speech_input_params(request: Request) -> dict[str, str]:
    """Merge the query-string and form-body parameters of ``request``.

    Form fields (urlencoded or multipart) override query parameters that share
    the same name. Non-string form values are converted with ``str()``.
    """
    form_params: dict[str, str] = {}
    try:
        form = await request.form()
    except Exception:
        # Best effort: a body that cannot be parsed as a form must not fail the
        # request, because the text may have been supplied in the query string.
        logger.debug("Could not parse request body as form data", exc_info=True)
    else:
        form_params = {
            key: value if isinstance(value, str) else str(value)
            for key, value in form.items()
        }
    return {**dict(request.query_params), **form_params}


def _build_deepgram_tts_params(input_params: dict[str, str]) -> dict[str, str]:
    """Return the query parameters to forward to Deepgram ``/v1/speak``.

    Only the keys listed below are forwarded, so unknown caller parameters are
    dropped. A non-blank caller value wins over the default; keys whose
    effective value is blank are omitted entirely.
    """
    # https://developers.deepgram.com/reference/text-to-speech/speak-request
    defaults = {
        "callback": "",
        "callback_method": "",
        "mip_opt_out": "false",
        "tag": "",
        "bit_rate": "",
        "container": "",
        "encoding": "mp3",
        "model": "",
        "sample_rate": "",
    }
    params: dict[str, str] = {}
    for key, default in defaults.items():
        value = str(input_params.get(key, "")).strip() or default.strip()
        if value:
            params[key] = value
    return params


@api_router.post("/get_speech")
async def get_speech(request: Request):
    """Synthesize the submitted text with Deepgram TTS and return it as MP3.

    The text is read from the ``text`` (or ``input``) parameter, supplied
    either in the query string or as a form field. It is split into chunks of
    at most 2000 characters, each chunk is sent to Deepgram sequentially, and
    the returned audio parts are concatenated into one response body.

    Returns:
        A ``StreamingResponse`` with media type ``audio/mpeg`` and an
        attachment filename of the form ``speech-<uuid>.mp3``.

    Raises:
        HTTPException: 400 when no text is supplied; Deepgram's own status
            code when Deepgram answers with an HTTP error; 504 when the
            Deepgram request times out; 502 when Deepgram cannot be reached;
            500 on any other failure or when Deepgram returns no audio.
    """
    # NOTE(review): no auth check is visible in this handler, while the README
    # documents a 401 when API_TOKEN is set - confirm it is enforced at router
    # or application level.
    input_params = await _collect_speech_input_params(request)
    logger.debug("Params: %s", input_params)

    text = (input_params.get("text") or input_params.get("input") or "").strip()
    if not text:
        raise HTTPException(status_code=400, detail="Missing required field: text")

    # Use the langchain text splitter to break the text into smaller chunks.
    # Deepgram TTS can handle 2000 characters per request:
    # https://developers.deepgram.com/docs/text-to-speech#input-text-limit
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=0,
        separators=["\n\n", "\n", ".", "?", "!", " ", ""],
    )
    # Drop pieces that are blank after stripping so no empty request is sent.
    stripped = (piece.strip() for piece in splitter.split_text(text))
    chunks = [piece for piece in stripped if piece]
    if not chunks:
        raise HTTPException(status_code=400, detail="Text is empty")

    params = _build_deepgram_tts_params(input_params)
    logger.debug("Deepgram TTS params: %s", params)

    url = "https://api.deepgram.com/v1/speak"
    headers = {
        "Authorization": f"Token {DEEPGRAM_API_KEY}",
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }

    audio_parts: list[bytes] = []
    try:
        deepgram_timeout_seconds = _get_deepgram_timeout_seconds()
        timeout = httpx.Timeout(
            connect=10.0,
            read=deepgram_timeout_seconds,
            write=deepgram_timeout_seconds,
            pool=10.0,
        )
        async with httpx.AsyncClient(timeout=timeout) as client:
            for idx, chunk in enumerate(chunks, start=1):
                response = await client.post(
                    url,
                    headers=headers,
                    params=params,
                    json={"text": chunk},
                )
                # Fail fast on HTTP errors; the handler below logs the status
                # and a preview of the error body.
                response.raise_for_status()
                audio = response.content
                logger.debug(
                    "Deepgram TTS response: chunk=%s/%s status=%s content_type=%s bytes=%s",
                    idx,
                    len(chunks),
                    response.status_code,
                    response.headers.get("Content-Type"),
                    len(audio),
                )
                audio_parts.append(audio)
    except httpx.HTTPStatusError as e:
        upstream = e.response
        status = upstream.status_code if upstream is not None else 502
        body = (upstream.text or "") if upstream is not None else ""
        logger.error(
            "Deepgram TTS API error: status=%s body_preview=%s", status, body[:500]
        )
        # Propagate Deepgram's status code so callers can tell apart e.g. an
        # invalid key (401) from a rejected request (400).
        raise HTTPException(
            status_code=status,
            detail=f"Deepgram API error: {body}",
        ) from e
    except httpx.TimeoutException as e:
        # Must precede RequestError: timeouts are a subclass of it.
        logger.warning("Deepgram TTS request timed out")
        raise HTTPException(status_code=504, detail="Deepgram request timed out") from e
    except httpx.RequestError as e:
        logger.error("Deepgram TTS request failed: %s", str(e))
        raise HTTPException(status_code=502, detail="Failed to reach Deepgram") from e
    except Exception as e:
        logger.exception("Unexpected error while calling Deepgram TTS")
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}") from e

    audio_bytes = b"".join(audio_parts)
    if not audio_bytes:
        raise HTTPException(status_code=500, detail="Deepgram returned empty audio")

    # NOTE(review): the response is always labelled audio/mpeg with an .mp3
    # filename, even though callers may override `encoding` - confirm whether
    # non-MP3 encodings should be rejected or labelled accordingly.
    filename = f"speech-{uuid.uuid4().hex}.mp3"
    headers_out = {
        "Content-Disposition": f'attachment; filename="{filename}"',
        "Cache-Control": "no-store",
        "X-Content-Type-Options": "nosniff",
    }
    return StreamingResponse(
        _iter_bytes(audio_bytes, chunk_size=65536),
        media_type="audio/mpeg",
        headers=headers_out,
    )

@api_router.post('/get_transcription')
async def get_transcription(
request: Request,
Expand Down
109 changes: 109 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,112 @@ def test_missing_paragraphs_transcript_is_error(self, mock_client_class, client,
assert response.status_code == 500
assert "Failed to parse transcription response" in response.json()["detail"]


class TestGetSpeech:
    """Tests for the /api/get_speech endpoint."""

    @staticmethod
    def _audio_response(payload):
        """Build a mocked successful Deepgram response carrying ``payload``."""
        fake = Mock()
        fake.content = payload
        fake.status_code = 200
        fake.headers = {"Content-Type": "audio/mpeg"}
        fake.raise_for_status = Mock()
        return fake

    @staticmethod
    def _install_client(mock_client_class, **post_behaviour):
        """Wire a mocked async client into the patched ``httpx.AsyncClient``.

        ``post_behaviour`` is forwarded to the ``AsyncMock`` standing in for
        ``client.post`` (``return_value=...`` or ``side_effect=...``).
        """
        fake_client = AsyncMock()
        fake_client.post = AsyncMock(**post_behaviour)
        fake_client.__aenter__ = AsyncMock(return_value=fake_client)
        fake_client.__aexit__ = AsyncMock(return_value=False)
        mock_client_class.return_value = fake_client
        return fake_client

    @patch("httpx.AsyncClient")
    def test_get_speech_returns_mp3_and_filename(self, mock_client_class, client):
        self._install_client(
            mock_client_class, return_value=self._audio_response(b"MP3DATA")
        )

        response = client.post("/api/get_speech", data={"text": "hello"})

        assert response.status_code == 200
        assert response.headers["content-type"].startswith("audio/mpeg")
        assert response.content == b"MP3DATA"
        assert "content-disposition" in response.headers
        assert "speech-" in response.headers["content-disposition"]

    @patch("httpx.AsyncClient")
    def test_get_speech_splits_and_concatenates(self, mock_client_class, client):
        parts = [self._audio_response(b"AAA"), self._audio_response(b"BBB")]
        fake_client = self._install_client(mock_client_class, side_effect=parts)

        # api.py uses a fixed 2000-char chunk size; use a 2001-char input to force 2 requests.
        response = client.post("/api/get_speech", data={"text": "a" * 2001})

        assert response.status_code == 200
        assert response.content == b"AAABBB"
        assert fake_client.post.call_count == 2

    @patch("httpx.AsyncClient")
    def test_get_speech_ignores_unknown_params(self, mock_client_class, client):
        fake_client = self._install_client(
            mock_client_class, return_value=self._audio_response(b"MP3DATA")
        )

        response = client.post("/api/get_speech?unknown_param=1", data={"text": "hello"})

        assert response.status_code == 200

        # Only the allow-listed Deepgram parameters may reach the upstream call.
        sent_kwargs = fake_client.post.call_args.kwargs
        assert "params" in sent_kwargs
        assert "unknown_param" not in sent_kwargs["params"]

    def test_get_speech_missing_text_returns_400(self, client):
        response = client.post("/api/get_speech", data={})
        assert response.status_code == 400

    @patch("httpx.AsyncClient")
    def test_get_speech_deepgram_timeout_returns_504(self, mock_client_class, client):
        timeout_error = httpx.ReadTimeout("Timed out", request=Mock())
        self._install_client(mock_client_class, side_effect=timeout_error)

        response = client.post("/api/get_speech", data={"text": "hello"})
        assert response.status_code == 504

    @patch("httpx.AsyncClient")
    def test_get_speech_deepgram_http_error_propagates_status(self, mock_client_class, client):
        # The upstream response only carries what the error path reads.
        rejected = Mock()
        rejected.status_code = 401
        rejected.text = "Invalid API key"
        rejected.raise_for_status = Mock(
            side_effect=httpx.HTTPStatusError(
                "Unauthorized", request=Mock(), response=rejected
            )
        )
        self._install_client(mock_client_class, return_value=rejected)

        response = client.post("/api/get_speech", data={"text": "hello"})
        assert response.status_code == 401
        assert "Deepgram API error" in response.json()["detail"]

Loading