Skip to content

Commit 5bfb1e7

Browse files
committed
Add speed control to web API and UI
Introduces support for a 'speed' parameter in the web API and playground UI, allowing users to adjust audio playback speed from 0.25x to 4.0x. The API now validates and applies the speed parameter, returns speed metadata in response headers, and handles audio processing errors with a dedicated exception. The playground UI includes a speed slider and displays speed/chunk stats. Also updates version references to 3.4.0-alpha3 and improves error handling in audio chunk combination.
1 parent 8c357cf commit 5bfb1e7

File tree

8 files changed

+198
-19
lines changed

8 files changed

+198
-19
lines changed

CHANGELOG.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [3.4.0-alpha3] - 2025-10-26
9+
10+
### Fixed
11+
- **Critical bug fix**: The `speed` parameter was not being extracted from API requests in the web app
12+
- Web API endpoint `/v1/audio/speech` now correctly extracts the `speed` parameter and passes it to the TTSFM client
13+
- Added proper validation for speed parameter (must be between 0.25 and 4.0)
14+
- Speed adjustment now works correctly for both single-chunk and long-text generation
15+
16+
### Changed
17+
- **Separated Docker build workflows**: Split monolithic workflow into two independent files
18+
- `.github/workflows/docker-build-full.yml` - Builds full variant with ffmpeg
19+
- `.github/workflows/docker-build-slim.yml` - Builds slim variant without ffmpeg
20+
- Improved clarity, debugging, and parallel execution
21+
- Independent cache scopes for each variant
22+
23+
### Added
24+
- Speed metadata headers in API responses:
25+
- `X-Requested-Speed`: The speed value requested by the client
26+
- `X-Speed-Applied`: Whether speed adjustment was actually applied (true/false)
27+
28+
## [3.4.0-alpha2] - 2025-10-25
29+
30+
### Changed
31+
- Improved Docker workflow configuration for dual image variants
32+
- Enhanced documentation for Docker image variants
33+
834
## [3.4.0-alpha1] - 2025-10-23
935

1036
### Added

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
8686
version_scheme = "no-guess-dev"
8787
local_scheme = "no-local-version"
8888

89-
fallback_version = "3.4.0-alpha1"
89+
fallback_version = "3.4.0-alpha3"
9090
[tool.setuptools]
9191
packages = ["ttsfm"]
9292

tests/test_audio_processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,10 @@ def test_combine_mp3_without_ffmpeg(self, monkeypatch):
9494
monkeypatch.setattr(ttsfm.audio, "FFMPEG_AVAILABLE", False)
9595

9696
from ttsfm.audio import combine_audio_chunks
97+
from ttsfm.exceptions import AudioProcessingException
9798

9899
chunks = [b"chunk1", b"chunk2"]
99-
with pytest.raises(RuntimeError, match="MP3 audio requires pydub and ffmpeg"):
100+
with pytest.raises(AudioProcessingException, match="MP3 audio requires pydub and ffmpeg"):
100101
combine_audio_chunks(chunks, format_type="mp3")
101102

102103
def test_combine_wav_without_ffmpeg(self, monkeypatch):

ttsfm-web/app.py

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,12 @@
4444
try:
4545
from ttsfm import AudioFormat, TTSClient, TTSException, Voice
4646
from ttsfm.audio import combine_audio_chunks
47-
from ttsfm.exceptions import APIException, NetworkException, ValidationException
47+
from ttsfm.exceptions import (
48+
APIException,
49+
AudioProcessingException,
50+
NetworkException,
51+
ValidationException,
52+
)
4853
from ttsfm.models import get_supported_format
4954
from ttsfm.utils import split_text_by_length
5055
except ImportError:
@@ -768,7 +773,7 @@ def get_status():
768773
{
769774
"status": "online",
770775
"tts_service": "openai.fm (free)",
771-
"package_version": "3.3.7",
776+
"package_version": "3.4.0a3",
772777
"timestamp": datetime.now().isoformat(),
773778
}
774779
)
@@ -792,7 +797,7 @@ def get_status():
792797
def health_check():
793798
"""Simple health check endpoint."""
794799
return jsonify(
795-
{"status": "healthy", "package_version": "3.3.7", "timestamp": datetime.now().isoformat()}
800+
{"status": "healthy", "package_version": "3.4.0a3", "timestamp": datetime.now().isoformat()}
796801
)
797802

798803

@@ -866,6 +871,7 @@ def openai_speech():
866871
voice = data.get("voice", "alloy")
867872
response_format = data.get("response_format", "mp3")
868873
instructions = data.get("instructions", "").strip() or None
874+
speed = data.get("speed") # Optional: 0.25 to 4.0
869875

870876
# TTSFM-specific parameters
871877
# New parameter: auto-combine long text (default: True)
@@ -927,16 +933,48 @@ def openai_speech():
927933
400,
928934
)
929935

936+
# Validate speed parameter if provided
937+
if speed is not None:
938+
try:
939+
speed = float(speed)
940+
if not 0.25 <= speed <= 4.0:
941+
return (
942+
jsonify(
943+
{
944+
"error": {
945+
"message": "Speed must be between 0.25 and 4.0",
946+
"type": "invalid_request_error",
947+
"code": "invalid_speed",
948+
}
949+
}
950+
),
951+
400,
952+
)
953+
except (ValueError, TypeError):
954+
return (
955+
jsonify(
956+
{
957+
"error": {
958+
"message": "Speed must be a number",
959+
"type": "invalid_request_error",
960+
"code": "invalid_speed",
961+
}
962+
}
963+
),
964+
400,
965+
)
966+
930967
effective_format = get_supported_format(format_enum)
931968

932969
logger.info(
933970
"OpenAI API: Generating speech: text='%s...', voice=%s, "
934-
"requested_format=%s (effective=%s), auto_combine=%s",
971+
"requested_format=%s (effective=%s), auto_combine=%s, speed=%s",
935972
input_text[:50],
936973
voice,
937974
response_format,
938975
effective_format.value,
939976
auto_combine,
977+
speed,
940978
)
941979

942980
client = create_tts_client()
@@ -958,6 +996,7 @@ def openai_speech():
958996
instructions=instructions,
959997
max_length=max_length,
960998
preserve_words=True,
999+
speed=speed,
9611000
)
9621001

9631002
if not responses:
@@ -1012,6 +1051,11 @@ def openai_speech():
10121051
"X-Effective-Format": effective_format.value,
10131052
}
10141053

1054+
# Add speed metadata if available (from first response)
1055+
if responses and responses[0].metadata and "requested_speed" in responses[0].metadata:
1056+
headers["X-Requested-Speed"] = str(responses[0].metadata["requested_speed"])
1057+
headers["X-Speed-Applied"] = str(responses[0].metadata.get("speed_applied", False)).lower()
1058+
10151059
return Response(
10161060
stream_with_context(_chunk_bytes(combined_audio)),
10171061
mimetype=content_type,
@@ -1049,6 +1093,7 @@ def openai_speech():
10491093
instructions=instructions,
10501094
max_length=max_length,
10511095
validate_length=True,
1096+
speed=speed,
10521097
)
10531098

10541099
headers = {
@@ -1062,6 +1107,11 @@ def openai_speech():
10621107
"X-Effective-Format": effective_format.value,
10631108
}
10641109

1110+
# Add speed metadata if available
1111+
if response.metadata and "requested_speed" in response.metadata:
1112+
headers["X-Requested-Speed"] = str(response.metadata["requested_speed"])
1113+
headers["X-Speed-Applied"] = str(response.metadata.get("speed_applied", False)).lower()
1114+
10651115
return Response(
10661116
stream_with_context(_chunk_bytes(response.audio_data)),
10671117
mimetype=response.content_type,
@@ -1111,6 +1161,21 @@ def openai_speech():
11111161
503,
11121162
)
11131163

1164+
except AudioProcessingException as e:
1165+
logger.error(f"OpenAI API audio processing error: {e}")
1166+
return (
1167+
jsonify(
1168+
{
1169+
"error": {
1170+
"message": str(e.message),
1171+
"type": "audio_processing_error",
1172+
"code": "audio_processing_error",
1173+
}
1174+
}
1175+
),
1176+
400,
1177+
)
1178+
11141179
except Exception as e:
11151180
logger.error(f"OpenAI API unexpected error: {e}")
11161181
return (

ttsfm-web/static/js/playground-enhanced-fixed.js

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ const PlaygroundApp = (() => {
6565
els.validateLengthCheck = document.getElementById('validate-length-check');
6666
els.autoCombineCheck = document.getElementById('auto-combine-check');
6767
els.autoCombineStatus = document.getElementById('auto-combine-status');
68+
els.speedInput = document.getElementById('speed-input');
69+
els.speedValueDisplay = document.getElementById('speed-value-display');
6870
els.charCount = document.getElementById('char-count');
6971
els.lengthStatus = document.getElementById('length-status');
7072
els.generateBtn = document.getElementById('generate-btn');
@@ -82,6 +84,8 @@ const PlaygroundApp = (() => {
8284
els.audioSize = document.getElementById('audio-size');
8385
els.audioVoice = document.getElementById('audio-voice');
8486
els.audioFormat = document.getElementById('audio-format');
87+
els.audioSpeed = document.getElementById('audio-speed');
88+
els.audioChunks = document.getElementById('audio-chunks');
8589
els.apiKeyToggle = document.getElementById('toggle-api-key-visibility');
8690
}
8791

@@ -107,6 +111,9 @@ const PlaygroundApp = (() => {
107111
if (els.formatSelect) {
108112
els.formatSelect.addEventListener('change', () => updateAudioSummary());
109113
}
114+
if (els.speedInput) {
115+
els.speedInput.addEventListener('input', updateSpeedDisplay);
116+
}
110117
}
111118
function attachUtilityEvents() {
112119
if (els.clearBtn) {
@@ -307,6 +314,7 @@ const PlaygroundApp = (() => {
307314
const format = els.formatSelect?.value || state.format;
308315
const instructions = (els.instructionsInput?.value || '').trim();
309316
const apiKey = (els.apiKeyInput?.value || '').trim();
317+
const speed = els.speedInput?.value ? parseFloat(els.speedInput.value) : 1.0;
310318

311319
if (!text) {
312320
showError('Please enter some text to convert.');
@@ -327,6 +335,9 @@ const PlaygroundApp = (() => {
327335
if (instructions) {
328336
body.instructions = instructions;
329337
}
338+
if (speed !== 1.0) {
339+
body.speed = speed;
340+
}
330341

331342
const response = await fetch('/api/generate', {
332343
method: 'POST',
@@ -359,7 +370,9 @@ const PlaygroundApp = (() => {
359370
textLength: text.length,
360371
instructions,
361372
streaming: false,
362-
sizeBytes: blob.size
373+
sizeBytes: blob.size,
374+
speed: speed !== 1.0 ? speed : null,
375+
chunks: response.headers.get('X-Chunks-Combined') || null
363376
});
364377

365378
showResults(blob, meta);
@@ -402,6 +415,7 @@ const PlaygroundApp = (() => {
402415
const voice = els.voiceSelect?.value || 'alloy';
403416
const format = els.formatSelect?.value || state.format;
404417
const instructions = (els.instructionsInput?.value || '').trim();
418+
const speed = els.speedInput?.value ? parseFloat(els.speedInput.value) : 1.0;
405419

406420
if (!text) {
407421
showError('Please enter some text to convert.');
@@ -421,10 +435,16 @@ const PlaygroundApp = (() => {
421435

422436
const startTime = performance.now();
423437
try {
424-
await state.wsClient.generateSpeech(text, {
438+
const options = {
425439
voice,
426440
format,
427441
chunkSize: 512,
442+
};
443+
if (speed !== 1.0) {
444+
options.speed = speed;
445+
}
446+
await state.wsClient.generateSpeech(text, {
447+
...options,
428448
onStart: (data) => {
429449
state.activeStreamId = data.request_id;
430450
},
@@ -464,7 +484,8 @@ const PlaygroundApp = (() => {
464484
streaming: true,
465485
sizeBytes: result.audioData.byteLength,
466486
chunks: result.chunks.length,
467-
elapsedMs: performance.now() - startTime
487+
elapsedMs: performance.now() - startTime,
488+
speed: speed !== 1.0 ? speed : null
468489
});
469490

470491
showResults(blob, meta);
@@ -656,6 +677,14 @@ const PlaygroundApp = (() => {
656677
updateLengthStatus(current, max);
657678
}
658679

680+
function updateSpeedDisplay() {
681+
if (!els.speedInput || !els.speedValueDisplay) {
682+
return;
683+
}
684+
const speed = parseFloat(els.speedInput.value);
685+
els.speedValueDisplay.textContent = `${speed.toFixed(2)}x`;
686+
}
687+
659688
function updateLengthStatus(current, max) {
660689
if (!els.lengthStatus) {
661690
return;
@@ -845,6 +874,12 @@ const PlaygroundApp = (() => {
845874
if (els.audioFormat) {
846875
els.audioFormat.textContent = '--';
847876
}
877+
if (els.audioSpeed) {
878+
els.audioSpeed.textContent = '--';
879+
}
880+
if (els.audioChunks) {
881+
els.audioChunks.textContent = '--';
882+
}
848883
return;
849884
}
850885

@@ -861,6 +896,12 @@ const PlaygroundApp = (() => {
861896
const label = meta.formatLabel || meta.format || state.format;
862897
els.audioFormat.textContent = label ? label.toString().toUpperCase() : '--';
863898
}
899+
if (els.audioSpeed) {
900+
els.audioSpeed.textContent = meta.speed ? `${meta.speed}x` : '1.0x';
901+
}
902+
if (els.audioChunks) {
903+
els.audioChunks.textContent = meta.chunks || '1';
904+
}
864905
if (els.audioInfo) {
865906
const parts = [];
866907
if (meta.generatedAt) {

ttsfm-web/templates/base.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
<a class="navbar-brand" href="{{ url_for('index') }}">
8989
<i class="fas fa-microphone-alt me-2"></i>
9090
<span class="fw-bold">TTSFM</span>
91-
<span class="badge bg-primary ms-2 small">v3.3.7</span>
91+
<span class="badge bg-primary ms-2 small">v3.4.0-alpha3</span>
9292
</a>
9393

9494
<button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
159159
<div class="d-flex align-items-center">
160160
<i class="fas fa-microphone-alt me-2 text-primary"></i>
161161
<strong class="text-dark">TTSFM</strong>
162-
<span class="ms-2 text-muted">v3.3.7</span>
162+
<span class="ms-2 text-muted">v3.4.0-alpha3</span>
163163
</div>
164164
</div>
165165
<div class="col-md-6 text-md-end">

0 commit comments

Comments
 (0)