Add speed control to web API and UI

dbccccccc · dbccccccc · commit 5bfb1e728493 · 2025-10-26T16:02:46.000+08:00
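Introduces support for a `speed` parameter in the web API (`/v1/audio/speech`) and the playground UI, allowing users to adjust audio playback speed from 0.25x to 4.0x. The API now validates the parameter (returning HTTP 400 with code `invalid_speed` for non-numeric or out-of-range values), passes it to the TTS client, and reports speed metadata in the `X-Requested-Speed` and `X-Speed-Applied` response headers. Audio processing failures are now raised as a dedicated `AudioProcessingException`, which the endpoint returns as an HTTP 400 `audio_processing_error`. The playground UI includes a speed slider and displays speed and chunk-count stats. Also updates version references to 3.4.0-alpha3 and improves error handling in audio chunk combination.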
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,32 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.4.0-alpha3] - 2025-10-26
+
+### Fixed
+- **Critical bug fix**: The speed parameter was not being extracted from API requests in the web app
+  - Web API endpoint `/v1/audio/speech` now correctly extracts the `speed` parameter and passes it to the TTSFM client
+  - Added proper validation for the speed parameter (must be between 0.25 and 4.0)
+  - Speed adjustment now works correctly for both single-chunk and long-text generation
+
+### Changed
+- **Separated Docker build workflows**: Split monolithic workflow into two independent files
+  - `.github/workflows/docker-build-full.yml` - Builds full variant with ffmpeg
+  - `.github/workflows/docker-build-slim.yml` - Builds slim variant without ffmpeg
+  - Improved clarity, debugging, and parallel execution
+  - Independent cache scopes for each variant
+
+### Added
+- Speed metadata headers in API responses:
+  - `X-Requested-Speed`: The speed value requested by the client
+  - `X-Speed-Applied`: Whether speed adjustment was actually applied (true/false)
+
+## [3.4.0-alpha2] - 2025-10-25
+
+### Changed
+- Improved Docker workflow configuration for dual image variants
+- Enhanced documentation for Docker image variants
+
 ## [3.4.0-alpha1] - 2025-10-23
 
 ### Added
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
 
-fallback_version = "3.4.0-alpha1"
+fallback_version = "3.4.0-alpha3"
 [tool.setuptools]
 packages = ["ttsfm"]
 
diff --git a/tests/test_audio_processing.py b/tests/test_audio_processing.py
@@ -94,9 +94,10 @@ def test_combine_mp3_without_ffmpeg(self, monkeypatch):
         monkeypatch.setattr(ttsfm.audio, "FFMPEG_AVAILABLE", False)
 
         from ttsfm.audio import combine_audio_chunks
+        from ttsfm.exceptions import AudioProcessingException
 
         chunks = [b"chunk1", b"chunk2"]
-        with pytest.raises(RuntimeError, match="MP3 audio requires pydub and ffmpeg"):
+        with pytest.raises(AudioProcessingException, match="MP3 audio requires pydub and ffmpeg"):
             combine_audio_chunks(chunks, format_type="mp3")
 
     def test_combine_wav_without_ffmpeg(self, monkeypatch):
diff --git a/ttsfm-web/app.py b/ttsfm-web/app.py
@@ -44,7 +44,12 @@
 try:
     from ttsfm import AudioFormat, TTSClient, TTSException, Voice
     from ttsfm.audio import combine_audio_chunks
-    from ttsfm.exceptions import APIException, NetworkException, ValidationException
+    from ttsfm.exceptions import (
+        APIException,
+        AudioProcessingException,
+        NetworkException,
+        ValidationException,
+    )
     from ttsfm.models import get_supported_format
     from ttsfm.utils import split_text_by_length
 except ImportError:
@@ -768,7 +773,7 @@ def get_status():
             {
                 "status": "online",
                 "tts_service": "openai.fm (free)",
-                "package_version": "3.3.7",
+                "package_version": "3.4.0a3",
                 "timestamp": datetime.now().isoformat(),
             }
         )
@@ -792,7 +797,7 @@ def get_status():
 def health_check():
     """Simple health check endpoint."""
     return jsonify(
-        {"status": "healthy", "package_version": "3.3.7", "timestamp": datetime.now().isoformat()}
+        {"status": "healthy", "package_version": "3.4.0a3", "timestamp": datetime.now().isoformat()}
     )
 
 
@@ -866,6 +871,7 @@ def openai_speech():
         voice = data.get("voice", "alloy")
         response_format = data.get("response_format", "mp3")
         instructions = data.get("instructions", "").strip() or None
+        speed = data.get("speed")  # Optional: 0.25 to 4.0
 
         # TTSFM-specific parameters
         # New parameter: auto-combine long text (default: True)
@@ -927,16 +933,48 @@ def openai_speech():
                 400,
             )
 
+        # Validate speed parameter if provided
+        if speed is not None:
+            try:
+                speed = float(speed)
+                if not 0.25 <= speed <= 4.0:
+                    return (
+                        jsonify(
+                            {
+                                "error": {
+                                    "message": "Speed must be between 0.25 and 4.0",
+                                    "type": "invalid_request_error",
+                                    "code": "invalid_speed",
+                                }
+                            }
+                        ),
+                        400,
+                    )
+            except (ValueError, TypeError):
+                return (
+                    jsonify(
+                        {
+                            "error": {
+                                "message": "Speed must be a number",
+                                "type": "invalid_request_error",
+                                "code": "invalid_speed",
+                            }
+                        }
+                    ),
+                    400,
+                )
+
         effective_format = get_supported_format(format_enum)
 
         logger.info(
             "OpenAI API: Generating speech: text='%s...', voice=%s, "
-            "requested_format=%s (effective=%s), auto_combine=%s",
+            "requested_format=%s (effective=%s), auto_combine=%s, speed=%s",
             input_text[:50],
             voice,
             response_format,
             effective_format.value,
             auto_combine,
+            speed,
         )
 
         client = create_tts_client()
@@ -958,6 +996,7 @@ def openai_speech():
                 instructions=instructions,
                 max_length=max_length,
                 preserve_words=True,
+                speed=speed,
             )
 
             if not responses:
@@ -1012,6 +1051,11 @@ def openai_speech():
                 "X-Effective-Format": effective_format.value,
             }
 
+            # Add speed metadata if available (from first response)
+            if responses and responses[0].metadata and "requested_speed" in responses[0].metadata:
+                headers["X-Requested-Speed"] = str(responses[0].metadata["requested_speed"])
+                headers["X-Speed-Applied"] = str(responses[0].metadata.get("speed_applied", False)).lower()
+
             return Response(
                 stream_with_context(_chunk_bytes(combined_audio)),
                 mimetype=content_type,
@@ -1049,6 +1093,7 @@ def openai_speech():
                 instructions=instructions,
                 max_length=max_length,
                 validate_length=True,
+                speed=speed,
             )
 
             headers = {
@@ -1062,6 +1107,11 @@ def openai_speech():
                 "X-Effective-Format": effective_format.value,
             }
 
+            # Add speed metadata if available
+            if response.metadata and "requested_speed" in response.metadata:
+                headers["X-Requested-Speed"] = str(response.metadata["requested_speed"])
+                headers["X-Speed-Applied"] = str(response.metadata.get("speed_applied", False)).lower()
+
             return Response(
                 stream_with_context(_chunk_bytes(response.audio_data)),
                 mimetype=response.content_type,
@@ -1111,6 +1161,21 @@ def openai_speech():
             503,
         )
 
+    except AudioProcessingException as e:
+        logger.error(f"OpenAI API audio processing error: {e}")
+        return (
+            jsonify(
+                {
+                    "error": {
+                        "message": str(e.message),
+                        "type": "audio_processing_error",
+                        "code": "audio_processing_error",
+                    }
+                }
+            ),
+            400,
+        )
+
     except Exception as e:
         logger.error(f"OpenAI API unexpected error: {e}")
         return (
diff --git a/ttsfm-web/static/js/playground-enhanced-fixed.js b/ttsfm-web/static/js/playground-enhanced-fixed.js
@@ -65,6 +65,8 @@ const PlaygroundApp = (() => {
         els.validateLengthCheck = document.getElementById('validate-length-check');
         els.autoCombineCheck = document.getElementById('auto-combine-check');
         els.autoCombineStatus = document.getElementById('auto-combine-status');
+        els.speedInput = document.getElementById('speed-input');
+        els.speedValueDisplay = document.getElementById('speed-value-display');
         els.charCount = document.getElementById('char-count');
         els.lengthStatus = document.getElementById('length-status');
         els.generateBtn = document.getElementById('generate-btn');
@@ -82,6 +84,8 @@ const PlaygroundApp = (() => {
         els.audioSize = document.getElementById('audio-size');
         els.audioVoice = document.getElementById('audio-voice');
         els.audioFormat = document.getElementById('audio-format');
+        els.audioSpeed = document.getElementById('audio-speed');
+        els.audioChunks = document.getElementById('audio-chunks');
         els.apiKeyToggle = document.getElementById('toggle-api-key-visibility');
     }
 
@@ -107,6 +111,9 @@ const PlaygroundApp = (() => {
         if (els.formatSelect) {
             els.formatSelect.addEventListener('change', () => updateAudioSummary());
         }
+        if (els.speedInput) {
+            els.speedInput.addEventListener('input', updateSpeedDisplay);
+        }
     }
     function attachUtilityEvents() {
         if (els.clearBtn) {
@@ -307,6 +314,7 @@ const PlaygroundApp = (() => {
         const format = els.formatSelect?.value || state.format;
         const instructions = (els.instructionsInput?.value || '').trim();
         const apiKey = (els.apiKeyInput?.value || '').trim();
+        const speed = els.speedInput?.value ? parseFloat(els.speedInput.value) : 1.0;
 
         if (!text) {
             showError('Please enter some text to convert.');
@@ -327,6 +335,9 @@ const PlaygroundApp = (() => {
             if (instructions) {
                 body.instructions = instructions;
             }
+            if (speed !== 1.0) {
+                body.speed = speed;
+            }
 
             const response = await fetch('/api/generate', {
                 method: 'POST',
@@ -359,7 +370,9 @@ const PlaygroundApp = (() => {
                 textLength: text.length,
                 instructions,
                 streaming: false,
-                sizeBytes: blob.size
+                sizeBytes: blob.size,
+                speed: speed !== 1.0 ? speed : null,
+                chunks: response.headers.get('X-Chunks-Combined') || null
             });
 
             showResults(blob, meta);
@@ -402,6 +415,7 @@ const PlaygroundApp = (() => {
         const voice = els.voiceSelect?.value || 'alloy';
         const format = els.formatSelect?.value || state.format;
         const instructions = (els.instructionsInput?.value || '').trim();
+        const speed = els.speedInput?.value ? parseFloat(els.speedInput.value) : 1.0;
 
         if (!text) {
             showError('Please enter some text to convert.');
@@ -421,10 +435,16 @@ const PlaygroundApp = (() => {
 
         const startTime = performance.now();
         try {
-            await state.wsClient.generateSpeech(text, {
+            const options = {
                 voice,
                 format,
                 chunkSize: 512,
+            };
+            if (speed !== 1.0) {
+                options.speed = speed;
+            }
+            await state.wsClient.generateSpeech(text, {
+                ...options,
                 onStart: (data) => {
                     state.activeStreamId = data.request_id;
                 },
@@ -464,7 +484,8 @@ const PlaygroundApp = (() => {
                         streaming: true,
                         sizeBytes: result.audioData.byteLength,
                         chunks: result.chunks.length,
-                        elapsedMs: performance.now() - startTime
+                        elapsedMs: performance.now() - startTime,
+                        speed: speed !== 1.0 ? speed : null
                     });
 
                     showResults(blob, meta);
@@ -656,6 +677,14 @@ const PlaygroundApp = (() => {
         updateLengthStatus(current, max);
     }
 
+    function updateSpeedDisplay() {
+        if (!els.speedInput || !els.speedValueDisplay) {
+            return;
+        }
+        const speed = parseFloat(els.speedInput.value);
+        els.speedValueDisplay.textContent = `${speed.toFixed(2)}x`;
+    }
+
     function updateLengthStatus(current, max) {
         if (!els.lengthStatus) {
             return;
@@ -845,6 +874,12 @@ const PlaygroundApp = (() => {
             if (els.audioFormat) {
                 els.audioFormat.textContent = '--';
             }
+            if (els.audioSpeed) {
+                els.audioSpeed.textContent = '--';
+            }
+            if (els.audioChunks) {
+                els.audioChunks.textContent = '--';
+            }
             return;
         }
 
@@ -861,6 +896,12 @@ const PlaygroundApp = (() => {
             const label = meta.formatLabel || meta.format || state.format;
             els.audioFormat.textContent = label ? label.toString().toUpperCase() : '--';
         }
+        if (els.audioSpeed) {
+            els.audioSpeed.textContent = meta.speed ? `${meta.speed}x` : '1.0x';
+        }
+        if (els.audioChunks) {
+            els.audioChunks.textContent = meta.chunks || '1';
+        }
         if (els.audioInfo) {
             const parts = [];
             if (meta.generatedAt) {
diff --git a/ttsfm-web/templates/base.html b/ttsfm-web/templates/base.html
@@ -88,7 +88,7 @@
             <a class="navbar-brand" href="{{ url_for('index') }}">
                 <i class="fas fa-microphone-alt me-2"></i>
                 <span class="fw-bold">TTSFM</span>
-                <span class="badge bg-primary ms-2 small">v3.3.7</span>
+                <span class="badge bg-primary ms-2 small">v3.4.0-alpha3</span>
             </a>
 
             <button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
                     <div class="d-flex align-items-center">
                         <i class="fas fa-microphone-alt me-2 text-primary"></i>
                         <strong class="text-dark">TTSFM</strong>
-                        <span class="ms-2 text-muted">v3.3.7</span>
+                        <span class="ms-2 text-muted">v3.4.0-alpha3</span>
                     </div>
                 </div>
                 <div class="col-md-6 text-md-end">
diff --git a/ttsfm-web/templates/playground.html b/ttsfm-web/templates/playground.html
diff --git a/ttsfm/audio.py b/ttsfm/audio.py