Implement real audio format conversion with ffmpeg

dbccccccc · dbccccccc · commit f6c8c5182cbb · 2025-10-28T10:25:50.000+08:00
Replaces legacy format mapping with actual conversion for OPUS, AAC, FLAC, and PCM using ffmpeg, ensuring correct MIME types and file extensions. Adds format selector to playground UI, updates client and async client to handle conversion and content-type headers, and removes get_supported_format and maps_to_wav from models. Updates version to 3.4.0-alpha4 and fixes speed display in playground.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,43 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.4.0-alpha4] - 2025-10-28
+
+### Added
+- **Format conversion with ffmpeg**: All 6 audio formats are now returned in the actual requested format, using ffmpeg where conversion is needed
+  - MP3, WAV: Direct from openai.fm (no conversion needed)
+  - OPUS, AAC, FLAC, PCM: Converted from WAV using ffmpeg
+  - Proper MIME type headers for each format (audio/opus, audio/aac, audio/flac, audio/pcm)
+  - Downloads now have correct file extensions (.opus, .aac, .flac, .pcm)
+- **Format selector in playground**: Added dropdown to select audio format in web UI
+  - Clean display showing only format names (mp3, wav, opus, aac, flac, pcm)
+  - Integrated with existing playground functionality
+
+### Fixed
+- **Content-Type headers after format conversion**: Fixed issue where converted formats returned wrong content-type
+  - Added `_get_content_type_for_format()` helper method to both sync and async clients
+  - Content-type now properly updated after ffmpeg conversion
+  - Downloads now use correct file extensions based on actual format
+- **Speed display in playground**: Fixed bug where speed always showed "1.0x" regardless of actual speed
+  - Updated `buildGenerationMeta()` to include speed and speedApplied fields
+  - Speed now correctly displayed in audio stats (0.25x, 0.5x, 1.0x, 1.5x, 2.0x, 4.0x)
+
+### Changed
+- **Removed legacy format mapping**: Eliminated header-based format "faking" in favor of real conversion
+  - Removed `get_supported_format()` and `maps_to_wav()` functions from `ttsfm/models.py`
+  - Simplified client code by ~30 lines
+  - All formats now return actual requested format, not approximations
+- **Migrated playground to OpenAI API**: Removed old `/api/generate` endpoints
+  - Playground now uses `/v1/audio/speech` endpoint exclusively
+  - Consistent API format across all interfaces
+  - Speed parameter now works correctly in playground
+
+### Technical
+- Format conversion uses `convert_audio_format()` from `audio_processing.py`
+- Async client runs ffmpeg conversion in thread pool to avoid blocking
+- Graceful fallback to original format if ffmpeg unavailable
+- All 25 tests passing with new format conversion logic
+
 ## [3.4.0-alpha3] - 2025-10-26
 
 ### Fixed
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
 
-fallback_version = "3.4.0-alpha3"
+fallback_version = "3.4.0-alpha4"
 [tool.setuptools]
 packages = ["ttsfm"]
 
diff --git a/tests/test_audio_processing.py b/tests/test_audio_processing.py
@@ -48,10 +48,7 @@ def test_adjust_audio_speed_no_change(self):
         result = adjust_audio_speed(dummy_audio, speed=1.0)
         assert result == dummy_audio
 
-    @pytest.mark.skipif(
-        not shutil.which("ffmpeg"),
-        reason="ffmpeg not available"
-    )
+    @pytest.mark.skipif(not shutil.which("ffmpeg"), reason="ffmpeg not available")
     def test_adjust_audio_speed_requires_ffmpeg(self):
         """Test that speed adjustment requires ffmpeg."""
         # This test only runs if ffmpeg is available
@@ -90,6 +87,7 @@ def test_combine_mp3_without_ffmpeg(self, monkeypatch):
         """Test that MP3 combining fails gracefully without ffmpeg."""
         # Mock both pydub and ffmpeg as unavailable
         import ttsfm.audio
+
         monkeypatch.setattr(ttsfm.audio, "AudioSegment", None)
         monkeypatch.setattr(ttsfm.audio, "FFMPEG_AVAILABLE", False)
 
@@ -104,6 +102,7 @@ def test_combine_wav_without_ffmpeg(self, monkeypatch):
         """Test that WAV combining works without ffmpeg."""
         # Mock pydub as unavailable but allow WAV concatenation
         import ttsfm.audio
+
         monkeypatch.setattr(ttsfm.audio, "AudioSegment", None)
 
         from ttsfm.audio import combine_audio_chunks
@@ -115,4 +114,3 @@ def test_combine_wav_without_ffmpeg(self, monkeypatch):
         # Should not raise error for WAV
         result = combine_audio_chunks(chunks, format_type="wav")
         assert isinstance(result, bytes)
-
diff --git a/ttsfm-web/app.py b/ttsfm-web/app.py
@@ -42,25 +42,23 @@
 
 # Import the TTSFM package
 try:
-    from ttsfm import AudioFormat, TTSClient, TTSException, Voice
+    from ttsfm import AudioFormat, TTSClient, Voice
     from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import (
         APIException,
         AudioProcessingException,
         NetworkException,
         ValidationException,
     )
-    from ttsfm.models import get_supported_format
     from ttsfm.utils import split_text_by_length
 except ImportError:
     # Fallback for development when package is not installed
     import sys
 
     sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-    from ttsfm import AudioFormat, TTSClient, TTSException, Voice
+    from ttsfm import AudioFormat, TTSClient, Voice
     from ttsfm.audio import combine_audio_chunks
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
-    from ttsfm.models import get_supported_format
     from ttsfm.utils import split_text_by_length
 
 # Load environment variables
@@ -486,10 +484,6 @@ def validate_text():
         return jsonify({"error": "Text validation failed"}), 500
 
 
-
-
-
-
 @app.route("/api/status", methods=["GET"])
 def get_status():
     """Get service status."""
@@ -503,7 +497,7 @@ def get_status():
             {
                 "status": "online",
                 "tts_service": "openai.fm (free)",
-                "package_version": "3.4.0a3",
+                "package_version": "3.4.0a4",
                 "timestamp": datetime.now().isoformat(),
             }
         )
@@ -527,7 +521,7 @@ def get_status():
 def health_check():
     """Simple health check endpoint."""
     return jsonify(
-        {"status": "healthy", "package_version": "3.4.0a3", "timestamp": datetime.now().isoformat()}
+        {"status": "healthy", "package_version": "3.4.0a4", "timestamp": datetime.now().isoformat()}
     )
 
 
@@ -694,15 +688,12 @@ def openai_speech():
                     400,
                 )
 
-        effective_format = get_supported_format(format_enum)
-
         logger.info(
             "OpenAI API: Generating speech: text='%s...', voice=%s, "
-            "requested_format=%s (effective=%s), auto_combine=%s, speed=%s",
+            "requested_format=%s, auto_combine=%s, speed=%s",
             input_text[:50],
             voice,
             response_format,
-            effective_format.value,
             auto_combine,
             speed,
         )
@@ -715,14 +706,14 @@ def openai_speech():
             logger.info(
                 "Long text detected (%s chars); auto-combining with format %s",
                 len(input_text),
-                effective_format.value,
+                format_enum.value,
             )
 
             # Generate speech chunks
             responses = client.generate_speech_long_text(
                 text=input_text,
                 voice=voice_enum,
-                response_format=effective_format,
+                response_format=format_enum,
                 instructions=instructions,
                 max_length=max_length,
                 preserve_words=True,
@@ -778,13 +769,14 @@ def openai_speech():
                 "X-Auto-Combine": "true",
                 "X-Powered-By": "TTSFM-OpenAI-Compatible",
                 "X-Requested-Format": format_enum.value,
-                "X-Effective-Format": effective_format.value,
             }
 
             # Add speed metadata if available (from first response)
             if responses and responses[0].metadata and "requested_speed" in responses[0].metadata:
                 headers["X-Requested-Speed"] = str(responses[0].metadata["requested_speed"])
-                headers["X-Speed-Applied"] = str(responses[0].metadata.get("speed_applied", False)).lower()
+                headers["X-Speed-Applied"] = str(
+                    responses[0].metadata.get("speed_applied", False)
+                ).lower()
 
             return Response(
                 stream_with_context(_chunk_bytes(combined_audio)),
@@ -834,13 +826,14 @@ def openai_speech():
                 "X-Auto-Combine": str(auto_combine).lower(),
                 "X-Powered-By": "TTSFM-OpenAI-Compatible",
                 "X-Requested-Format": format_enum.value,
-                "X-Effective-Format": effective_format.value,
             }
 
             # Add speed metadata if available
             if response.metadata and "requested_speed" in response.metadata:
                 headers["X-Requested-Speed"] = str(response.metadata["requested_speed"])
-                headers["X-Speed-Applied"] = str(response.metadata.get("speed_applied", False)).lower()
+                headers["X-Speed-Applied"] = str(
+                    response.metadata.get("speed_applied", False)
+                ).lower()
 
             return Response(
                 stream_with_context(_chunk_bytes(response.audio_data)),
diff --git a/ttsfm-web/static/js/playground-enhanced-fixed.js b/ttsfm-web/static/js/playground-enhanced-fixed.js
@@ -652,10 +652,17 @@ const PlaygroundApp = (() => {
 
     async function loadFormats({ refresh = false } = {}) {
         try {
-            const data = await fetchFormats({ refresh });
             if (!els.formatSelect) {
                 return;
             }
+
+            // If format select already has options (from HTML), don't reload
+            if (els.formatSelect.options.length > 1 && !refresh) {
+                updateAudioSummary();
+                return;
+            }
+
+            const data = await fetchFormats({ refresh });
             els.formatSelect.innerHTML = '';
             data.formats.forEach((format) => {
                 const option = document.createElement('option');
diff --git a/ttsfm-web/templates/base.html b/ttsfm-web/templates/base.html
@@ -88,7 +88,7 @@
             <a class="navbar-brand" href="{{ url_for('index') }}">
                 <i class="fas fa-microphone-alt me-2"></i>
                 <span class="fw-bold">TTSFM</span>
-                <span class="badge bg-primary ms-2 small">v3.4.0-alpha3</span>
+                <span class="badge bg-primary ms-2 small">v3.4.0-alpha4</span>
             </a>
 
             <button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
                     <div class="d-flex align-items-center">
                         <i class="fas fa-microphone-alt me-2 text-primary"></i>
                         <strong class="text-dark">TTSFM</strong>
-                        <span class="ms-2 text-muted">v3.4.0-alpha3</span>
+                        <span class="ms-2 text-muted">v3.4.0-alpha4</span>
                     </div>
                 </div>
                 <div class="col-md-6 text-md-end">
diff --git a/ttsfm-web/templates/playground.html b/ttsfm-web/templates/playground.html
@@ -95,7 +95,7 @@ <h4 class="mb-0 d-flex align-items-center">
 
                         <div class="row">
                             <!-- Enhanced Voice Selection -->
-                            <div class="col-md-12 mb-4">
+                            <div class="col-md-6 mb-4">
                                 <label for="voice-select" class="form-label fw-bold d-flex align-items-center">
                                     <i class="fas fa-microphone me-2 text-primary"></i>
                                     {{ _('playground.voice_label') }}
@@ -107,6 +107,26 @@ <h4 class="mb-0 d-flex align-items-center">
                                     <span>{{ _('common.choose_voice') }}</span>
                                 </div>
                             </div>
+
+                            <!-- Format Selection -->
+                            <div class="col-md-6 mb-4">
+                                <label for="format-select" class="form-label fw-bold d-flex align-items-center">
+                                    <i class="fas fa-file-audio me-2 text-primary"></i>
+                                    {{ _('playground.format_label') if _('playground.format_label') != 'playground.format_label' else 'Audio Format' }}
+                                </label>
+                                <select class="form-select shadow-sm" id="format-select" required>
+                                    <option value="mp3" selected>mp3</option>
+                                    <option value="wav">wav</option>
+                                    <option value="opus">opus</option>
+                                    <option value="aac">aac</option>
+                                    <option value="flac">flac</option>
+                                    <option value="pcm">pcm</option>
+                                </select>
+                                <div class="form-text">
+                                    <i class="fas fa-info-circle me-1"></i>
+                                    {{ _('playground.format_description') if _('playground.format_description') != 'playground.format_description' else 'Choose audio output format. Converted formats require ffmpeg.' }}
+                                </div>
+                            </div>
                         </div>
 
                         <!-- Advanced Options -->
diff --git a/ttsfm/__init__.py b/ttsfm/__init__.py
@@ -62,7 +62,7 @@
 )
 from .utils import split_text_by_length, validate_text_length
 
-__version__ = "3.3.7"
+__version__ = "3.4.0-alpha4"
 __author__ = "dbcccc"
 __email__ = "120614547+dbccccccc@users.noreply.github.com"
 __description__ = "Text-to-Speech API Client with OpenAI compatibility"
diff --git a/ttsfm/async_client.py b/ttsfm/async_client.py
diff --git a/ttsfm/audio.py b/ttsfm/audio.py
diff --git a/ttsfm/client.py b/ttsfm/client.py
diff --git a/ttsfm/models.py b/ttsfm/models.py

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@`
`62`	`62`	`)`
`63`	`63`	`from .utils import split_text_by_length, validate_text_length`
`64`	`64`
`65`		`-__version__ = "3.3.7"`
	`65`	`+__version__ = "3.4.0-alpha4"`
`66`	`66`	`__author__ = "dbcccc"`
`67`	`67`	`__email__ = "120614547+dbccccccc@users.noreply.github.com"`
`68`	`68`	`__description__ = "Text-to-Speech API Client with OpenAI compatibility"`