Add system capabilities detection and slim image support

dbccccccc · dbccccccc · commit dd59d6b27123 · 2025-10-28T12:03:19.000+08:00
Introduces a new capabilities module for runtime detection of ffmpeg and image variant, with API endpoints for feature discovery. Enhances error handling and validation for ffmpeg-dependent features, updates UI to reflect available features and image variant, and bumps version to 3.4.0-beta1. Improves user feedback for unavailable features and adds comprehensive tests for slim image scenarios.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,56 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [3.4.0-beta1] - 2025-10-28
+
+### Added
+- **Image variant detection system**: Automatic detection of full vs slim Docker images
+  - New `ttsfm/capabilities.py` module with `SystemCapabilities` class
+  - Runtime detection of ffmpeg availability using `shutil.which("ffmpeg")`
+  - Global singleton instance via `get_capabilities()` function
+- **New API endpoints for feature discovery**:
+  - `/api/capabilities` - Returns complete system capabilities report
+    - `ffmpeg_available`: Boolean indicating ffmpeg availability
+    - `image_variant`: "full" or "slim"
+    - `features`: Dictionary of available features (speed_adjustment, format_conversion, mp3_auto_combine, basic_formats)
+    - `supported_formats`: List of available audio formats
+  - Enhanced `/api/health` endpoint with `image_variant` and `ffmpeg_available` fields
+- **Early validation for ffmpeg-dependent features**:
+  - Advanced formats (OPUS, AAC, FLAC, PCM) checked before processing
+  - Speed adjustment (speed != 1.0) validated before processing
+  - MP3 auto-combine for long text validated before processing
+  - Returns 400 error with helpful hints when features unavailable
+- **Playground UI enhancements for slim image**:
+  - Automatic capabilities loading on page load
+  - Image variant badge in navbar ("Full Image" green / "Slim Image" yellow)
+  - Speed slider disabled with tooltip when ffmpeg unavailable
+  - Advanced format options disabled and marked "(requires full image)"
+  - Error messages include hints from API responses
+- **Comprehensive test scripts**:
+  - `scripts/test_slim_image.py` - Integration tests against running server
+  - `scripts/test_slim_simulation.py` - Unit tests with mocked ffmpeg unavailability
+
+### Fixed
+- **Slim image error handling**: Slim image now properly reports errors instead of failing silently
+  - Clear error messages for unavailable features
+  - Helpful hints directing users to full Docker image
+  - Proper HTTP 400 status codes with structured error responses
+- **RuntimeError exception handling**: Web API now catches ffmpeg-related errors from audio_processing module
+
+### Changed
+- **Improved error response format**: All feature unavailability errors now include:
+  - `message`: Clear description of the issue
+  - `type`: "feature_unavailable_error"
+  - `code`: "ffmpeg_required"
+  - `hint`: Helpful suggestion to use full Docker image
+  - `available_formats`: List of supported formats (when applicable)
+
+### Technical
+- Capabilities detection uses singleton pattern for efficiency
+- Early validation prevents expensive operations when features unavailable
+- Playground JavaScript loads capabilities asynchronously
+- All 25 tests passing plus new integration and simulation tests
+
 ## [3.4.0-alpha4] - 2025-10-28
 
 ### Added
diff --git a/pyproject.toml b/pyproject.toml
@@ -86,7 +86,7 @@ ttsfm = "ttsfm.cli:main"
 version_scheme = "no-guess-dev"
 local_scheme = "no-local-version"
 
-fallback_version = "3.4.0-alpha4"
+fallback_version = "3.4.0-beta1"
 [tool.setuptools]
 packages = ["ttsfm"]
 
diff --git a/ttsfm-web/app.py b/ttsfm-web/app.py
@@ -44,6 +44,7 @@
 try:
     from ttsfm import AudioFormat, TTSClient, Voice
     from ttsfm.audio import combine_audio_chunks
+    from ttsfm.capabilities import get_capabilities
     from ttsfm.exceptions import (
         APIException,
         AudioProcessingException,
@@ -58,6 +59,7 @@
     sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
     from ttsfm import AudioFormat, TTSClient, Voice
     from ttsfm.audio import combine_audio_chunks
+    from ttsfm.capabilities import get_capabilities
     from ttsfm.exceptions import APIException, NetworkException, ValidationException
     from ttsfm.utils import split_text_by_length
 
@@ -497,7 +499,7 @@ def get_status():
             {
                 "status": "online",
                 "tts_service": "openai.fm (free)",
-                "package_version": "3.4.0a4",
+                "package_version": "3.4.0b1",
                 "timestamp": datetime.now().isoformat(),
             }
         )
@@ -519,12 +521,26 @@ def get_status():
 
 @app.route("/api/health", methods=["GET"])
 def health_check():
-    """Simple health check endpoint."""
+    """Health check endpoint with capabilities info."""
+    caps = get_capabilities()
     return jsonify(
-        {"status": "healthy", "package_version": "3.4.0a4", "timestamp": datetime.now().isoformat()}
+        {
+            "status": "healthy",
+            "package_version": "3.4.0b1",
+            "image_variant": caps.get_capabilities()["image_variant"],
+            "ffmpeg_available": caps.ffmpeg_available,
+            "timestamp": datetime.now().isoformat(),
+        }
     )
 
 
+@app.route("/api/capabilities", methods=["GET"])
+def get_system_capabilities():
+    """Get system capabilities and available features."""
+    caps = get_capabilities()
+    return jsonify(caps.get_capabilities())
+
+
 @app.route("/api/websocket/status", methods=["GET"])
 def websocket_status():
     """Get WebSocket server status and active connections."""
@@ -688,6 +704,66 @@ def openai_speech():
                     400,
                 )
 
+        # Check feature availability before processing
+        caps = get_capabilities()
+
+        # Check if requested format requires ffmpeg
+        if format_enum.value in ["opus", "aac", "flac", "pcm"] and not caps.ffmpeg_available:
+            return (
+                jsonify(
+                    {
+                        "error": {
+                            "message": f"Format '{format_enum.value}' requires ffmpeg. "
+                            f"Available formats: {', '.join(caps.get_supported_formats())}",
+                            "type": "feature_unavailable_error",
+                            "code": "ffmpeg_required",
+                            "available_formats": caps.get_supported_formats(),
+                            "hint": "Use the full Docker image (dbcccc/ttsfm:latest) instead of the slim variant.",
+                        }
+                    }
+                ),
+                400,
+            )
+
+        # Check if speed adjustment requires ffmpeg
+        if speed is not None and speed != 1.0 and not caps.ffmpeg_available:
+            return (
+                jsonify(
+                    {
+                        "error": {
+                            "message": "Speed adjustment requires ffmpeg. "
+                            "Use the full Docker image (dbcccc/ttsfm:latest).",
+                            "type": "feature_unavailable_error",
+                            "code": "ffmpeg_required",
+                            "hint": "Speed adjustment is only available in the full Docker image.",
+                        }
+                    }
+                ),
+                400,
+            )
+
+        # Check if MP3 auto-combine requires ffmpeg (for long text)
+        if (
+            len(input_text) > max_length
+            and auto_combine
+            and format_enum == AudioFormat.MP3
+            and not caps.ffmpeg_available
+        ):
+            return (
+                jsonify(
+                    {
+                        "error": {
+                            "message": "MP3 auto-combine for long text requires ffmpeg. "
+                            "Use WAV format, disable auto_combine, or use the full Docker image.",
+                            "type": "feature_unavailable_error",
+                            "code": "ffmpeg_required",
+                            "hint": "MP3 auto-combine is only available in the full Docker image.",
+                        }
+                    }
+                ),
+                400,
+            )
+
         logger.info(
             "OpenAI API: Generating speech: text='%s...', voice=%s, "
             "requested_format=%s, auto_combine=%s, speed=%s",
@@ -899,6 +975,30 @@ def openai_speech():
             400,
         )
 
+    except RuntimeError as e:
+        # Catch ffmpeg-related errors from audio_processing module
+        error_msg = str(e)
+        logger.error(f"OpenAI API runtime error: {error_msg}")
+
+        # Check if it's an ffmpeg-related error
+        if "ffmpeg" in error_msg.lower():
+            return (
+                jsonify(
+                    {
+                        "error": {
+                            "message": error_msg,
+                            "type": "feature_unavailable_error",
+                            "code": "ffmpeg_required",
+                            "hint": "This feature requires the full Docker image. "
+                            "Use dbcccc/ttsfm:latest instead of the slim variant.",
+                        }
+                    }
+                ),
+                400,
+            )
+        # Re-raise if not ffmpeg-related
+        raise
+
     except Exception as e:
         logger.error(f"OpenAI API unexpected error: {e}")
         return (
diff --git a/ttsfm-web/static/js/playground-enhanced-fixed.js b/ttsfm-web/static/js/playground-enhanced-fixed.js
@@ -19,7 +19,8 @@ const PlaygroundApp = (() => {
         wsClient: null,
         streamingMode: false,
         activeStreamId: null,
-        defaultText: ''
+        defaultText: '',
+        capabilities: null  // System capabilities
     };
 
     const els = {};
@@ -38,6 +39,7 @@ const PlaygroundApp = (() => {
         initWebSocket();
 
         checkAuthStatus();
+        loadCapabilities();  // Load system capabilities first
         loadVoices();
 
         if (document.getElementById('format-select')) {
@@ -352,11 +354,19 @@ const PlaygroundApp = (() => {
 
             if (!response.ok) {
                 let message = `Error: ${response.status} ${response.statusText}`;
+                let hint = null;
                 try {
                     const errorData = await response.json();
                     if (errorData.error?.message) {
                         message = errorData.error.message;
                     }
+                    if (errorData.error?.hint) {
+                        hint = errorData.error.hint;
+                    }
+                    // Add hint to message if available
+                    if (hint) {
+                        message += `\n\n💡 ${hint}`;
+                    }
                 } catch (error) {
                     // ignore parse errors
                 }
@@ -628,6 +638,62 @@ const PlaygroundApp = (() => {
         }
     }
 
+    async function loadCapabilities() {
+        try {
+            const response = await fetch('/api/capabilities');
+            if (!response.ok) {
+                console.warn('Failed to load capabilities, assuming full image');
+                return;
+            }
+            const caps = await response.json();
+            state.capabilities = caps;
+            updateUIForCapabilities(caps);
+        } catch (error) {
+            console.error('Failed to load capabilities:', error);
+        }
+    }
+
+    function updateUIForCapabilities(caps) {
+        if (!caps) return;
+
+        // Update speed slider if ffmpeg not available
+        const speedSlider = document.getElementById('speed-slider');
+        const speedValue = document.getElementById('speed-value');
+        if (speedSlider && !caps.features.speed_adjustment) {
+            speedSlider.disabled = true;
+            speedSlider.title = 'Speed adjustment requires full Docker image';
+            if (speedValue) {
+                speedValue.insertAdjacentHTML('afterend',
+                    '<small class="text-warning ms-2">⚠️ Requires full image</small>');
+            }
+        }
+
+        // Filter format options based on availability
+        const formatSelect = document.getElementById('format-select');
+        if (formatSelect && caps.supported_formats) {
+            Array.from(formatSelect.options).forEach(option => {
+                if (!caps.supported_formats.includes(option.value)) {
+                    option.disabled = true;
+                    option.textContent += ' (requires full image)';
+                }
+            });
+        }
+
+        // Show image variant badge in navbar
+        const variant = caps.image_variant;
+        const badgeHtml = variant === 'full'
+            ? '<span class="badge bg-success ms-2">Full Image</span>'
+            : '<span class="badge bg-warning ms-2">Slim Image</span>';
+
+        const navbar = document.querySelector('.navbar-brand');
+        if (navbar && !document.querySelector('.image-variant-badge')) {
+            const badge = document.createElement('span');
+            badge.className = 'image-variant-badge';
+            badge.innerHTML = badgeHtml;
+            navbar.appendChild(badge);
+        }
+    }
+
     async function loadVoices({ refresh = false } = {}) {
         try {
             const data = await fetchVoices({ refresh });
diff --git a/ttsfm-web/templates/base.html b/ttsfm-web/templates/base.html
@@ -88,7 +88,7 @@
             <a class="navbar-brand" href="{{ url_for('index') }}">
                 <i class="fas fa-microphone-alt me-2"></i>
                 <span class="fw-bold">TTSFM</span>
-                <span class="badge bg-primary ms-2 small">v3.4.0-alpha4</span>
+                <span class="badge bg-primary ms-2 small">v3.4.0-beta1</span>
             </a>
 
             <button class="navbar-toggler border-0" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
@@ -159,7 +159,7 @@
                     <div class="d-flex align-items-center">
                         <i class="fas fa-microphone-alt me-2 text-primary"></i>
                         <strong class="text-dark">TTSFM</strong>
-                        <span class="ms-2 text-muted">v3.4.0-alpha4</span>
+                        <span class="ms-2 text-muted">v3.4.0-beta1</span>
                     </div>
                 </div>
                 <div class="col-md-6 text-md-end">
diff --git a/ttsfm/__init__.py b/ttsfm/__init__.py
@@ -62,7 +62,7 @@
 )
 from .utils import split_text_by_length, validate_text_length
 
-__version__ = "3.4.0-alpha4"
+__version__ = "3.4.0-beta1"
 __author__ = "dbcccc"
 __email__ = "120614547+dbccccccc@users.noreply.github.com"
 __description__ = "Text-to-Speech API Client with OpenAI compatibility"
diff --git a/ttsfm/capabilities.py b/ttsfm/capabilities.py

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@`
`62`	`62`	`)`
`63`	`63`	`from .utils import split_text_by_length, validate_text_length`
`64`	`64`
`65`		`-__version__ = "3.4.0-alpha4"`
	`65`	`+__version__ = "3.4.0-beta1"`
`66`	`66`	`__author__ = "dbcccc"`
`67`	`67`	`__email__ = "120614547+dbccccccc@users.noreply.github.com"`
`68`	`68`	`__description__ = "Text-to-Speech API Client with OpenAI compatibility"`