FlyingFathead · FlyingFathead · Oct 9, 2024 · Oct 9, 2024 · Oct 9, 2024 · Oct 9, 2024
diff --git a/.catgitignore b/.catgitignore
@@ -2,3 +2,4 @@
 # https://github.com/FlyingFathead/catgit
 README.md
 tests/diarize_with_whisper-test.py
+utils/resemblyzer_safety_check.py
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:slim-bookworm
+FROM python:3.12-slim
 
 # Install dependencies & clean up after to reduce Docker file size
 RUN apt-get update && apt-get install -y \
@@ -13,6 +13,9 @@ WORKDIR /app
 # Copy the requirements file first to leverage Docker cache
 COPY requirements.txt .
 
+# Upgrade pip and setuptools
+RUN pip install --upgrade pip setuptools wheel
+
 # Install Python dependencies
 RUN pip3 install --no-cache-dir -r requirements.txt
 

diff --git a/README.md b/README.md
@@ -181,7 +181,7 @@ Replace `'YourTelegramBotToken'` with your actual Telegram bot token. This comma
 
 ## Usage
 
-After launching the bot, you can interact with it via Telegram (message `@whatever_your_bot_name_is_Bot`):
+After launching your bot successfully, you can interact with it via Telegram (send a message to `@your_bot_name_Bot`, or whatever your bot name is):
 
 1. Send a video URL (for `yt-dlp` to download), a voice message or an audio file (i.e. `.wav` or `.mp3` format) to the bot.
 2. The bot will acknowledge the request and begin processing, notifying the user of the process.
@@ -195,6 +195,14 @@ After launching the bot, you can interact with it via Telegram (message `@whatev
 - `/language` - set the model's transcription language (`auto` =  autodetect); if you know the language spoken in the audio, setting the transcription language manually with this command may improve both transcription speed and accuracy.
 
 ## Changes
+- v0.1707 - New `config.ini` option: add sites that require full video download
+   - some media sites don't work well with `yt-dlp`'s audio-only download method
+   - there are now two new options in `config.ini` under `[YTDLPSettings]`:
+   - `download_original_video_for_domains_active = true` (default)
+   - `download_original_video_domains = site1.com, site2.com, site3.com`
+   - at the moment it's used for media platforms that have had reported issues during testing
+   - when active, the comma-separated domain list is checked to determine which media sites must have their content downloaded as the original video instead of audio-only
+   - _(the tradeoff is download size and therefore speed; the audio-only method is usually the fastest and should be preferred for most popular sites, so only add problematic sites to the original-video list)_
 - v0.1706 - Disable asking for token if running inside Docker
    - by default, the app will ask for the token if it's not found, unless Dockerized
    - can be better for headless use case scenarios where you need the error message rather than a prompt for the bot token

diff --git a/config/config.ini b/config/config.ini
@@ -61,5 +61,27 @@ cooldown_seconds = 10
 max_requests_per_minute = 5
 
 [YTDLPSettings]
+# use your own `cookies.txt` (true/false)
+# this is sometimes required for sites that require login
+# or, in some cases, with sites like YouTube that don't like downloaders.
 use_cookies = False
-cookies_file = config/cookies.txt
+cookies_file = config/cookies.txt
+# some media sites don't always work well with yt-dlp's audio download feature
+# for compatibility, it's recommended to enable the flag below (true)
+download_original_video_for_domains_active = true
+# list your sites below to download original videos from, comma separated.
+# example:
+# download_original_video_domains = site1.com, site2.com, site3.com
+# sites listed here will have their original video downloaded instead of audio only
+download_original_video_domains = rumble.com
+# use worst video quality (true/false)
+# this is usually recommended, because we will only need the _audio_ for transcription.
+# downloading a high-quality video would massively increase the file size.
+# however, in some cases you might want to turn this off
+use_worst_video_quality = true
+
+[VideoDescriptionSettings]
+# Set to True to use only a snippet of the video description
+use_snippet_for_description = False
+# Maximum number of lines to include in the description snippet
+description_max_lines = 30
diff --git a/src/config_loader.py b/src/config_loader.py
@@ -45,6 +45,20 @@ def get_notification_settings(cls):
             'completion_message': completion_message
         }
 
+    # NEW: Method to get yt-dlp domain settings
+    @classmethod
+    def get_ytdlp_domain_settings(cls):
+        """Return {'active': bool, 'domains': list[str]} read from the [YTDLPSettings] config section."""
+        config = cls.get_config()
+        active = config.getboolean('YTDLPSettings', 'download_original_video_for_domains_active', fallback=False)
+        domains = config.get('YTDLPSettings', 'download_original_video_domains', fallback='')
+        # Normalize entries the same way download_audio normalizes the URL host: trimmed,
+        # lowercased and without a leading 'www.', so that 'www.site.com' can still match.
+        entries = (entry.strip().lower() for entry in domains.split(','))
+        stripped = (entry[4:] if entry.startswith('www.') else entry for entry in entries)
+        domain_list = [entry for entry in stripped if entry]
+        return {'active': active, 'domains': domain_list}
+
 # Usage example:
 # from config_loader import ConfigLoader
 # notification_settings = ConfigLoader.get_notification_settings()
diff --git a/src/main.py b/src/main.py
@@ -3,7 +3,7 @@
 # openai-whisper transcriber-bot for Telegram
 
 # version of this program
-version_number = "0.1706"
+version_number = "0.1707"
 
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # https://github.com/FlyingFathead/whisper-transcriber-telegram-bot/

diff --git a/src/transcription_handler.py b/src/transcription_handler.py
@@ -33,17 +33,10 @@
 from config_loader import ConfigLoader
 config = ConfigLoader.get_config()
 
-# # Load config
-# config = configparser.ConfigParser()
-# config.read('config/config.ini')
-# send_as_files = config.getboolean('TranscriptionSettings', 'sendasfiles', fallback=True)
-# send_as_messages = config.getboolean('TranscriptionSettings', 'sendasmessages', fallback=False)
-
 # Toggle this to use the full description or a snippet.
-USE_SNIPPET_FOR_DESCRIPTION = False
-
+USE_SNIPPET_FOR_DESCRIPTION = config.getboolean('VideoDescriptionSettings', 'use_snippet_for_description', fallback=False)
 # If we're using a snippet of the description, maximum number of lines to include
-DESCRIPTION_MAX_LINES = 30
+DESCRIPTION_MAX_LINES = config.getint('VideoDescriptionSettings', 'description_max_lines', fallback=30)
 
 # Output directory for transcriptions; create if doesn't exist
 output_dir = "transcriptions"
@@ -178,82 +171,134 @@ def get_transcription_settings():
             'send_as_messages': False,
         }
 
-# # (old) get transcription settings
-# def get_transcription_settings():
-#     config = configparser.ConfigParser()
-#     config_path = os.path.join(base_dir, 'config', 'config.ini')
-
-#     if not os.path.exists(config_path):
-#         logger.error("Error: config.ini not found at the expected path.")
-#         sys.exit(1)
-
-#     config.read(config_path)
-
-#     if 'TranscriptionSettings' not in config:
-#         logger.error("TranscriptionSettings section missing in config.ini")
-#         sys.exit(1)
-
-#     include_header = config.getboolean('TranscriptionSettings', 'IncludeHeaderInTranscription', fallback=False)
-#     keep_audio_files = config.getboolean('TranscriptionSettings', 'KeepAudioFiles', fallback=False)
-
-#     logger.info(f"Transcription settings loaded: include_header={include_header}, keep_audio_files={keep_audio_files}")
-
-#     return {
-#         'include_header': include_header,
-#         'keep_audio_files': keep_audio_files
-#     }
-
 # split long messages
 def split_message(message, max_length=4096):
     return [message[i:i+max_length] for i in range(0, len(message), max_length)]
 
-# audio download
-async def download_audio(url, output_path):
-    logger.info(f"Attempting to download audio from: {url}")
-
-    # Read settings from configuration
+# // audio download (new method)
+async def download_audio(url, audio_path):
+    config = ConfigLoader.get_config()
+    ytdlp_settings = ConfigLoader.get_ytdlp_domain_settings()
     use_cookies = config.getboolean('YTDLPSettings', 'use_cookies', fallback=False)
     cookies_file = config.get('YTDLPSettings', 'cookies_file', fallback='config/cookies.txt')
+    use_worst_video_quality = config.getboolean('YTDLPSettings', 'use_worst_video_quality', fallback=True)
+
+    parsed_url = urlparse(url)
+    domain = parsed_url.netloc.lower()
+    if domain.startswith('www.'):
+        domain = domain[4:]  # Remove 'www.'
+
+    should_download_video = ytdlp_settings['active'] and domain in ytdlp_settings['domains']
+
+    if should_download_video:
+        logger.info("Identified domain requiring full video download.")
+        # Step 1: Get available formats in JSON
+        command = [
+            "yt-dlp",
+            "--no-warnings",
+            "--dump-json",
+            url
+        ]
+        if use_cookies and os.path.exists(cookies_file):
+            command.extend(["--cookies", cookies_file])
 
-    # Specify a cache directory that yt-dlp can write to
-    cache_dir = ".cache"
-
-    # Check if the cache directory exists, create it if it doesn't
-    if not os.path.exists(cache_dir):
-        try:
-            os.makedirs(cache_dir)
-            logger.info(f"Created cache directory: {cache_dir}")
-        except Exception as e:
-            logger.error(f"Failed to create cache directory {cache_dir}: {e}")
-
-    command = [
-        "yt-dlp",
-        "--extract-audio",
-        "--audio-format", "mp3",
-        "--cache-dir", cache_dir,  # Specify the custom cache directory
-    ]
+        process = await asyncio.create_subprocess_exec(
+            *command,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE
+        )
 
-    if use_cookies:
-        if os.path.exists(cookies_file):
-            command.extend(["--cookies", cookies_file])
-            logger.info(f"Using cookies file: {cookies_file}")
+        stdout_data, stderr_data = await process.communicate()
+        if process.returncode != 0:
+            stderr_output = stderr_data.decode()
+            logger.error(f"Failed to get video formats: {stderr_output}")
+            raise Exception(f"Failed to get video formats: {stderr_output}")
+
+        # Step 2: Parse JSON to find the appropriate format
+        video_info = json.loads(stdout_data.decode())
+        formats = video_info.get('formats', [])
+
+        if not formats:
+            raise Exception("No formats found for the video.")
+
+        # Filter out formats without audio
+        video_formats = [
+            fmt for fmt in formats
+            if fmt.get('vcodec') != 'none' and fmt.get('acodec') != 'none' and fmt.get('acodec') != 'video only'
+        ]
+
+        if not video_formats:
+            raise Exception("No suitable video formats with audio available.")
+
+        if use_worst_video_quality:
+            # Sort video formats by resolution (width x height) or bitrate
+            def get_format_sort_key(fmt):
+                width = fmt.get('width') or 0
+                height = fmt.get('height') or 0
+                total_pixels = width * height
+                tbr = fmt.get('tbr') or 0
+                return (total_pixels, tbr)
+
+            selected_format = min(video_formats, key=get_format_sort_key)
+            logger.info("Selected worst quality video format.")
         else:
-            logger.error(f"Cookies file {cookies_file} does not exist.")
-            raise Exception(f"Cookies file {cookies_file} does not exist.")
+            # Select best quality video format
+            def get_format_sort_key(fmt):
+                # Ascending (pixels, bitrate) key so the max() call below selects the best format;
+                # the previous negated key made max() return the lowest-quality format instead.
+                width = fmt.get('width') or 0
+                height = fmt.get('height') or 0
+                return (width * height, fmt.get('tbr') or 0)
+
+            selected_format = max(video_formats, key=get_format_sort_key)
+            logger.info("Selected best quality video format.")
+
+        selected_format_id = selected_format.get('format_id')
+
+        if not selected_format_id:
+            raise Exception("Could not determine selected format ID.")
+
+        logger.info(f"Selected format ID: {selected_format_id}")
+
+        # Step 3: Download video using the selected format
+        base_output_path = audio_path.replace('.mp3', '')  # e.g., audio/12345_1618033988
+        video_output_template = f"{base_output_path}.%(ext)s"  # e.g., audio/12345_1618033988.mp4
+
+        command = [
+            "yt-dlp",
+            # "--verbose", # uncomment to set verbose
+            "--format", selected_format_id,
+            "--output", video_output_template,
+            url
+        ]
+        if use_cookies and os.path.exists(cookies_file):
+            command.extend(["--cookies", cookies_file])
 
-    command.extend([url, "-o", output_path])
+        logger.info("Downloading the selected quality video with audio...")
+    else:
+        # Download audio-only as mp3
+        command = [
+            "yt-dlp",
+            "--extract-audio",
+            "--audio-format", "mp3",
+            "--output", audio_path,
+            url
+        ]
+        if use_cookies and os.path.exists(cookies_file):
+            command.extend(["--cookies", cookies_file])
+        logger.info("Downloading audio-only...")
 
     # Start the subprocess
     process = await asyncio.create_subprocess_exec(
-        *command, 
-        stdout=asyncio.subprocess.PIPE, 
+        *command,
+        stdout=asyncio.subprocess.PIPE,
         stderr=asyncio.subprocess.PIPE
     )
 
+    # Read and log output
     stdout_lines = []
     stderr_lines = []
 
-    # Define async functions to read from stdout and stderr
     async def read_stream(stream, lines, log_func):
         while True:
             line = await stream.readline()
@@ -264,42 +309,55 @@ async def read_stream(stream, lines, log_func):
             else:
                 break
 
-    # Read from stdout and stderr concurrently
     await asyncio.gather(
         read_stream(process.stdout, stdout_lines, logger.info),
         read_stream(process.stderr, stderr_lines, logger.error)
     )
 
-    # Wait for the process to finish
     await process.wait()
 
-    # Check the return code
     if process.returncode != 0:
         stderr_output = '\n'.join(stderr_lines)
-        # Check for specific error messages
-        if any(keyword in stderr_output for keyword in [
-            "Sign in to confirm you're not a bot",
-            "unable to extract initial player response",
-            "This video is unavailable",
-            "ERROR:"
-        ]):
-            custom_error_message = (
-                "Failed to download audio due to YouTube's anti-bot measures or video restrictions. "
-                "Possible reasons include age restrictions, region locks, or the video requiring sign-in. "
-                "Please try a different video, or if you're the administrator, consider using cookies with `yt-dlp`."
-            )
-            logger.error(f"Error: {custom_error_message}")
-            raise Exception(custom_error_message)
-        else:
-            # For other errors, raise a generic exception with stderr output
-            logger.error(f"yt-dlp failed with error:\n{stderr_output}")
-            raise Exception(f"Failed to download audio: {stderr_output}")
+        logger.error(f"yt-dlp failed with error:\n{stderr_output}")
+        raise Exception(f"Failed to download media: {stderr_output}")
+
+    if should_download_video:
+        # Step 4: Extract audio from the downloaded video
+        video_extensions = ['mp4', 'webm', 'mkv', 'avi', 'mov', 'flv', 'wmv', 'mpg', 'mpeg']
+        video_file = None
+        for ext in video_extensions:
+            potential_video = f"{base_output_path}.{ext}"
+            if os.path.exists(potential_video):
+                video_file = potential_video
+                break
+
+        if not video_file:
+            logger.error("Failed to locate the downloaded video file.")
+            raise Exception("Failed to locate the downloaded video file.")
 
-    # Verify the download success
-    if os.path.exists(output_path):
-        logger.info(f"Audio downloaded successfully: {output_path}")
+        logger.info(f"Video file downloaded: {video_file}")
+
+        try:
+            logger.info("Starting audio extraction from video file...")
+            # Use ffmpeg via pydub to extract audio
+            audio = AudioSegment.from_file(video_file)
+            logger.info("Audio file loaded, exporting to mp3...")
+            audio.export(audio_path, format="mp3")
+            logger.info(f"Audio extracted and saved to: {audio_path}")
+        except Exception as e:
+            logger.error(f"Failed to extract audio from video: {e}")
+            raise Exception(f"Failed to extract audio from video: {e}")
+
+        try:
+            logger.info(f"Removing temporary video file: {video_file}")
+            os.remove(video_file)
+            logger.info(f"Temporary video file {video_file} removed.")
+        except Exception as e:
+            logger.warning(f"Failed to remove temporary video file {video_file}: {e}")
     else:
-        raise Exception(f"Failed to download audio: {output_path}")
+        if not os.path.exists(audio_path):
+            raise Exception(f"Failed to download audio: {audio_path}")
+        logger.info(f"Audio downloaded successfully: {audio_path}")
 
 # Read from stream line by line until EOF, call callback on each line.
 async def read_stream(stream, callback):