diff --git a/.catgitignore b/.catgitignore index 6a30a00..c8b8f92 100644 --- a/.catgitignore +++ b/.catgitignore @@ -2,3 +2,4 @@ # https://github.com/FlyingFathead/catgit README.md tests/diarize_with_whisper-test.py +utils/resemblyzer_safety_check.py diff --git a/Dockerfile b/Dockerfile index b114fa3..6dac21b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:slim-bookworm +FROM python:3.12-slim # Install dependencies & clean up after to reduce Docker file size RUN apt-get update && apt-get install -y \ @@ -13,6 +13,9 @@ WORKDIR /app # Copy the requirements file first to leverage Docker cache COPY requirements.txt . +# Upgrade pip and setuptools +RUN pip install --upgrade pip setuptools wheel + # Install Python dependencies RUN pip3 install --no-cache-dir -r requirements.txt diff --git a/README.md b/README.md index dc32546..461445f 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ Replace `'YourTelegramBotToken'` with your actual Telegram bot token. This comma ## Usage -After launching the bot, you can interact with it via Telegram (message `@whatever_your_bot_name_is_Bot`): +After launching your bot successfully, you can interact with it via Telegram (send a message to `@your_bot_name_Bot`, or whatever your bot name is): 1. Send a video URL (for `yt-dlp` to download), a voice message or an audio file (i.e. `.wav` or `.mp3` format) to the bot. 2. The bot will acknowledge the request and begin processing, notifying the user of the process. @@ -195,6 +195,14 @@ After launching the bot, you can interact with it via Telegram (message `@whatev - `/language` - set the model's transcription language (`auto` = autodetect); if you know the language spoken in the audio, setting the transcription language manually with this command may improve both transcription speed and accuracy. 
## Changes +- v0.1707 - New `config.ini` option: add sites that require full video download + - some media sites don't work well with `yt-dlp`'s audio-only download method + - there are now two new options in `config.ini` under `[YTDLPSettings]`: + - `download_original_video_for_domains_active = true` (default) + - `download_original_video_domains = site1.com, site2.com, site3.com` + - at the moment it's used for media platforms that have had reported issues during testing + - when active, a comma-separated list is used to check up on media sites that require their contents to be downloaded as the original video instead of audio-only + - _(the tradeoff is obviously download size and hence speed; the audio-only method is usually the fastest and should be preferred for most popular sites, hence only add problematic sites to the video-only list)_ - v0.1706 - Disable asking for token if running inside Docker - by default, the app will ask for the token if it's not found, unless Dockerized - can be better for headless use case scenarios where you need the error message rather than a prompt for the bot token diff --git a/config/config.ini b/config/config.ini index 8ca5575..afaa4c5 100644 --- a/config/config.ini +++ b/config/config.ini @@ -61,5 +61,27 @@ cooldown_seconds = 10 max_requests_per_minute = 5 [YTDLPSettings] +# use your own `cookies.txt` (true/false) +# this is sometimes required for sites that require login +# or, in some cases, with sites like YouTube that don't like downloaders. use_cookies = False -cookies_file = config/cookies.txt \ No newline at end of file +cookies_file = config/cookies.txt +# some media sites don't always work well with yt-dlp's audio download feature +# for compatibility, it's recommended to enable the flag below (true) +download_original_video_for_domains_active = true +# list your sites below to download original videos from, comma separated. 
# Method of ConfigLoader: read the yt-dlp "download original video" settings.
@classmethod
def get_ytdlp_domain_settings(cls):
    """Return the full-video-download settings from the ``[YTDLPSettings]`` section.

    Reads ``download_original_video_for_domains_active`` (boolean, defaults
    to ``False`` when absent) and ``download_original_video_domains`` (a
    comma-separated string, defaults to empty).

    Each configured domain is trimmed, lower-cased and stripped of a leading
    ``www.`` so that it compares equal to the host that ``download_audio``
    derives from a URL (which removes ``www.`` as well). Empty entries and
    duplicates are dropped; the original order is kept.

    Returns:
        dict: ``{'active': bool, 'domains': list[str]}``.
    """
    config = cls.get_config()
    active = config.getboolean(
        'YTDLPSettings',
        'download_original_video_for_domains_active',
        fallback=False,
    )
    raw_domains = config.get(
        'YTDLPSettings',
        'download_original_video_domains',
        fallback='',
    )

    domain_list = []
    for entry in raw_domains.split(','):
        domain = entry.strip().lower()
        # Keep configured names comparable with the URL host, which has its
        # "www." prefix removed before the membership check.
        if domain.startswith('www.'):
            domain = domain[4:]
        if domain and domain not in domain_list:
            domain_list.append(domain)

    return {
        'active': active,
        'domains': domain_list
    }
a/src/transcription_handler.py b/src/transcription_handler.py index c0b0885..c82ea6f 100644 --- a/src/transcription_handler.py +++ b/src/transcription_handler.py @@ -33,17 +33,10 @@ from config_loader import ConfigLoader config = ConfigLoader.get_config() -# # Load config -# config = configparser.ConfigParser() -# config.read('config/config.ini') -# send_as_files = config.getboolean('TranscriptionSettings', 'sendasfiles', fallback=True) -# send_as_messages = config.getboolean('TranscriptionSettings', 'sendasmessages', fallback=False) - # Toggle this to use the full description or a snippet. -USE_SNIPPET_FOR_DESCRIPTION = False - +USE_SNIPPET_FOR_DESCRIPTION = config.getboolean('VideoDescriptionSettings', 'use_snippet_for_description', fallback=False) # If we're using a snippet of the description, maximum number of lines to include -DESCRIPTION_MAX_LINES = 30 +DESCRIPTION_MAX_LINES = config.getint('VideoDescriptionSettings', 'description_max_lines', fallback=30) # Output directory for transcriptions; create if doesn't exist output_dir = "transcriptions" @@ -178,82 +171,134 @@ def get_transcription_settings(): 'send_as_messages': False, } -# # (old) get transcription settings -# def get_transcription_settings(): -# config = configparser.ConfigParser() -# config_path = os.path.join(base_dir, 'config', 'config.ini') - -# if not os.path.exists(config_path): -# logger.error("Error: config.ini not found at the expected path.") -# sys.exit(1) - -# config.read(config_path) - -# if 'TranscriptionSettings' not in config: -# logger.error("TranscriptionSettings section missing in config.ini") -# sys.exit(1) - -# include_header = config.getboolean('TranscriptionSettings', 'IncludeHeaderInTranscription', fallback=False) -# keep_audio_files = config.getboolean('TranscriptionSettings', 'KeepAudioFiles', fallback=False) - -# logger.info(f"Transcription settings loaded: include_header={include_header}, keep_audio_files={keep_audio_files}") - -# return { -# 'include_header': 
# split long messages
def split_message(message, max_length=4096):
    return [message[i:i+max_length] for i in range(0, len(message), max_length)]


def _strip_www(host):
    """Return *host* trimmed, lower-cased and without a leading ``www.``."""
    host = (host or "").strip().lower()
    return host[4:] if host.startswith("www.") else host


def _requires_full_video(url, domains):
    """Tell whether the host of *url* is, or is a subdomain of, a listed domain."""
    # ``hostname`` (unlike ``netloc``) excludes any port or user-info part.
    host = _strip_www(urlparse(url).hostname)
    if not host:
        return False
    for entry in domains:
        entry = _strip_www(entry)
        if entry and (host == entry or host.endswith("." + entry)):
            return True
    return False


def _cookie_args(use_cookies, cookies_file):
    """Build the yt-dlp ``--cookies`` arguments, warning when the file is missing."""
    if not use_cookies:
        return []
    if os.path.exists(cookies_file):
        return ["--cookies", cookies_file]
    logger.warning(f"Cookies file {cookies_file} does not exist; continuing without cookies.")
    return []


def _format_sort_key(fmt):
    """Rank a yt-dlp format dict by pixel count, then by total bitrate."""
    width = fmt.get('width') or 0
    height = fmt.get('height') or 0
    return (width * height, fmt.get('tbr') or 0)


def _select_video_format(formats, use_worst_video_quality):
    """Pick the lowest- or highest-ranked format that carries video and audio."""
    candidates = [
        fmt for fmt in formats
        if fmt.get('vcodec') != 'none' and fmt.get('acodec') not in ('none', 'video only')
    ]
    if not candidates:
        raise Exception("No suitable video formats with audio available.")

    # The same ascending key serves both modes: min() is the worst quality,
    # max() the best. (Negating the key under max() would select the worst.)
    if use_worst_video_quality:
        chosen = min(candidates, key=_format_sort_key)
        logger.info("Selected worst quality video format.")
    else:
        chosen = max(candidates, key=_format_sort_key)
        logger.info("Selected best quality video format.")
    return chosen


async def _probe_video_format(url, cookie_args, use_worst_video_quality):
    """Query yt-dlp for the formats of *url* and return the chosen format dict."""
    # "--" ends option parsing so a URL starting with "-" is never read as an option.
    command = ["yt-dlp", "--no-warnings", "--dump-json", *cookie_args, "--", url]
    process = await asyncio.create_subprocess_exec(
        *command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    stdout_data, stderr_data = await process.communicate()
    if process.returncode != 0:
        stderr_output = stderr_data.decode(errors="replace")
        logger.error(f"Failed to get video formats: {stderr_output}")
        raise Exception(f"Failed to get video formats: {stderr_output}")

    try:
        video_info = json.loads(stdout_data.decode(errors="replace"))
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse video format information: {e}")
        raise Exception(f"Failed to parse video format information: {e}") from e

    formats = video_info.get('formats', [])
    if not formats:
        raise Exception("No formats found for the video.")

    selected_format = _select_video_format(formats, use_worst_video_quality)
    if not selected_format.get('format_id'):
        raise Exception("Could not determine selected format ID.")
    return selected_format


async def _collect_stream(stream, lines, log_func):
    """Read *stream* line by line until EOF, storing and logging each line."""
    while True:
        raw = await stream.readline()
        if not raw:
            break
        text = raw.decode(errors="replace").rstrip()
        lines.append(text)
        log_func(text)


def _extract_mp3(video_file, audio_path):
    """Convert *video_file* to an mp3 at *audio_path* (blocking; uses pydub/ffmpeg)."""
    logger.info("Starting audio extraction from video file...")
    audio = AudioSegment.from_file(video_file)
    logger.info("Audio file loaded, exporting to mp3...")
    audio.export(audio_path, format="mp3")


# // audio download (new method)
async def download_audio(url, audio_path):
    """Download the audio track of *url* into *audio_path* as an mp3 file.

    Settings are read from the ``[YTDLPSettings]`` config section. When the
    full-video option is active and the URL's host is (a subdomain of) one of
    the configured domains, the function asks yt-dlp for the available
    formats, downloads one combined video+audio format (the worst or the best
    one, depending on ``use_worst_video_quality``), extracts the audio with
    pydub and removes the temporary video. Otherwise yt-dlp extracts an mp3
    directly.

    Args:
        url: Media URL to hand to yt-dlp.
        audio_path: Destination path of the mp3 file.

    Raises:
        Exception: if yt-dlp fails, no usable format exists, the downloaded
            file cannot be located, or audio extraction fails.
    """
    config = ConfigLoader.get_config()
    ytdlp_settings = ConfigLoader.get_ytdlp_domain_settings()
    use_cookies = config.getboolean('YTDLPSettings', 'use_cookies', fallback=False)
    cookies_file = config.get('YTDLPSettings', 'cookies_file', fallback='config/cookies.txt')
    use_worst_video_quality = config.getboolean('YTDLPSettings', 'use_worst_video_quality', fallback=True)

    cookie_args = _cookie_args(use_cookies, cookies_file)
    should_download_video = bool(
        ytdlp_settings['active'] and _requires_full_video(url, ytdlp_settings['domains'])
    )

    if should_download_video:
        logger.info("Identified domain requiring full video download.")
        selected_format = await _probe_video_format(url, cookie_args, use_worst_video_quality)
        selected_format_id = selected_format['format_id']
        logger.info(f"Selected format ID: {selected_format_id}")

        # Strip only a trailing ".mp3"; str.replace would also remove the
        # substring from directory or file names earlier in the path.
        base_output_path = audio_path[:-4] if audio_path.endswith('.mp3') else audio_path
        video_output_template = f"{base_output_path}.%(ext)s"  # e.g., audio/12345_1618033988.mp4

        command = [
            "yt-dlp",
            "--format", selected_format_id,
            "--output", video_output_template,
            *cookie_args,
            "--", url,
        ]
        logger.info("Downloading the selected quality video with audio...")
    else:
        # Download audio-only as mp3
        command = [
            "yt-dlp",
            "--extract-audio",
            "--audio-format", "mp3",
            "--output", audio_path,
            *cookie_args,
            "--", url,
        ]
        logger.info("Downloading audio-only...")

    process = await asyncio.create_subprocess_exec(
        *command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )

    # Drain both pipes concurrently so neither can fill up and stall yt-dlp.
    stdout_lines = []
    stderr_lines = []
    await asyncio.gather(
        _collect_stream(process.stdout, stdout_lines, logger.info),
        _collect_stream(process.stderr, stderr_lines, logger.error)
    )
    await process.wait()

    if process.returncode != 0:
        stderr_output = '\n'.join(stderr_lines)
        logger.error(f"yt-dlp failed with error:\n{stderr_output}")
        raise Exception(f"Failed to download media: {stderr_output}")

    if not should_download_video:
        if not os.path.exists(audio_path):
            raise Exception(f"Failed to download audio: {audio_path}")
        logger.info(f"Audio downloaded successfully: {audio_path}")
        return

    # Locate the downloaded container; try the extension yt-dlp reported first.
    video_extensions = ['mp4', 'webm', 'mkv', 'avi', 'mov', 'flv', 'wmv', 'mpg', 'mpeg']
    reported_ext = selected_format.get('ext')
    if reported_ext and reported_ext not in video_extensions:
        video_extensions.insert(0, reported_ext)

    video_file = None
    for ext in video_extensions:
        potential_video = f"{base_output_path}.{ext}"
        if os.path.exists(potential_video):
            video_file = potential_video
            break

    if not video_file:
        logger.error("Failed to locate the downloaded video file.")
        raise Exception("Failed to locate the downloaded video file.")

    logger.info(f"Video file downloaded: {video_file}")

    try:
        # pydub/ffmpeg work is blocking; run it in a worker thread so the
        # event loop keeps serving other tasks in the meantime.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _extract_mp3, video_file, audio_path)
        logger.info(f"Audio extracted and saved to: {audio_path}")
    except Exception as e:
        logger.error(f"Failed to extract audio from video: {e}")
        raise Exception(f"Failed to extract audio from video: {e}") from e
    finally:
        # Always drop the (potentially large) temporary video, even on failure.
        try:
            logger.info(f"Removing temporary video file: {video_file}")
            os.remove(video_file)
            logger.info(f"Temporary video file {video_file} removed.")
        except OSError as e:
            logger.warning(f"Failed to remove temporary video file {video_file}: {e}")