Merge pull request #21 from HenestrosaDev/refactor

HenestrosaDev · web-flow · commit 776e5a1aefd8 · 2024-06-05T18:36:15.000+02:00
Refactor
diff --git a/src/controllers/main_controller.py b/src/controllers/main_controller.py
diff --git a/src/handlers/google_api_handler.py b/src/handlers/google_api_handler.py
@@ -0,0 +1,99 @@
+import os
+import shutil
+import traceback
+
+import speech_recognition as sr
+import utils.config_manager as cm
+from models.transcription import Transcription
+from moviepy.video.io.VideoFileClip import VideoFileClip
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+from utils import constants as c
+from utils.path_helper import ROOT_PATH
+
+
+class GoogleApiHandler:
+    @staticmethod
+    async def transcribe_file(transcription: Transcription) -> str:
+        """
+        Transcribes audio from a file using the Google Speech-to-Text API.
+
+        :param transcription: An instance of Transcription containing information
+                              about the audio file.
+        :return: The transcribed text or an error message if transcription fails.
+        """
+        # Can be the transcription or an error text
+        text = ""
+
+        file_path = transcription.source_path
+
+        # Create a directory to store the audio chunks
+        chunks_directory = ROOT_PATH / "audio-chunks"
+        chunks_directory.mkdir(exist_ok=True)
+
+        try:
+            # Get file extension
+            content_type = file_path.suffix
+
+            sound = None
+            # Open the audio file using pydub
+            if content_type in c.AUDIO_FILE_EXTENSIONS:
+                sound = AudioSegment.from_file(file_path)
+
+            elif content_type in c.VIDEO_FILE_EXTENSIONS:
+                clip = VideoFileClip(str(file_path))
+                video_audio_path = chunks_directory / f"{file_path.stem}.wav"
+                clip.audio.write_audiofile(video_audio_path)
+                sound = AudioSegment.from_wav(video_audio_path)
+
+            audio_chunks = split_on_silence(
+                sound,
+                # Minimum duration of silence required to consider a segment as a split point
+                min_silence_len=500,
+                # Audio with a level -X decibels below the original audio level will be considered as silence
+                silence_thresh=sound.dBFS - 40,
+                # Adds a buffer of silence before and after each split point
+                keep_silence=100,
+            )
+
+            # Create a speech recognition object
+            r = sr.Recognizer()
+
+            # Get Google API key (if any)
+            config_google_api = cm.ConfigManager.get_config_google_api()
+            api_key = config_google_api.api_key or None
+
+            # Process each chunk
+            for idx, audio_chunk in enumerate(audio_chunks):
+                # Export audio chunk and save it in the `chunks_directory` directory.
+                chunk_filename = os.path.join(chunks_directory, f"chunk{idx}.wav")
+                audio_chunk.export(chunk_filename, bitrate="192k", format="wav")
+
+                # Recognize the chunk
+                with sr.AudioFile(chunk_filename) as source:
+                    r.adjust_for_ambient_noise(source)
+                    audio_listened = r.record(source)
+
+                    try:
+                        # Try converting it to text
+                        chunk_text = r.recognize_google(
+                            audio_listened,
+                            language=transcription.language_code,
+                            key=api_key,
+                        )
+
+                        chunk_text = f"{chunk_text.capitalize()}. "
+                        text += chunk_text
+                        print(f"chunk text: {chunk_text}")
+
+                    except Exception:
+                        continue
+
+        except Exception:
+            text = traceback.format_exc()
+
+        finally:
+            # Delete temporal directory and files
+            shutil.rmtree(chunks_directory)
+
+            return text
diff --git a/src/handlers/whisperx_handler.py b/src/handlers/whisperx_handler.py
@@ -0,0 +1,99 @@
+import os
+import traceback
+from pathlib import Path
+
+import utils.config_manager as cm
+import whisperx
+from models.transcription import Transcription
+
+
+class WhisperXHandler:
+    def __init__(self):
+        self._whisperx_result = None
+
+    async def transcribe_file(self, transcription: Transcription) -> str:
+        """
+        Transcribe audio from a file using the WhisperX library.
+
+        :param transcription: An instance of Transcription containing information about
+                              the audio file.
+        :type transcription: Transcription
+        :return: The transcribed text or an error message if transcription fails.
+        :rtype: str
+        """
+        config_whisperx = cm.ConfigManager.get_config_whisperx()
+
+        device = "cpu" if config_whisperx.use_cpu else "cuda"
+        task = "translate" if transcription.should_translate else "transcribe"
+
+        try:
+            model = whisperx.load_model(
+                config_whisperx.model_size,
+                device,
+                compute_type=config_whisperx.compute_type,
+                task=task,
+                language=transcription.language_code,
+            )
+
+            audio_path = str(transcription.source_path)
+            audio = whisperx.load_audio(audio_path)
+            self._whisperx_result = model.transcribe(
+                audio, batch_size=config_whisperx.batch_size
+            )
+
+            text_combined = " ".join(
+                segment["text"].strip() for segment in self._whisperx_result["segments"]
+            )
+
+            # Align output if should subtitle
+            if transcription.should_subtitle:
+                model_aligned, metadata = whisperx.load_align_model(
+                    language_code=transcription.language_code, device=device
+                )
+                self._whisperx_result = whisperx.align(
+                    self._whisperx_result["segments"],
+                    model_aligned,
+                    metadata,
+                    audio,
+                    device,
+                    return_char_alignments=False,
+                )
+
+            return text_combined
+
+        except Exception:
+            return traceback.format_exc()
+
+    def generate_subtitles(self, file_path: Path, should_overwrite: bool):
+        """
+        Generate subtitles for a video or audio file.
+
+        :param file_path: The path to the video or audio file for which subtitles are
+                          to be generated.
+        :type file_path: Path
+        :param should_overwrite: Indicates whether existing subtitle files should be
+                                overwritten if they exist. If False, subtitles will
+                                only be generated if no subtitle file exists for
+                                the given format.
+        :type should_overwrite: bool
+        """
+        config_subtitles = cm.ConfigManager.get_config_subtitles()
+
+        output_formats = ["srt", "vtt"]
+        output_dir = file_path.parent
+
+        for output_format in output_formats:
+            path_to_check = file_path.parent / f"{file_path.stem}.{output_format}"
+
+            if should_overwrite or not os.path.exists(path_to_check):
+                writer = whisperx.transcribe.get_writer(output_format, str(output_dir))
+                writer_args = {
+                    "highlight_words": config_subtitles.highlight_words,
+                    "max_line_count": config_subtitles.max_line_count,
+                    "max_line_width": config_subtitles.max_line_width,
+                }
+
+                # https://github.com/m-bain/whisperX/issues/455#issuecomment-1707547704
+                self._whisperx_result["language"] = "en"
+
+                writer(self._whisperx_result, file_path, writer_args)
diff --git a/src/handlers/youtube_handler.py b/src/handlers/youtube_handler.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+from typing import Optional
+
+from pytube import YouTube
+
+
+class YouTubeHandler:
+    @staticmethod
+    def download_audio_from_video(
+        url: str,
+        output_path: str = ".",
+        output_filename: str = "yt-audio.mp3",
+    ) -> Optional[Path]:
+        """
+        Downloads audio from a YouTube video.
+
+        :param url: The URL of the YouTube video.
+        :param output_path: (Optional) The directory where the audio file will be saved.
+                            Default is the current directory.
+        :param output_filename: (Optional) The name of the audio file to be saved.
+                                 Default is "yt-audio.mp3".
+        :return: The path to the downloaded audio file as a Path object,
+                 or None if the download fails.
+        """
+        try:
+            yt = YouTube(url)
+            stream = yt.streams.filter(only_audio=True).first()
+            output_file = stream.download(
+                output_path=output_path, filename=output_filename
+            )
+
+            return Path(output_file) if output_file else None
+
+        except Exception:
+            return None
diff --git a/src/models/transcription.py b/src/models/transcription.py
@@ -9,7 +9,7 @@
 class Transcription:
     text: Optional[str] = None
     language_code: Optional[str] = None
-    source: Optional[AudioSource] = None
+    source_type: Optional[AudioSource] = None
     source_path: Path = Path("/")
     method: Optional[int] = None
     should_translate: bool = False
diff --git a/src/utils/audio_utils.py b/src/utils/audio_utils.py
@@ -2,7 +2,15 @@
 from pydub import AudioSegment
 
 
-def save_audio_data(audio_data, filename):
+def save_audio_data(audio_data: list[sr.AudioData], filename: str):
+    """
+    Save recorded audio data to a WAV file.
+
+    :param audio_data: A list of recorded audio chunks.
+    :type audio_data: list[sr.AudioData]
+    :param filename: The name of the file to save the audio data to.
+    :type filename: str
+    """
     if audio_data:
         raw_audio_data = b"".join(
             [
diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py
@@ -9,11 +9,11 @@
 
 
 class ConfigManager:
-    _FILE_PATH = ROOT_PATH / "config.ini"
+    _CONFIG_FILE_PATH = ROOT_PATH / "config.ini"
     KeyType = Union[ConfigWhisperX.Key, ConfigGoogleApi.Key, ConfigSubtitles.Key]
 
     @staticmethod
-    def read_config(file_path: Path = _FILE_PATH) -> Optional[ConfigParser]:
+    def read_config(file_path: Path = _CONFIG_FILE_PATH) -> Optional[ConfigParser]:
         config = ConfigParser()
         config.read(file_path)
         return config
@@ -62,7 +62,7 @@ def get_config_subtitles() -> ConfigSubtitles:
     def get_value(
         section: KeyType,
         key: KeyType,
-        file_path: Path = _FILE_PATH,
+        file_path: Path = _CONFIG_FILE_PATH,
     ) -> Optional[Union[str, bool, int, float]]:
         config = ConfigManager.read_config(file_path)
 
@@ -91,7 +91,7 @@ def modify_value(
         section: KeyType,
         key: KeyType,
         new_value: str,
-        file_path: Path = _FILE_PATH,
+        file_path: Path = _CONFIG_FILE_PATH,
     ):
         config = ConfigManager.read_config(file_path)
 
diff --git a/src/utils/constants.py b/src/utils/constants.py
@@ -128,3 +128,6 @@
     ".ogv", ".ogx",  # OGG
     ".wmv", ".asf"  # AIFF / ASF
 ]
+# fmt: on
+
+SUPPORTED_FILE_EXTENSIONS = AUDIO_FILE_EXTENSIONS + VIDEO_FILE_EXTENSIONS
diff --git a/src/utils/i18n.py b/src/utils/i18n.py
diff --git a/src/views/main_window.py b/src/views/main_window.py

Original file line number	Diff line number	Diff line change
`@@ -128,3 +128,6 @@`
`128`	`128`	`".ogv", ".ogx", # OGG`
`129`	`129`	`".wmv", ".asf" # AIFF / ASF`
`130`	`130`	`]`
	`131`	`+# fmt: on`
	`132`	`+`
	`133`	`+SUPPORTED_FILE_EXTENSIONS = AUDIO_FILE_EXTENSIONS + VIDEO_FILE_EXTENSIONS`