yagyesh-bobde · dhananjaylab · Aug 13, 2025
diff --git a/whisper/.env b/whisper/.env
@@ -0,0 +1 @@
+OPENAI_API_KEY=OPENAI_API_KEY
diff --git a/whisper/1_basic_call_english_only.py b/whisper/1_basic_call_english_only.py
@@ -0,0 +1,15 @@
+import whisper
+from pathlib import Path
+
+
+# The "medium.en" checkpoint is loaded once, when the script starts.
+MODEL = whisper.load_model("medium.en")
+# Sample clips live in a folder next to this script.
+AUDIO_DIR = Path(__file__).parent.joinpath("test_audio_files")
+
+
+def get_transcription(audio_file: str):
+    """Transcribe ``audio_file`` with ``MODEL``, print the raw result and return it."""
+    transcription = MODEL.transcribe(audio_file)
+    print(transcription)
+    return transcription
+
+
+# Demo run against the bundled "terrible_quality.mp3" clip.
+demo_clip = AUDIO_DIR / "terrible_quality.mp3"
+get_transcription(str(demo_clip))
diff --git a/whisper/1_multiple_languages.py b/whisper/1_multiple_languages.py
@@ -0,0 +1,35 @@
+import whisper
+from pathlib import Path
+
+# Sample clips live in a folder next to this script.
+AUDIO_DIR = Path(__file__).parent.joinpath("test_audio_files")
+# The "medium" checkpoint is loaded once, when the script starts.
+model = whisper.load_model("medium")
+
+
+def detect_language_and_transcribe(audio_file: str):
+    """Detect the spoken language of ``audio_file`` and return its decoded text.
+
+    The audio is loaded, passed through ``whisper.pad_or_trim`` and converted
+    to a log-Mel spectrogram on the model's device. The most probable language
+    reported by ``model.detect_language`` is printed and then used for a
+    ``task="transcribe"`` decode; the decode result is printed before its
+    ``text`` is returned.
+
+    NOTE(review): ``pad_or_trim`` presumably limits the input to Whisper's
+    fixed-length window, so long recordings would be cut off -- confirm.
+    """
+    waveform = whisper.pad_or_trim(whisper.load_audio(audio_file))
+    mel = whisper.log_mel_spectrogram(waveform).to(model.device)
+
+    _, language_probs = model.detect_language(mel)
+    # Pick the language whose probability is highest.
+    language: str = max(language_probs, key=language_probs.get)  # type: ignore
+    print(f"Detected language: {language}")
+
+    decoded = whisper.decode(
+        model,
+        mel,
+        whisper.DecodingOptions(language=language, task="transcribe"),
+    )
+    print(decoded)
+    return decoded.text  # type: ignore
+
+
+# dutch_test = detect_language_and_transcribe(
+#     str(AUDIO_DIR / "dutch_the_netherlands.mp3")
+# )
+
+
+# result = model.transcribe(str(AUDIO_DIR / "dutch_the_netherlands.mp3"), verbose=True)
+# print(result["text"])
+
+# Run the Dutch sample with language="nl" and task="translate", then print the text.
+dutch_clip = str(AUDIO_DIR / "dutch_the_netherlands.mp3")
+result = model.transcribe(dutch_clip, verbose=True, language="nl", task="translate")
+print(result["text"])
diff --git a/whisper/2_whisper_pods.py b/whisper/2_whisper_pods.py
@@ -0,0 +1,88 @@
+import uuid
+from pathlib import Path
+
+import gradio as gr
+import whisper
+from whisper.utils import WriteSRT, WriteVTT
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, STYLES_DIR
+from utils import podcast, subtitles
+
+
+# Built once at import: the "base" model and one subtitle writer per format,
+# both writers targeting OUTPUT_TEMP_DIR.
+WHISPER_MODEL = whisper.load_model("base")
+VTT_WRITER = WriteVTT(output_dir=str(OUTPUT_TEMP_DIR))
+SRT_WRITER = WriteSRT(output_dir=str(OUTPUT_TEMP_DIR))
+
+
+def transcribe_and_summarize(page_link: str) -> tuple[str, str, str, str]:
+    """Download, transcribe and summarize the podcast found at ``page_link``.
+
+    Returns a ``(summary, transcription, vtt_path, srt_path)`` tuple, where
+    the last two items are the stringified results of
+    ``subtitles.write_to_file`` for the VTT and SRT writers.
+
+    Side effect: the raw Whisper output is written to ``pods_log.txt`` in
+    ``BASE_DIR``, replacing any previous content (the file is opened in
+    ``"w"`` mode).
+    """
+    run_id = uuid.uuid4()
+
+    def subtitle_path(extension: str) -> Path:
+        # Every artifact of this run shares the same UUID-based file stem.
+        return OUTPUT_TEMP_DIR / f"{run_id}{extension}"
+
+    download_url = podcast.scrape_link_from_page(page_link)
+    mp3_file: Path = podcast.download(download_url, run_id, OUTPUT_TEMP_DIR)
+
+    whisper_output = WHISPER_MODEL.transcribe(str(mp3_file))
+    log_file = BASE_DIR / "pods_log.txt"
+    with open(log_file, "w", encoding="utf-8") as log:
+        log.write(str(whisper_output))
+
+    transcription = str(whisper_output["text"])
+    summary = podcast.get_summary(transcription)
+
+    vtt_subs = subtitles.write_to_file(whisper_output, VTT_WRITER, subtitle_path(".vtt"))
+    srt_subs = subtitles.write_to_file(whisper_output, SRT_WRITER, subtitle_path(".srt"))
+
+    return summary, transcription, str(vtt_subs), str(srt_subs)
+
+
+if __name__ == "__main__":
+    # The UI is only built and launched when this file is run as a script.
+    # The css argument receives the path of whisper_pods.css as a string.
+    block = gr.Blocks(css=str(STYLES_DIR / "whisper_pods.css"))
+
+    with block:
+        with gr.Group():
+            # Banner image wrapped in a div carrying the "header" class.
+            gr.HTML(
+                f"""
+                <div class="header">
+                <img src="https://i.imgur.com/8Xu2rwG.png" referrerpolicy="no-referrer" />
+                </div>
+                """
+            )
+
+            # Single input: the podcast page link handed to transcribe_and_summarize.
+            podcast_link_input = gr.Textbox(label="Google Podcasts Link:")
+
+            with gr.Row():
+                btn = gr.Button("🎙️ Transcribe and summarize my podcast! 🎙️")
+
+            # Text outputs: summary first, then the full transcription.
+            summary_output = gr.Textbox(
+                label="Podcast Summary",
+                placeholder="Podcast Summary",
+                lines=4,
+                autoscroll=False,
+            )
+
+            transcription_output = gr.Textbox(
+                label="Podcast Transcription",
+                placeholder="Podcast Transcription",
+                lines=8,
+                autoscroll=False,
+            )
+
+            # File outputs for the two subtitle formats, side by side.
+            with gr.Row():
+                vtt_sub_output = gr.File(
+                    label="VTT Subtitle file download", elem_classes=["vtt-sub-file"]
+                )
+                srt_sub_output = gr.File(
+                    label="SRT Subtitle file download", elem_classes=["srt-sub-file"]
+                )
+
+            # The outputs are listed in the same order as the tuple returned by
+            # transcribe_and_summarize: summary, transcription, VTT path, SRT path.
+            btn.click(
+                transcribe_and_summarize,
+                inputs=[podcast_link_input],
+                outputs=[
+                    summary_output,
+                    transcription_output,
+                    vtt_sub_output,
+                    srt_sub_output,
+                ],
+            )
+
+    block.launch(debug=True)
diff --git a/whisper/3_subtitle_master.py b/whisper/3_subtitle_master.py
@@ -0,0 +1,76 @@
+import os
+import uuid
+
+import gradio as gr
+import whisper
+from whisper.utils import WriteVTT
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, OUTPUT_VIDEO_DIR, STYLES_DIR
+from utils import command, subtitles, video
+
+
+# Built once at import: the "base.en" Whisper checkpoint and a VTT subtitle
+# writer whose output directory is OUTPUT_TEMP_DIR.
+MODEL = whisper.load_model("base.en")
+VTT_WRITER = WriteVTT(output_dir=str(OUTPUT_TEMP_DIR))
+
+
+def get_unique_project_name(input_video: str) -> str:
+    """Build a project name that will not clash with earlier runs.
+
+    The name is the video's file name without its last extension, followed by
+    an underscore and a freshly generated UUID4.
+    """
+    stem = os.path.splitext(os.path.basename(input_video))[0]
+    return "_".join((stem, str(uuid.uuid4())))
+
+
+def main(input_video: str) -> str:
+    """Burn Whisper-generated subtitles into a video and return the new file's path.
+
+    Steps: extract the audio to MP3, transcribe it with ``MODEL``, write the
+    result as a VTT file, then run ffmpeg with the ``subtitles`` filter to
+    render the subtitles onto a copy of the video in ``OUTPUT_VIDEO_DIR``.
+
+    Args:
+        input_video: Path of the uploaded video file.
+
+    Returns:
+        The path of the subtitled ``.mp4`` as a string.
+
+    Raises:
+        gr.Error: If no video was supplied, or if ffmpeg did not produce the
+            output file.
+    """
+    if not input_video:
+        # The button can be clicked before anything has been uploaded.
+        raise gr.Error("Please upload a video first.")
+
+    unique_project_name = get_unique_project_name(input_video)
+
+    def get_temp_output_path(ext: str):
+        # All intermediate files of this run share the unique project name.
+        return OUTPUT_TEMP_DIR / f"{unique_project_name}{ext}"
+
+    mp3_file = video.to_mp3(
+        input_video,
+        log_directory=BASE_DIR,
+        output_path=get_temp_output_path(".mp3"),
+    )
+
+    whisper_output = MODEL.transcribe(mp3_file, beam_size=5)
+    vtt_subs = subtitles.write_to_file(
+        whisper_output,
+        writer=VTT_WRITER,
+        output_path=get_temp_output_path(".vtt"),
+    )
+
+    # The subtitle path sits inside an ffmpeg filter expression, so it needs
+    # ffmpeg-specific escaping rather than plain quoting.
+    vtt_string_path = command.format_ffmpeg_filepath(vtt_subs)
+
+    # ffmpeg does not create a missing output directory by itself.
+    OUTPUT_VIDEO_DIR.mkdir(parents=True, exist_ok=True)
+    output_video_path = OUTPUT_VIDEO_DIR / f"{unique_project_name}_subs.mp4"
+    embed_subs_into_vid_command = f'ffmpeg -i "{input_video}" -vf "subtitles=\'{vtt_string_path}\'" "{output_video_path}"'
+
+    command.run_and_log(embed_subs_into_vid_command, log_directory=BASE_DIR)
+
+    # run_and_log does not report failures, so confirm that the file exists
+    # instead of handing the UI a path to nothing.
+    if not output_video_path.exists():
+        raise gr.Error(
+            "ffmpeg did not produce the subtitled video; check the command log for details."
+        )
+
+    return str(output_video_path)
+
+
+if __name__ == "__main__":
+    # The UI is only built and launched when this file is run as a script.
+    # The css argument receives the path of subtitle_master.css as a string.
+    block = gr.Blocks(
+        css=str(STYLES_DIR / "subtitle_master.css"),
+        theme=gr.themes.Soft(primary_hue=gr.themes.colors.emerald),
+    )
+
+    with block:
+        with gr.Group():
+            # Banner image wrapped in a div carrying the "header" class.
+            gr.HTML(
+                f"""
+                <div class="header">
+                <img src="https://i.imgur.com/dxHMfCI.png" referrerpolicy="no-referrer" />
+                </div>
+                """
+            )
+            # Input video (upload only) next to the output video.
+            with gr.Row():
+                input_video = gr.Video(
+                    label="Input Video", sources=["upload"], mirror_webcam=False
+                )
+                output_video = gr.Video()
+            with gr.Row():
+                button_text = "🎞️ Subtitle my video! 🎞️"
+                btn = gr.Button(value=button_text, elem_classes=["button-row"])
+
+            # Clicking the button passes the input video to main() and shows
+            # the value it returns in the output video component.
+            btn.click(main, inputs=[input_video], outputs=[output_video])
+
+    block.launch(debug=True)
diff --git a/whisper/4_faster_whisper.py b/whisper/4_faster_whisper.py
@@ -0,0 +1,22 @@
+from faster_whisper import WhisperModel
+from settings import TEST_AUDIO_DIR
+
+model_size = "small"
+
+# Active configuration: CPU inference with compute_type="int8".
+model = WhisperModel(model_size, device="cpu", compute_type="int8")
+# To run on a CUDA GPU instead, replace the line above with this one:
+# model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+
+audio_path = str(TEST_AUDIO_DIR / "dutch_long_repeat_file.mp3")
+segments, info = model.transcribe(audio_path, beam_size=5, without_timestamps=True)
+
+language_report = (
+    f"Detected language '{info.language}' with probability {info.language_probability}"
+)
+print(language_report)
+
+# Print each segment as "[start -> end] text".
+for segment in segments:
+    segment_line = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}"
+    print(segment_line)
diff --git a/whisper/4_vid_to_quiz.py b/whisper/4_vid_to_quiz.py
@@ -0,0 +1,69 @@
+import os
+import uuid
+from pathlib import Path
+
+import gradio as gr
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, STYLES_DIR
+from utils import openai_api, video
+
+
+API_UPLOAD_LIMIT_BYTES = 25 * 1024 * 1024  # 26214400 bytes
+
+
+def check_upload_size(input_file: str) -> None:
+    """Raise ``ValueError`` when ``input_file`` is larger than ``API_UPLOAD_LIMIT_BYTES``.
+
+    A file of exactly ``API_UPLOAD_LIMIT_BYTES`` bytes is still accepted.
+    """
+    input_file_size = os.path.getsize(input_file)
+    if input_file_size <= API_UPLOAD_LIMIT_BYTES:
+        return
+    raise ValueError(
+        f"File size of {input_file_size} bytes ({input_file_size / 1024 / 1024:.2f} MB) exceeds the API upload limit of {API_UPLOAD_LIMIT_BYTES} bytes ({API_UPLOAD_LIMIT_BYTES / 1024 / 1024:.2f} MB). Please use a shorter video or lower the audio quality settings."
+    )
+
+
+def main(input_video: str) -> str:
+    """Turn a video file (given as a string path) into a quiz, returned as a string.
+
+    The audio is extracted with ``video.to_mp3`` (``mono=True``) into
+    ``OUTPUT_TEMP_DIR``, checked against the upload size limit, transcribed
+    through ``openai_api.transcribe`` and finally handed to
+    ``openai_api.text_to_quiz``.
+
+    Raises:
+        ValueError: Propagated from ``check_upload_size`` when the MP3 is too large.
+    """
+    audio_target = OUTPUT_TEMP_DIR / f"{uuid.uuid4()}.mp3"
+    mp3_file = video.to_mp3(
+        input_video,
+        log_directory=BASE_DIR,
+        output_path=audio_target,
+        mono=True,
+    )
+
+    # Fail before calling the API if the audio is over the limit.
+    check_upload_size(mp3_file)
+
+    transcription = openai_api.transcribe(
+        Path(mp3_file),
+        language="en",
+        translate=False,
+        response_format="text",
+    )
+    return openai_api.text_to_quiz(transcription)
+
+
+if __name__ == "__main__":
+    # The UI is only built and launched when this file is run as a script.
+    # The css argument receives the path of vid2quiz.css as a string.
+    block = gr.Blocks(
+        css=str(STYLES_DIR / "vid2quiz.css"),
+        theme=gr.themes.Soft(primary_hue=gr.themes.colors.yellow),
+    )
+
+    with block:
+        with gr.Group():
+            # Banner image; "header" and "header-img" are the stylesheet hooks.
+            gr.HTML(
+                f"""
+                <div class="header">
+                <img src="https://i.imgur.com/oEtZKEh.png" referrerpolicy="no-referrer" class="header-img" />
+                </div>
+                """
+            )
+            # Input video (upload only) next to the quiz text output.
+            with gr.Row():
+                input_video = gr.Video(
+                    label="Input Video", sources=["upload"], mirror_webcam=False
+                )
+                output_quiz_text = gr.Textbox(label="Quiz")
+            with gr.Row():
+                button_text = "📝 Make a quiz about this video! 📝"
+                btn = gr.Button(value=button_text, elem_classes=["button-row"])
+
+            # Clicking the button passes the input video to main() and shows
+            # the returned quiz string in the textbox.
+            btn.click(main, inputs=[input_video], outputs=[output_quiz_text])
+
+    block.launch(debug=True)
diff --git a/whisper/images/subtitle_master.png b/whisper/images/subtitle_master.png
diff --git a/whisper/images/vid2quiz.png b/whisper/images/vid2quiz.png
diff --git a/whisper/images/whisper_pods.png b/whisper/images/whisper_pods.png
diff --git a/whisper/settings.py b/whisper/settings.py
@@ -0,0 +1,7 @@
+from pathlib import Path
+
+# Directory containing this settings module; every other path is derived from it.
+BASE_DIR = Path(__file__).parent
+
+# Sub-directories of BASE_DIR used by the scripts.
+OUTPUT_TEMP_DIR = BASE_DIR.joinpath("output_temp_files")
+OUTPUT_VIDEO_DIR = BASE_DIR.joinpath("output_video")
+STYLES_DIR = BASE_DIR.joinpath("styles")
+TEST_AUDIO_DIR = BASE_DIR.joinpath("test_audio_files")
diff --git a/whisper/styles/subtitle_master.css b/whisper/styles/subtitle_master.css
@@ -0,0 +1,8 @@
+/* Padding around the element carrying the "header" class. */
+.header {
+  padding: 2em 8em;
+}
+
+/* Shared background for the header and the button row; the 8-digit hex
+   value includes an alpha channel (7e). */
+.header,
+.button-row {
+  background-color: #1d366f7e;
+}
diff --git a/whisper/styles/vid2quiz.css b/whisper/styles/vid2quiz.css
@@ -0,0 +1,15 @@
+/* Flex container that centers its content on both axes, with padding. */
+.header {
+  display: flex;
+  justify-content: center;
+  align-items: center;
+  padding: 2em 8em;
+}
+
+/* Keep the header image to at most half of its container's width. */
+.header-img {
+  max-width: 50%;
+}
+
+/* Shared background color for the header and the button row. */
+.header,
+.button-row {
+  background-color: #0c1d36;
+}
diff --git a/whisper/styles/whisper_pods.css b/whisper/styles/whisper_pods.css
@@ -0,0 +1,8 @@
+/* Padding around the element carrying the "header" class. */
+.header {
+  padding: 2em 8em;
+}
+
+/* Fixed height for the elements tagged "vtt-sub-file" and "srt-sub-file". */
+.vtt-sub-file,
+.srt-sub-file {
+  height: 80px;
+}
diff --git a/whisper/test_audio_files/dutch_long_repeat_file.mp3 b/whisper/test_audio_files/dutch_long_repeat_file.mp3
diff --git a/whisper/test_audio_files/dutch_the_netherlands.mp3 b/whisper/test_audio_files/dutch_the_netherlands.mp3
diff --git a/whisper/test_audio_files/high_quality.mp3 b/whisper/test_audio_files/high_quality.mp3
diff --git a/whisper/test_audio_files/low_quality.mp3 b/whisper/test_audio_files/low_quality.mp3
diff --git a/whisper/test_audio_files/terrible_quality.mp3 b/whisper/test_audio_files/terrible_quality.mp3
diff --git a/whisper/utils/__init__.py b/whisper/utils/__init__.py
diff --git a/whisper/utils/command.py b/whisper/utils/command.py
@@ -0,0 +1,26 @@
+import datetime
+import subprocess
+from pathlib import Path
+
+
+def print_blue(message: str) -> None:
+    """Print ``message`` wrapped in ANSI escape sequences (code 94, then reset)."""
+    color_on, color_off = "\033[94m", "\033[00m"
+    print(f"{color_on}{message}{color_off}")
+
+
+def run_and_log(command: str, log_directory: Path) -> None:
+    """Run ``command`` and append its output to ``commands_log.txt``.
+
+    The child's stdout and stderr go straight into
+    ``log_directory / "commands_log.txt"``, followed by a footer recording the
+    command, its exit code and the current date/time.
+
+    Args:
+        command: Full command line as a single string.
+        log_directory: Directory in which ``commands_log.txt`` is opened for appending.
+
+    A non-zero exit code is logged and printed but never raised, so callers
+    that depend on the command's result must verify it themselves.
+    """
+    import os
+    import shlex
+
+    print_blue(f"Running command: \n{command}")
+    # Without a shell, POSIX treats a plain string as the program name itself,
+    # so the command line is split into an argument list there. Windows
+    # accepts the string unchanged.
+    args = command if os.name == "nt" else shlex.split(command)
+    with open(log_directory / "commands_log.txt", "a+", encoding="utf-8") as file:
+        exit_code = subprocess.call(
+            args,
+            stdout=file,
+            stderr=file,
+        )
+        file.write(
+            f"\nRan command: {command}\nExit code: {exit_code}\nDate/time: {datetime.datetime.now()}\n\n\n\n"
+        )
+    if exit_code != 0:
+        print_blue(f"Command exited with code {exit_code}; see commands_log.txt")
+
+
+def format_ffmpeg_filepath(path: Path) -> str:
+    r"""Escape the separators and colons of ``path``.
+
+    Every backslash and every forward slash becomes a doubled backslash, and
+    every colon gets a backslash in front of it, for example
+    C:\Users\dirk\test/subtitle.vtt -> C\:\\Users\\dirk\\test\\subtitle.vtt
+    """
+    escapes = str.maketrans({"\\": "\\\\", "/": "\\\\", ":": "\\:"})
+    return str(path).translate(escapes)