diff --git a/whisper/.env b/whisper/.env
new file mode 100644
index 0000000..85b4f6e
--- /dev/null
+++ b/whisper/.env
@@ -0,0 +1 @@
+OPENAI_API_KEY=OPENAI_API_KEY
\ No newline at end of file
diff --git a/whisper/1_basic_call_english_only.py b/whisper/1_basic_call_english_only.py
new file mode 100644
index 0000000..ecc3d96
--- /dev/null
+++ b/whisper/1_basic_call_english_only.py
@@ -0,0 +1,15 @@
+import whisper
+from pathlib import Path
+
+
+MODEL = whisper.load_model("medium.en")
+AUDIO_DIR = Path(__file__).parent / "test_audio_files"
+
+
+def get_transcription(audio_file: str):
+ result = MODEL.transcribe(audio_file)
+ print(result)
+ return result
+
+
+get_transcription(str(AUDIO_DIR / "terrible_quality.mp3"))
diff --git a/whisper/1_multiple_languages.py b/whisper/1_multiple_languages.py
new file mode 100644
index 0000000..7240ca6
--- /dev/null
+++ b/whisper/1_multiple_languages.py
@@ -0,0 +1,35 @@
+import whisper
+from pathlib import Path
+
+# Sample audio clips used by the demos below.
+AUDIO_DIR = Path(__file__).parent / "test_audio_files"
+# Multilingual "medium" model (no ".en" suffix) so language detection works.
+model = whisper.load_model("medium")
+
+
+def detect_language_and_transcribe(audio_file: str):
+    """Detect the spoken language of *audio_file*, then transcribe it in that language.
+
+    NOTE: pad_or_trim clips the audio to whisper's 30-second window, so only
+    the first 30 seconds are detected and transcribed here.
+    """
+    audio = whisper.load_audio(audio_file)
+    audio = whisper.pad_or_trim(audio)
+    # The log-Mel spectrogram must live on the same device as the model.
+    mel = whisper.log_mel_spectrogram(audio).to(model.device)
+    _, language_probs = model.detect_language(mel)
+    # Pick the language code with the highest detection probability.
+    language: str = max(language_probs, key=language_probs.get)  # type: ignore
+    print(f"Detected language: {language}")
+    options = whisper.DecodingOptions(language=language, task="transcribe")
+    result = whisper.decode(model, mel, options)
+    print(result)
+    return result.text  # type: ignore
+
+
+# dutch_test = detect_language_and_transcribe(
+# str(AUDIO_DIR / "dutch_the_netherlands.mp3")
+# )
+
+
+# result = model.transcribe(str(AUDIO_DIR / "dutch_the_netherlands.mp3"), verbose=True)
+# print(result["text"])
+
+result = model.transcribe(
+ str(AUDIO_DIR / "dutch_the_netherlands.mp3"),
+ verbose=True,
+ language="nl",
+ task="translate",
+)
+print(result["text"])
diff --git a/whisper/2_whisper_pods.py b/whisper/2_whisper_pods.py
new file mode 100644
index 0000000..13348bc
--- /dev/null
+++ b/whisper/2_whisper_pods.py
@@ -0,0 +1,88 @@
+import uuid
+from pathlib import Path
+
+import gradio as gr
+import whisper
+from whisper.utils import WriteSRT, WriteVTT
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, STYLES_DIR
+from utils import podcast, subtitles
+
+
+WHISPER_MODEL = whisper.load_model("base")
+VTT_WRITER = WriteVTT(output_dir=str(OUTPUT_TEMP_DIR))
+SRT_WRITER = WriteSRT(output_dir=str(OUTPUT_TEMP_DIR))
+
+
+def transcribe_and_summarize(page_link: str) -> tuple[str, str, str, str]:
+ unique_id = uuid.uuid4()
+
+ podcast_download_url = podcast.scrape_link_from_page(page_link)
+ mp3_file: Path = podcast.download(podcast_download_url, unique_id, OUTPUT_TEMP_DIR)
+
+ whisper_output = WHISPER_MODEL.transcribe(str(mp3_file))
+ with open(BASE_DIR / "pods_log.txt", "w", encoding="utf-8") as f:
+ f.write(str(whisper_output))
+
+ transcription = str(whisper_output["text"])
+ summary = podcast.get_summary(transcription)
+
+ get_sub_path = lambda ext: OUTPUT_TEMP_DIR / f"{unique_id}{ext}"
+ vtt_subs = subtitles.write_to_file(whisper_output, VTT_WRITER, get_sub_path(".vtt"))
+ srt_subs = subtitles.write_to_file(whisper_output, SRT_WRITER, get_sub_path(".srt"))
+
+ return (summary, transcription, str(vtt_subs), str(srt_subs))
+
+
+if __name__ == "__main__":
+    # Build the Gradio UI; the custom CSS sizes the subtitle download widgets.
+    block = gr.Blocks(css=str(STYLES_DIR / "whisper_pods.css"))
+
+    with block:
+        with gr.Group():
+            # Page header (the HTML body appears empty in this revision).
+            gr.HTML(
+                f"""
+
+            """
+            )
+
+            podcast_link_input = gr.Textbox(label="Google Podcasts Link:")
+
+            with gr.Row():
+                btn = gr.Button("🎙️ Transcribe and summarize my podcast! 🎙️")
+
+            summary_output = gr.Textbox(
+                label="Podcast Summary",
+                placeholder="Podcast Summary",
+                lines=4,
+                autoscroll=False,
+            )
+
+            transcription_output = gr.Textbox(
+                label="Podcast Transcription",
+                placeholder="Podcast Transcription",
+                lines=8,
+                autoscroll=False,
+            )
+
+            with gr.Row():
+                vtt_sub_output = gr.File(
+                    label="VTT Subtitle file download", elem_classes=["vtt-sub-file"]
+                )
+                srt_sub_output = gr.File(
+                    label="SRT Subtitle file download", elem_classes=["srt-sub-file"]
+                )
+
+            # Wire the button to the pipeline; outputs map 1:1 to the returned tuple.
+            btn.click(
+                transcribe_and_summarize,
+                inputs=[podcast_link_input],
+                outputs=[
+                    summary_output,
+                    transcription_output,
+                    vtt_sub_output,
+                    srt_sub_output,
+                ],
+            )
+
+    block.launch(debug=True)
diff --git a/whisper/3_subtitle_master.py b/whisper/3_subtitle_master.py
new file mode 100644
index 0000000..de125bf
--- /dev/null
+++ b/whisper/3_subtitle_master.py
@@ -0,0 +1,76 @@
+import os
+import uuid
+
+import gradio as gr
+import whisper
+from whisper.utils import WriteVTT
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, OUTPUT_VIDEO_DIR, STYLES_DIR
+from utils import command, subtitles, video
+
+
+MODEL = whisper.load_model("base.en")
+VTT_WRITER = WriteVTT(output_dir=str(OUTPUT_TEMP_DIR))
+
+
+def get_unique_project_name(input_video: str) -> str:
+ """Get a unique subtitle-master project name to avoid file-name clashes."""
+ unique_id = uuid.uuid4()
+ filename = os.path.basename(input_video)
+ base_fname, _ = os.path.splitext(filename)
+ return f"{base_fname}_{unique_id}"
+
+
+def main(input_video: str) -> str:
+ """Takes a video file as string path and returns a video file with subtitles embedded as string path."""
+ unique_project_name = get_unique_project_name(input_video)
+ get_temp_output_path = lambda ext: OUTPUT_TEMP_DIR / f"{unique_project_name}{ext}"
+ mp3_file = video.to_mp3(
+ input_video,
+ log_directory=BASE_DIR,
+ output_path=get_temp_output_path(".mp3"),
+ )
+
+ whisper_output = MODEL.transcribe(mp3_file, beam_size=5)
+ vtt_subs = subtitles.write_to_file(
+ whisper_output,
+ writer=VTT_WRITER,
+ output_path=get_temp_output_path(".vtt"),
+ )
+
+ vtt_string_path = command.format_ffmpeg_filepath(vtt_subs)
+ output_video_path = OUTPUT_VIDEO_DIR / f"{unique_project_name}_subs.mp4"
+ embed_subs_into_vid_command = f'ffmpeg -i "{input_video}" -vf "subtitles=\'{vtt_string_path}\'" "{output_video_path}"'
+
+ command.run_and_log(embed_subs_into_vid_command, log_directory=BASE_DIR)
+
+ return str(output_video_path)
+
+
+if __name__ == "__main__":
+    # Gradio UI with custom CSS and an emerald Soft theme.
+    block = gr.Blocks(
+        css=str(STYLES_DIR / "subtitle_master.css"),
+        theme=gr.themes.Soft(primary_hue=gr.themes.colors.emerald),
+    )
+
+    with block:
+        with gr.Group():
+            # Page header (the HTML body appears empty in this revision).
+            gr.HTML(
+                f"""
+
+            """
+            )
+        with gr.Row():
+            input_video = gr.Video(
+                label="Input Video", sources=["upload"], mirror_webcam=False
+            )
+            output_video = gr.Video()
+        with gr.Row():
+            button_text = "🎞️ Subtitle my video! 🎞️"
+            btn = gr.Button(value=button_text, elem_classes=["button-row"])
+
+        # Button triggers the full subtitle pipeline defined in main().
+        btn.click(main, inputs=[input_video], outputs=[output_video])
+
+    block.launch(debug=True)
diff --git a/whisper/4_faster_whisper.py b/whisper/4_faster_whisper.py
new file mode 100644
index 0000000..e334213
--- /dev/null
+++ b/whisper/4_faster_whisper.py
@@ -0,0 +1,22 @@
+from faster_whisper import WhisperModel
+from settings import TEST_AUDIO_DIR
+
+model_size = "small"
+
+# int8 quantization keeps CPU inference fast and memory-friendly.
+model = WhisperModel(model_size, device="cpu", compute_type="int8")
+# # Choose only one of these, depending on if you're running on CPU or GPU (cuda). (I'll be using the second option)
+# model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+
+# NOTE(review): faster-whisper yields `segments` lazily — transcription work
+# happens while the loop below consumes the iterator; confirm against the
+# installed faster-whisper version.
+segments, info = model.transcribe(
+    str(TEST_AUDIO_DIR / "dutch_long_repeat_file.mp3"),
+    beam_size=5,
+    without_timestamps=True,
+)
+
+print(
+    f"Detected language '{info.language}' with probability {info.language_probability}"
+)
+
+for segment in segments:
+    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
diff --git a/whisper/4_vid_to_quiz.py b/whisper/4_vid_to_quiz.py
new file mode 100644
index 0000000..ad42b22
--- /dev/null
+++ b/whisper/4_vid_to_quiz.py
@@ -0,0 +1,69 @@
+import os
+import uuid
+from pathlib import Path
+
+import gradio as gr
+
+from settings import BASE_DIR, OUTPUT_TEMP_DIR, STYLES_DIR
+from utils import openai_api, video
+
+
+API_UPLOAD_LIMIT_BYTES = 26214400 # 25mb
+
+
+def check_upload_size(input_file: str) -> None:
+ """Check the video file size is within the API upload limit."""
+ input_file_size = os.path.getsize(input_file)
+ if input_file_size > API_UPLOAD_LIMIT_BYTES:
+ raise ValueError(
+ f"File size of {input_file_size} bytes ({input_file_size / 1024 / 1024:.2f} MB) exceeds the API upload limit of {API_UPLOAD_LIMIT_BYTES} bytes ({API_UPLOAD_LIMIT_BYTES / 1024 / 1024:.2f} MB). Please use a shorter video or lower the audio quality settings."
+ )
+
+
+def main(input_video: str) -> str:
+    """Takes a video file as string path and returns a quiz as string."""
+    unique_id = uuid.uuid4()
+
+    # Mono + low bitrate keeps the extracted audio small (see check_upload_size).
+    mp3_file = video.to_mp3(
+        input_video,
+        log_directory=BASE_DIR,
+        output_path=OUTPUT_TEMP_DIR / f"{unique_id}.mp3",
+        mono=True,
+    )
+
+    # Fail fast before uploading: the API rejects files over the 25 MB limit.
+    check_upload_size(mp3_file)
+    transcription = openai_api.transcribe(
+        Path(mp3_file), language="en", translate=False, response_format="text"
+    )
+
+    quiz = openai_api.text_to_quiz(transcription)
+    return quiz
+
+
+if __name__ == "__main__":
+    # Gradio UI with custom CSS and a yellow Soft theme.
+    block = gr.Blocks(
+        css=str(STYLES_DIR / "vid2quiz.css"),
+        theme=gr.themes.Soft(primary_hue=gr.themes.colors.yellow),
+    )
+
+    with block:
+        with gr.Group():
+            # Page header (the HTML body appears empty in this revision).
+            gr.HTML(
+                f"""
+
+            """
+            )
+        with gr.Row():
+            input_video = gr.Video(
+                label="Input Video", sources=["upload"], mirror_webcam=False
+            )
+            output_quiz_text = gr.Textbox(label="Quiz")
+        with gr.Row():
+            button_text = "📝 Make a quiz about this video! 📝"
+            btn = gr.Button(value=button_text, elem_classes=["button-row"])
+
+        # Button triggers the video → transcription → quiz pipeline in main().
+        btn.click(main, inputs=[input_video], outputs=[output_quiz_text])
+
+    block.launch(debug=True)
diff --git a/whisper/images/subtitle_master.png b/whisper/images/subtitle_master.png
new file mode 100644
index 0000000..419fbf5
Binary files /dev/null and b/whisper/images/subtitle_master.png differ
diff --git a/whisper/images/vid2quiz.png b/whisper/images/vid2quiz.png
new file mode 100644
index 0000000..72f54a8
Binary files /dev/null and b/whisper/images/vid2quiz.png differ
diff --git a/whisper/images/whisper_pods.png b/whisper/images/whisper_pods.png
new file mode 100644
index 0000000..55ed2ff
Binary files /dev/null and b/whisper/images/whisper_pods.png differ
diff --git a/whisper/settings.py b/whisper/settings.py
new file mode 100644
index 0000000..6ecce6f
--- /dev/null
+++ b/whisper/settings.py
@@ -0,0 +1,7 @@
+from pathlib import Path
+
+# All paths are anchored on this file's directory so the scripts work
+# regardless of the current working directory.
+BASE_DIR = Path(__file__).parent
+OUTPUT_TEMP_DIR = BASE_DIR / "output_temp_files"
+OUTPUT_VIDEO_DIR = BASE_DIR / "output_video"
+STYLES_DIR = BASE_DIR / "styles"
+TEST_AUDIO_DIR = BASE_DIR / "test_audio_files"
diff --git a/whisper/styles/subtitle_master.css b/whisper/styles/subtitle_master.css
new file mode 100644
index 0000000..f6bae17
--- /dev/null
+++ b/whisper/styles/subtitle_master.css
@@ -0,0 +1,8 @@
+.header {
+ padding: 2em 8em;
+}
+
+.header,
+.button-row {
+ background-color: #1d366f7e;
+}
diff --git a/whisper/styles/vid2quiz.css b/whisper/styles/vid2quiz.css
new file mode 100644
index 0000000..36fd9ad
--- /dev/null
+++ b/whisper/styles/vid2quiz.css
@@ -0,0 +1,15 @@
+.header {
+ display: flex;
+ justify-content: center;
+ align-items: center;
+ padding: 2em 8em;
+}
+
+.header-img {
+ max-width: 50%;
+}
+
+.header,
+.button-row {
+ background-color: #0c1d36;
+}
diff --git a/whisper/styles/whisper_pods.css b/whisper/styles/whisper_pods.css
new file mode 100644
index 0000000..eac2b60
--- /dev/null
+++ b/whisper/styles/whisper_pods.css
@@ -0,0 +1,8 @@
+.header {
+ padding: 2em 8em;
+}
+
+.vtt-sub-file,
+.srt-sub-file {
+ height: 80px;
+}
diff --git a/whisper/test_audio_files/dutch_long_repeat_file.mp3 b/whisper/test_audio_files/dutch_long_repeat_file.mp3
new file mode 100644
index 0000000..86989f9
Binary files /dev/null and b/whisper/test_audio_files/dutch_long_repeat_file.mp3 differ
diff --git a/whisper/test_audio_files/dutch_the_netherlands.mp3 b/whisper/test_audio_files/dutch_the_netherlands.mp3
new file mode 100644
index 0000000..9f1d479
Binary files /dev/null and b/whisper/test_audio_files/dutch_the_netherlands.mp3 differ
diff --git a/whisper/test_audio_files/high_quality.mp3 b/whisper/test_audio_files/high_quality.mp3
new file mode 100644
index 0000000..11ca665
Binary files /dev/null and b/whisper/test_audio_files/high_quality.mp3 differ
diff --git a/whisper/test_audio_files/low_quality.mp3 b/whisper/test_audio_files/low_quality.mp3
new file mode 100644
index 0000000..7449caf
Binary files /dev/null and b/whisper/test_audio_files/low_quality.mp3 differ
diff --git a/whisper/test_audio_files/terrible_quality.mp3 b/whisper/test_audio_files/terrible_quality.mp3
new file mode 100644
index 0000000..4a6230a
Binary files /dev/null and b/whisper/test_audio_files/terrible_quality.mp3 differ
diff --git a/whisper/utils/__init__.py b/whisper/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/whisper/utils/command.py b/whisper/utils/command.py
new file mode 100644
index 0000000..eb1f003
--- /dev/null
+++ b/whisper/utils/command.py
@@ -0,0 +1,26 @@
+import datetime
+import subprocess
+from pathlib import Path
+
+
+def print_blue(message: str) -> None:
+ print(f"\033[94m{message}\033[00m")
+
+
+def run_and_log(command: str, log_directory: Path) -> None:
+    """Run *command* as a subprocess, appending its stdout/stderr to commands_log.txt.
+
+    NOTE(review): the command is passed as a single string without shell=True;
+    that only launches correctly on Windows — on POSIX the whole string would
+    be treated as the program name. Confirm the target platform.
+    """
+    print_blue(f"Running command: \n{command}")
+    with open(log_directory / "commands_log.txt", "a+", encoding="utf-8") as file:
+        subprocess.call(
+            command,
+            stdout=file,
+            stderr=file,
+        )
+        # Trailer is written after the process output so each run is delimited.
+        file.write(
+            f"\nRan command: {command}\nDate/time: {datetime.datetime.now()}\n\n\n\n"
+        )
+
+
+def format_ffmpeg_filepath(path: Path) -> str:
+ r"""Turns C:\Users\dirk\test/subtitle.vtt into C\:\\Users\\dirk\\test\\subtitle.vtt"""
+ string_path = str(path)
+ return string_path.replace("\\", "\\\\").replace("/", "\\\\").replace(":", "\\:")
diff --git a/whisper/utils/openai_api.py b/whisper/utils/openai_api.py
new file mode 100644
index 0000000..7e53b50
--- /dev/null
+++ b/whisper/utils/openai_api.py
@@ -0,0 +1,64 @@
+import typing
+from pathlib import Path
+
+from decouple import config
+from openai import OpenAI
+from tenacity import retry, stop_after_attempt, stop_after_delay
+
+
+CLIENT = OpenAI(api_key=str(config("OPENAI_API_KEY")))
+MODEL = "whisper-1"
+
+ResponseFormat = typing.Literal["text", "srt", "vtt"]
+
+
+def transcribe(
+ file: Path,
+ language: str | None = None,
+ translate: bool = False,
+ response_format: ResponseFormat = "text",
+) -> str:
+ print("Transcribing file...")
+ options = {
+ "file": file,
+ "model": MODEL,
+ "response_format": response_format,
+ }
+
+ if translate:
+ transcript = CLIENT.audio.translations.create(**options)
+ else:
+ if language:
+ options["language"] = language
+ transcript = CLIENT.audio.transcriptions.create(**options)
+
+ if type(transcript) != str:
+ raise TypeError(
+ f"Expected a string value to be returned, but got {type(transcript)} instead."
+ )
+ print(f"Transcription successful:\n{transcript[:100]}...")
+
+ return transcript
+
+
+PROMPT_SETUP = """You are a text-to-quiz app. The user will provide you a video transcription in textual format. You will generate a list of questions for the user to answer about this video. Depending on the length of the transcription, stick to a maximum of 5 questions. All questions should be solely about the video transcription content provided by the user and should be answerable by reading the transcription. Do not provide the answers, but only the questions. The transcription the user provides is based on a video, and may include timestamps, please ignore these timestamps and just treat it as one single transcription containing all the content in the video.
+List and number each item on a separate line.
+"""
+
+
+@retry(stop=stop_after_attempt(3) | stop_after_delay(60))
+def text_to_quiz(text: str) -> str:
+ print("Converting text to quiz...")
+ messages = [
+ {"role": "system", "content": PROMPT_SETUP},
+ {"role": "user", "content": text},
+ ]
+ result = CLIENT.chat.completions.create(
+ model="gpt-3.5-turbo-1106",
+ messages=messages, # type: ignore
+ )
+ content = result.choices[0].message.content
+ if content == None: # Just a quick sanity check
+ raise ValueError("There was an error while trying to generate the quiz.")
+ print(f"Text to quiz conversion completed.")
+ return content
diff --git a/whisper/utils/podcast.py b/whisper/utils/podcast.py
new file mode 100644
index 0000000..e1c02cf
--- /dev/null
+++ b/whisper/utils/podcast.py
@@ -0,0 +1,44 @@
+import re
+import uuid
+from pathlib import Path
+
+import requests
+from decouple import config
+from openai import OpenAI
+
+
+GPT_MODEL = "gpt-3.5-turbo-1106"
+CLIENT = OpenAI(api_key=str(config("OPENAI_API_KEY")))
+
+
+def scrape_link_from_page(page_url: str) -> str:
+ podcast_page = requests.get(page_url).text
+ regex = r"(?P\;https?://[^\s]+)"
+ podcast_url_dirty = re.findall(regex, podcast_page)[0]
+ podcast_url = podcast_url_dirty.split(";")[1]
+ return podcast_url
+
+
+def download(podcast_url: str, unique_id: uuid.UUID, output_dir: Path) -> Path:
+ print("Downloading podcast...")
+ podcast_audio = requests.get(podcast_url)
+ save_location = output_dir / f"{unique_id}.mp3"
+
+ with open(save_location, "wb") as file:
+ file.write(podcast_audio.content)
+ print("Podcast successfully downloaded!")
+
+ return save_location
+
+
+def get_summary(transcription: str) -> str:
+ print("Summarizing podcast...")
+ prompt = f"Summarize the following podcast into the most important points:\n\n{transcription}\n\nSummary:"
+
+ response = CLIENT.chat.completions.create(
+ model=GPT_MODEL, messages=[{"role": "user", "content": prompt}]
+ )
+
+ print("Podcast summarized!")
+ summary = response.choices[0].message.content
+ return summary if summary else "There was a problem generating the summary."
diff --git a/whisper/utils/subtitles.py b/whisper/utils/subtitles.py
new file mode 100644
index 0000000..d335b4d
--- /dev/null
+++ b/whisper/utils/subtitles.py
@@ -0,0 +1,11 @@
+from typing import Callable
+from pathlib import Path
+
+
+def write_to_file(whisper_output: dict, writer: Callable, output_path: Path) -> Path:
+    """Takes the whisper output, a writer function, and an output path, and writes subtitles to disk in the specified format.
+
+    NOTE(review): *writer* is used as an object with a ``write_result`` method
+    (a whisper ``WriteVTT``/``WriteSRT`` instance), not a plain callable —
+    consider tightening the ``Callable`` annotation.
+    """
+    with open(output_path, "w", encoding="utf-8") as sub_file:
+        writer.write_result(result=whisper_output, file=sub_file)
+    print(f"Subtitles generated and saved to {output_path}")
+
+    return output_path
diff --git a/whisper/utils/video.py b/whisper/utils/video.py
new file mode 100644
index 0000000..6f2d1dc
--- /dev/null
+++ b/whisper/utils/video.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+from . import command
+
+
+def to_mp3(
+ input_video: str, log_directory: Path, output_path: Path, mono: bool = False
+) -> str:
+ output_path_string = str(output_path)
+
+ channels = 1 if mono else 2
+ bitrate = 80 if mono else 192
+
+ FFMPEG_PATH = "C:/ffmpeg/" # Update this path as needed
+ command_to_run = f'"{FFMPEG_PATH}" -i "{input_video}" -vn -ar 44100 -ac {channels} -b:a {bitrate}k "{output_path_string}"'
+
+ # command_to_run = f'ffmpeg -i "{input_video}" -vn -ar 44100 -ac {channels} -b:a {bitrate}k "{output_path_string}"'
+ command.run_and_log(command_to_run, log_directory)
+ print(f"Video converted to mp3 and saved to {output_path_string}")
+
+ return output_path_string
+
+# from moviepy.editor import VideoFileClip
+# from pathlib import Path
+
+# def to_mp3(input_video: str, log_directory: Path, output_path: Path, mono: bool = False) -> str:
+# output_path_string = str(output_path)
+
+# video = VideoFileClip(input_video)
+# video.audio.write_audiofile(output_path_string, ffmpeg_params=["-ac", "1" if mono else "2"])
+
+# print(f"Video converted to mp3 and saved to {output_path_string}")
+# return output_path_string
+
+
+
+
+
diff --git a/whisper/video (1080p).mp4 b/whisper/video (1080p).mp4
new file mode 100644
index 0000000..d722e0c
Binary files /dev/null and b/whisper/video (1080p).mp4 differ