diff --git a/.gitignore b/.gitignore index 3038f7f..8280dc4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,4 @@ venv .venv **.pyc -downloads/ \ No newline at end of file +downloads/.DS_Store diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..46b5140 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +scribewizard diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..4a4fe55 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Streamlit", + "type": "python", + "request": "launch", + "module": "streamlit", + "args": [ + "run", + "main.py" + ], + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..db54601 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,22 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "Run Streamlit", + "type": "shell", + "command": "python3", + "args": [ + "-m", + "streamlit", + "run", + "main.py" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "problemMatcher": [], + "detail": "Runs the Streamlit app using the Python module command." + } + ] +} \ No newline at end of file diff --git a/download.py b/download.py index 847ae5e..9a59cb6 100644 --- a/download.py +++ b/download.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import re import yt_dlp as youtube_dl import os import time @@ -48,6 +49,21 @@ def get_ydl_opts(external_logger=lambda x: None): } +def resample_mp3(filename, sampling_rate): + # rename the mp3 file to replace spaces and dots + escaped_filename = filename.replace(' ', r'\ ') + out_filename = './' + filename.replace(' ', '_').replace('.', '-')[2:].replace('-mp3', f'_resampled_{sampling_rate}k.mp3') + escaped_out_filename = out_filename.replace(' ', r'\ ') + + print(f"Resampling {filename} to {sampling_rate} to target size 25 MB limit") + os.system(f"ffmpeg -i {escaped_filename} -b:a {sampling_rate}k -bufsize {sampling_rate}k -f mp3 {escaped_out_filename}") + new_filesize = os.path.getsize(out_filename) + print(f"Resampled {out_filename} to size {int(new_filesize / (1024*1024))}MB") + # TODO: uncomment this line + # os.remove(filename) + return new_filesize, out_filename + + def download_video_audio(url, external_logger=lambda x: None): retries = 0 while retries < max_retries: @@ -57,12 +73,14 @@ def download_video_audio(url, external_logger=lambda x: None): print("Going to download ", url) info = ydl.extract_info(url, download=False) filesize = info.get("filesize", 0) - if filesize > MAX_FILE_SIZE: - raise Exception(FILE_TOO_LARGE_MESSAGE) filename = ydl.prepare_filename(info) res = ydl.download([url]) print("youtube-dl result :", res) mp3_filename = os.path.splitext(filename)[0] + '.mp3' + if filesize > MAX_FILE_SIZE: + filesize, mp3_filename = resample_mp3(mp3_filename, 48) + if filesize > MAX_FILE_SIZE: + raise Exception(FILE_TOO_LARGE_MESSAGE) print('mp3 file name - ', mp3_filename) return mp3_filename except Exception as e: diff --git a/main.py b/main.py index 345bd4d..612cf52 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ +import re +import pyperclip import streamlit as st from groq import Groq import json @@ -6,6 +8,7 @@ from md2pdf.core import md2pdf from dotenv import load_dotenv from download import download_video_audio, delete_download +from youtube_transcript_api import YouTubeTranscriptApi load_dotenv() @@ -183,6 +186,7 @@ def transcribe_audio(audio_file): results = transcription.text return results + def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"): """ Returns notes structure content as well as total tokens and total time for generation. @@ -202,11 +206,11 @@ def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"): messages=[ { "role": "system", - "content": "Write in JSON format:\n\n{\"Title of section goes here\":\"Description of section goes here\",\"Title of section goes here\":\"Description of section goes here\",\"Title of section goes here\":\"Description of section goes here\"}" + "content": "Write the response strictly in JSON format with no additional text or bullet points. Structure it like this:\n\n{\"Section Title\":\"Description\", \"Section Title\":\"Description\", \"Section Title\":\"Description\"}. DONT include information from the example, but ONLY from the TRANSCRIPT." }, { "role": "user", - "content": f"### Transcript {transcript}\n\n### Example\n\n{shot_example}### Instructions\n\nCreate a structure for comprehensive notes on the above transcribed audio. Section titles and content descriptions must be comprehensive. Quality over quantity." + "content": f"### Transcript {transcript}\n\n### Example\n\n{shot_example}### Instructions\n\nCreate a structure for comprehensive notes on the above transcribed audio. Section titles and content descriptions must be concise and formatted as a valid JSON object." } ], temperature=0.3, @@ -222,6 +226,7 @@ def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"): return statistics_to_return, completion.choices[0].message.content + def generate_section(transcript: str, existing_notes: str, section: str, model: str = "llama3-8b-8192"): stream = st.session_state.groq.chat.completions.create( model=model, @@ -276,22 +281,21 @@ def enable(): def empty_st(): st.empty() + +def get_youtube_captions(youtube_link): + pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*' + match = re.search(pattern, youtube_link) + if match: + video_id = match.group(1) + transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=('en', 'pt')) + return ' '.join([entry['text'] for entry in transcript]) + else: + return None + + try: with st.sidebar: - audio_files = { - "Transformers Explained by Google Cloud Tech": { - "file_path": "assets/audio/transformers_explained.m4a", - "youtube_link": "https://www.youtube.com/watch?v=SZorAJ4I-sA" - }, - "The Essence of Calculus by 3Blue1Brown": { - "file_path": "assets/audio/essence_calculus.m4a", - "youtube_link": "https://www.youtube.com/watch?v=WUvTyaaNkzM" - }, - "First 20 minutes of Groq's AMA": { - "file_path": "assets/audio/groq_ama_trimmed_20min.m4a", - "youtube_link": "https://www.youtube.com/watch?v=UztfweS-7MU" - } - } + audio_files = {} st.write(f"# 🧙‍♂️ ScribeWizard \n## Generate notes from audio in seconds using Groq, Whisper, and Llama3") st.markdown(f"[Github Repository](https://github.com/bklieger/scribewizard)\n\nAs with all generative AI, content may include inaccurate or placeholder information. ScribeWizard is in beta and all feedback is welcome!") @@ -323,9 +327,9 @@ def empty_st(): st.write("# Customization Settings\n🧪 These settings are experimental.\n") st.write(f"By default, ScribeWizard uses Llama3-70b for generating the notes outline and Llama3-8b for the content. This balances quality with speed and rate limit usage. You can customize these selections below.") - outline_model_options = ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"] + outline_model_options = ["llama-3.1-70b-versatile", "llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"] outline_selected_model = st.selectbox("Outline generation:", outline_model_options) - content_model_options = ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it", "gemma2-9b-it"] + content_model_options = ["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it", "gemma2-9b-it"] content_selected_model = st.selectbox("Content generation:", content_model_options) @@ -357,10 +361,16 @@ def empty_st(): else: raise ValueError("Please generate content first before downloading the notes.") - input_method = st.radio("Choose input method:", ["Upload audio file", "YouTube link"]) + input_method = st.radio("Choose input method:", ["YouTube link", "Upload audio file"]) audio_file = None youtube_link = None groq_input_key = None + + def copy_to_clipboard(): + content = st.session_state.notes.get_markdown_content() + pyperclip.copy(content) + st.success("Content copied to clipboard!") + with st.form("groqform"): if not GROQ_API_KEY: groq_input_key = st.text_input("Enter your Groq API Key (gsk_yA...):", "", type="password") @@ -412,47 +422,60 @@ def display_statistics(): # Show temporary message before transcription is generated and statistics show audio_file_path = None + transcription_text = None if input_method == "YouTube link": - display_status("Downloading audio from YouTube link ....") - audio_file_path = download_video_audio(youtube_link, display_download_status) - if audio_file_path is None: - st.error("Failed to download audio from YouTube link. Please try again.") - enable() - clear_status() - else: - # Read the downloaded file and create a file-like objec - display_status("Processing Youtube audio ....") - with open(audio_file_path, 'rb') as f: - file_contents = f.read() - audio_file = BytesIO(file_contents) + if True: + # try to download the captions (auto captions included) from youtube + # to avoid the need of whisper transcriptions + transcription_text = get_youtube_captions(youtube_link) + + # if captions are not available, download the video and transcribe its audio + if not transcription_text: + display_status("Downloading audio from YouTube link ....") + audio_file_path = download_video_audio(youtube_link, display_download_status) + if audio_file_path is None: + st.error("Failed to download audio from YouTube link. Please try again.") + enable() + clear_status() + else: + # Read the downloaded file and create a file-like objec + display_status("Processing Youtube audio ....") + with open(audio_file_path, 'rb') as f: + file_contents = f.read() + audio_file = BytesIO(file_contents) - # Check size first to ensure will work with Whisper - if os.path.getsize(audio_file_path) > MAX_FILE_SIZE: - raise ValueError(FILE_TOO_LARGE_MESSAGE) + # Check size first to ensure will work with Whisper + if os.path.getsize(audio_file_path) > MAX_FILE_SIZE: + raise ValueError(FILE_TOO_LARGE_MESSAGE) - audio_file.name = os.path.basename(audio_file_path) # Set the file name - delete_download(audio_file_path) - clear_download_status() + audio_file.name = os.path.basename(audio_file_path) # Set the file name + # TODO: uncomment this line + # delete_download(audio_file_path) + clear_download_status() if not GROQ_API_KEY: st.session_state.groq = Groq(api_key=groq_input_key) - display_status("Transcribing audio in background....") - transcription_text = transcribe_audio(audio_file) + if not transcription_text: + display_status("Transcribing audio in background....") + print("Transcribing audio in background....") + transcription_text = transcribe_audio(audio_file) + else: + print("Transcription already exists (from YouTube captions)") display_statistics() - display_status("Generating notes structure....") + print("Generating notes structure....") large_model_generation_statistics, notes_structure = generate_notes_structure(transcription_text, model=str(outline_selected_model)) - print("Structure: ",notes_structure) + print("Structure: ", notes_structure) display_status("Generating notes ...") + print("Generating notes ...") total_generation_statistics = GenerationStatistics(model_name=str(content_selected_model)) clear_status() - try: notes_structure_json = json.loads(notes_structure) notes = NoteSection(structure=notes_structure_json,transcript=transcription_text) @@ -480,11 +503,16 @@ def stream_section_content(sections): stream_section_content(content) stream_section_content(notes_structure_json) + except json.JSONDecodeError: st.error("Failed to decode the notes structure. Please try again.") enable() + if st.button('Copy'): + copy_to_clipboard() + + except Exception as e: st.session_state.button_disabled = False @@ -499,4 +527,4 @@ def stream_section_content(sections): # Remove audio after exception to prevent data storage leak if audio_file_path is not None: - delete_download(audio_file_path) \ No newline at end of file + delete_download(audio_file_path) diff --git a/requirements.txt b/requirements.txt index 9149a8e..15b3bea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -72,3 +72,5 @@ webencodings==0.5.1 websockets==12.0 yt-dlp @ https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz zopfli==0.2.3 +youtube_transcript_api +pyperclip diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..5027b53 --- /dev/null +++ b/start.sh @@ -0,0 +1 @@ +python3 -m streamlit run main.py