Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Transcription with YouTube Captions, Audio Resampling, and Copy Button #28

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
venv
.venv
**.pyc
downloads/
downloads/.DS_Store
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
scribewizard
19 changes: 19 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Streamlit",
"type": "python",
"request": "launch",
"module": "streamlit",
"args": [
"run",
"main.py"
],
"console": "integratedTerminal"
}
]
}
22 changes: 22 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "Run Streamlit",
"type": "shell",
"command": "python3",
"args": [
"-m",
"streamlit",
"run",
"main.py"
],
"group": {
"kind": "build",
"isDefault": true
},
"problemMatcher": [],
"detail": "Runs the Streamlit app using the Python module command."
}
]
}
22 changes: 20 additions & 2 deletions download.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import unicode_literals
import re
import yt_dlp as youtube_dl
import os
import time
Expand Down Expand Up @@ -48,6 +49,21 @@ def get_ydl_opts(external_logger=lambda x: None):
}


def resample_mp3(filename, sampling_rate):
    """Re-encode an MP3 at a lower audio bitrate to shrink it under the size limit.

    Despite the parameter name, ``sampling_rate`` is handed to ffmpeg as the
    audio *bitrate* in kbit/s (``-b:a <sampling_rate>k``), not as a sample rate.

    The output file is written next to the input. Its name is the input's
    base name with spaces replaced by ``_`` and dots by ``-``, followed by
    ``_resampled_<sampling_rate>k.mp3``.

    Args:
        filename: Path of the source MP3 file.
        sampling_rate: Target audio bitrate in kbit/s (e.g. ``48``).

    Returns:
        A ``(new_filesize, out_filename)`` tuple: the size in bytes of the
        re-encoded file and its path.

    Raises:
        FileNotFoundError: If ``filename`` does not exist, or the ``ffmpeg``
            executable cannot be found.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # Local import so this block stays self-contained within the module.
    import subprocess

    if not os.path.isfile(filename):
        raise FileNotFoundError(f"Cannot resample missing file: {filename}")

    # Only the base name is sanitised; the directory part is kept verbatim so
    # the output lands in a directory that actually exists.
    directory, basename = os.path.split(filename)
    stem, _extension = os.path.splitext(basename)
    safe_stem = stem.replace(' ', '_').replace('.', '-')
    out_filename = os.path.join(directory, f"{safe_stem}_resampled_{sampling_rate}k.mp3")

    print(f"Resampling {filename} to {sampling_rate} to target size 25 MB limit")
    # Argument list + no shell: titles containing quotes, '&', ';', '$' etc.
    # can neither break nor inject into the command line.
    command = [
        "ffmpeg",
        "-y",  # overwrite a stale output instead of blocking on a prompt
        "-i", filename,
        "-b:a", f"{sampling_rate}k",
        "-bufsize", f"{sampling_rate}k",
        "-f", "mp3",
        out_filename,
    ]
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        raise RuntimeError(
            f"ffmpeg failed with exit code {completed.returncode} while resampling {filename}"
        )

    new_filesize = os.path.getsize(out_filename)
    print(f"Resampled {out_filename} to size {int(new_filesize / (1024*1024))}MB")
    # TODO: uncomment this line
    # os.remove(filename)
    return new_filesize, out_filename


def download_video_audio(url, external_logger=lambda x: None):
retries = 0
while retries < max_retries:
Expand All @@ -57,12 +73,14 @@ def download_video_audio(url, external_logger=lambda x: None):
print("Going to download ", url)
info = ydl.extract_info(url, download=False)
filesize = info.get("filesize", 0)
if filesize > MAX_FILE_SIZE:
raise Exception(FILE_TOO_LARGE_MESSAGE)
filename = ydl.prepare_filename(info)
res = ydl.download([url])
print("youtube-dl result :", res)
mp3_filename = os.path.splitext(filename)[0] + '.mp3'
if filesize > MAX_FILE_SIZE:
filesize, mp3_filename = resample_mp3(mp3_filename, 48)
if filesize > MAX_FILE_SIZE:
raise Exception(FILE_TOO_LARGE_MESSAGE)
print('mp3 file name - ', mp3_filename)
return mp3_filename
except Exception as e:
Expand Down
114 changes: 71 additions & 43 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re
import pyperclip
import streamlit as st
from groq import Groq
import json
Expand All @@ -6,6 +8,7 @@
from md2pdf.core import md2pdf
from dotenv import load_dotenv
from download import download_video_audio, delete_download
from youtube_transcript_api import YouTubeTranscriptApi

load_dotenv()

Expand Down Expand Up @@ -183,6 +186,7 @@ def transcribe_audio(audio_file):
results = transcription.text
return results


def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"):
"""
Returns notes structure content as well as total tokens and total time for generation.
Expand All @@ -202,11 +206,11 @@ def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"):
messages=[
{
"role": "system",
"content": "Write in JSON format:\n\n{\"Title of section goes here\":\"Description of section goes here\",\"Title of section goes here\":\"Description of section goes here\",\"Title of section goes here\":\"Description of section goes here\"}"
"content": "Write the response strictly in JSON format with no additional text or bullet points. Structure it like this:\n\n{\"Section Title\":\"Description\", \"Section Title\":\"Description\", \"Section Title\":\"Description\"}. DONT include information from the example, but ONLY from the TRANSCRIPT."
},
{
"role": "user",
"content": f"### Transcript {transcript}\n\n### Example\n\n{shot_example}### Instructions\n\nCreate a structure for comprehensive notes on the above transcribed audio. Section titles and content descriptions must be comprehensive. Quality over quantity."
"content": f"### Transcript {transcript}\n\n### Example\n\n{shot_example}### Instructions\n\nCreate a structure for comprehensive notes on the above transcribed audio. Section titles and content descriptions must be concise and formatted as a valid JSON object."
}
],
temperature=0.3,
Expand All @@ -222,6 +226,7 @@ def generate_notes_structure(transcript: str, model: str = "llama3-70b-8192"):

return statistics_to_return, completion.choices[0].message.content


def generate_section(transcript: str, existing_notes: str, section: str, model: str = "llama3-8b-8192"):
stream = st.session_state.groq.chat.completions.create(
model=model,
Expand Down Expand Up @@ -276,22 +281,21 @@ def enable():
def empty_st():
st.empty()


def get_youtube_captions(youtube_link, languages=('en', 'pt')):
    """Return the captions of a YouTube video as one space-joined string.

    The 11-character video id is extracted from ``youtube_link`` (the text
    following ``v=`` or a ``/``) and its transcript is requested through
    ``YouTubeTranscriptApi.get_transcript``.

    Fetching captions is best-effort: any failure is reported on stdout and
    turned into ``None`` so the caller can fall back to downloading the audio
    and transcribing it instead of aborting.

    Args:
        youtube_link: URL (or URL fragment) that contains the video id.
        languages: Language codes forwarded to ``get_transcript``.

    Returns:
        The caption text joined with single spaces, or ``None`` when the link
        is empty, no video id can be found, or the transcript request fails.
    """
    if not youtube_link:
        return None

    match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11})', youtube_link)
    if not match:
        return None
    video_id = match.group(1)

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=languages)
    except Exception as error:
        # Deliberately broad: whatever goes wrong while fetching captions,
        # the caller's fallback path should still get a chance to run.
        print(f"Could not fetch YouTube captions for {video_id}: {error}")
        return None

    return ' '.join(entry['text'] for entry in transcript)


try:
with st.sidebar:
audio_files = {
"Transformers Explained by Google Cloud Tech": {
"file_path": "assets/audio/transformers_explained.m4a",
"youtube_link": "https://www.youtube.com/watch?v=SZorAJ4I-sA"
},
"The Essence of Calculus by 3Blue1Brown": {
"file_path": "assets/audio/essence_calculus.m4a",
"youtube_link": "https://www.youtube.com/watch?v=WUvTyaaNkzM"
},
"First 20 minutes of Groq's AMA": {
"file_path": "assets/audio/groq_ama_trimmed_20min.m4a",
"youtube_link": "https://www.youtube.com/watch?v=UztfweS-7MU"
}
}
audio_files = {}

st.write(f"# 🧙‍♂️ ScribeWizard \n## Generate notes from audio in seconds using Groq, Whisper, and Llama3")
st.markdown(f"[Github Repository](https://github.com/bklieger/scribewizard)\n\nAs with all generative AI, content may include inaccurate or placeholder information. ScribeWizard is in beta and all feedback is welcome!")
Expand Down Expand Up @@ -323,9 +327,9 @@ def empty_st():

st.write("# Customization Settings\n🧪 These settings are experimental.\n")
st.write(f"By default, ScribeWizard uses Llama3-70b for generating the notes outline and Llama3-8b for the content. This balances quality with speed and rate limit usage. You can customize these selections below.")
outline_model_options = ["llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"]
outline_model_options = ["llama-3.1-70b-versatile", "llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it"]
outline_selected_model = st.selectbox("Outline generation:", outline_model_options)
content_model_options = ["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it", "gemma2-9b-it"]
content_model_options = ["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it", "gemma2-9b-it"]
content_selected_model = st.selectbox("Content generation:", content_model_options)


Expand Down Expand Up @@ -357,10 +361,16 @@ def empty_st():
else:
raise ValueError("Please generate content first before downloading the notes.")

input_method = st.radio("Choose input method:", ["Upload audio file", "YouTube link"])
input_method = st.radio("Choose input method:", ["YouTube link", "Upload audio file"])
audio_file = None
youtube_link = None
groq_input_key = None

def copy_to_clipboard():
    """Copy the generated notes, rendered as Markdown, to the clipboard.

    Reads the notes object from ``st.session_state.notes`` and reports the
    outcome in the Streamlit UI: an error when no notes exist yet or the
    clipboard is unavailable, a success message otherwise.

    NOTE(review): pyperclip presumably writes to the clipboard of the machine
    running this script, which for a remotely hosted app would be the server
    rather than the user's browser — confirm this is the intended behaviour.
    """
    if "notes" not in st.session_state:
        st.error("Please generate content first before copying the notes.")
        return

    content = st.session_state.notes.get_markdown_content()
    try:
        pyperclip.copy(content)
    except pyperclip.PyperclipException as error:
        # Raised by pyperclip when no copy/paste mechanism is available.
        st.error(f"Could not copy to clipboard: {error}")
        return
    st.success("Content copied to clipboard!")

with st.form("groqform"):
if not GROQ_API_KEY:
groq_input_key = st.text_input("Enter your Groq API Key (gsk_yA...):", "", type="password")
Expand Down Expand Up @@ -412,47 +422,60 @@ def display_statistics():
# Show temporary message before transcription is generated and statistics show

audio_file_path = None
transcription_text = None

if input_method == "YouTube link":
display_status("Downloading audio from YouTube link ....")
audio_file_path = download_video_audio(youtube_link, display_download_status)
if audio_file_path is None:
st.error("Failed to download audio from YouTube link. Please try again.")
enable()
clear_status()
else:
# Read the downloaded file and create a file-like object
display_status("Processing Youtube audio ....")
with open(audio_file_path, 'rb') as f:
file_contents = f.read()
audio_file = BytesIO(file_contents)
if True:
# try to download the captions (auto captions included) from youtube
# to avoid the need of whisper transcriptions
transcription_text = get_youtube_captions(youtube_link)

# if captions are not available, download the video and transcribe its audio
if not transcription_text:
display_status("Downloading audio from YouTube link ....")
audio_file_path = download_video_audio(youtube_link, display_download_status)
if audio_file_path is None:
st.error("Failed to download audio from YouTube link. Please try again.")
enable()
clear_status()
else:
# Read the downloaded file and create a file-like object
display_status("Processing Youtube audio ....")
with open(audio_file_path, 'rb') as f:
file_contents = f.read()
audio_file = BytesIO(file_contents)

# Check size first to ensure will work with Whisper
if os.path.getsize(audio_file_path) > MAX_FILE_SIZE:
raise ValueError(FILE_TOO_LARGE_MESSAGE)
# Check size first to ensure will work with Whisper
if os.path.getsize(audio_file_path) > MAX_FILE_SIZE:
raise ValueError(FILE_TOO_LARGE_MESSAGE)

audio_file.name = os.path.basename(audio_file_path) # Set the file name
delete_download(audio_file_path)
clear_download_status()
audio_file.name = os.path.basename(audio_file_path) # Set the file name
# TODO: uncomment this line
# delete_download(audio_file_path)
clear_download_status()

if not GROQ_API_KEY:
st.session_state.groq = Groq(api_key=groq_input_key)

display_status("Transcribing audio in background....")
transcription_text = transcribe_audio(audio_file)
if not transcription_text:
display_status("Transcribing audio in background....")
print("Transcribing audio in background....")
transcription_text = transcribe_audio(audio_file)
else:
print("Transcription already exists (from YouTube captions)")

display_statistics()


display_status("Generating notes structure....")
print("Generating notes structure....")
large_model_generation_statistics, notes_structure = generate_notes_structure(transcription_text, model=str(outline_selected_model))
print("Structure: ",notes_structure)
print("Structure: ", notes_structure)

display_status("Generating notes ...")
print("Generating notes ...")
total_generation_statistics = GenerationStatistics(model_name=str(content_selected_model))
clear_status()


try:
notes_structure_json = json.loads(notes_structure)
notes = NoteSection(structure=notes_structure_json,transcript=transcription_text)
Expand Down Expand Up @@ -480,11 +503,16 @@ def stream_section_content(sections):
stream_section_content(content)

stream_section_content(notes_structure_json)

except json.JSONDecodeError:
st.error("Failed to decode the notes structure. Please try again.")

enable()

if st.button('Copy'):
copy_to_clipboard()


except Exception as e:
st.session_state.button_disabled = False

Expand All @@ -499,4 +527,4 @@ def stream_section_content(sections):

# Remove audio after exception to prevent data storage leak
if audio_file_path is not None:
delete_download(audio_file_path)
delete_download(audio_file_path)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,5 @@ webencodings==0.5.1
websockets==12.0
yt-dlp @ https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
zopfli==0.2.3
youtube_transcript_api
pyperclip
1 change: 1 addition & 0 deletions start.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
python3 -m streamlit run main.py