Skip to content

Commit

Permalink
Version 0.5 . See CHANGELOG.txt
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronkaplan committed Mar 18, 2024
1 parent 7e74052 commit a8e3c32
Show file tree
Hide file tree
Showing 9 changed files with 137 additions and 30 deletions.
4 changes: 2 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@
"python.linting.pylintEnabled": false,
"python.linting.pycodestyleEnabled": true,
"python.linting.enabled": true,
"python.linting.pycodestyleArgs": ["--max-line-length=128", "--ignore=E501", ]

"python.linting.pycodestyleArgs": ["--max-line-length=128", "--ignore=E501", ],
"python.linting.pylintArgs": [ "--disable=F0401" ]
}
6 changes: 6 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Version 0.5 (2024/3/19)
==========================

* added PDF upload functionality
* cleaned up stuff pylint was complaining about

2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.4
0.5
6 changes: 4 additions & 2 deletions app/auth.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
"""Authorization helper."""

import os

from fastapi import Depends, HTTPException
from fastapi.security import HTTPBasic, HTTPBasicCredentials
Expand All @@ -14,9 +16,9 @@

# dependency to check if the credentials are valid
def get_current_username(credentials: HTTPBasicCredentials = Depends(security)):
    """Return the authenticated username or reject the request.

    Args:
        credentials: the HTTP Basic credentials resolved through the
            ``security`` dependency.

    Returns:
        The username, when it is a key of ``fake_users`` and the supplied
        password matches the value stored for it.

    Raises:
        HTTPException: status 401 ("Invalid credentials") when the user is
            unknown or the password does not match.
    """
    import secrets  # local import: only this function needs it

    username = credentials.username
    password = credentials.password
    if username in fake_users:
        expected = fake_users[username]
        # Compare in constant time so the response time does not leak how
        # many leading characters of the password were correct.
        # Encoding to bytes lets compare_digest accept non-ASCII passwords.
        # A non-str stored value can never equal the str password, so it is
        # rejected just like a mismatch.
        if isinstance(expected, str) and secrets.compare_digest(
                password.encode("utf-8"), expected.encode("utf-8")):
            return username
    raise HTTPException(status_code=401, detail="Invalid credentials")

101 changes: 82 additions & 19 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,38 @@
"""Main FastAPI file. Provides the app WSGI entry point."""
import os
import sys
import json
import tempfile
from urllib.parse import urlparse
from distutils.util import strtobool # pylint: disable=deprecated-module

import requests

import fitz # PyMuPDF

import uvicorn
from fastapi import FastAPI, Request, Form, Depends
from fastapi import FastAPI, Request, Form, Depends, UploadFile, File
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from starlette.middleware.base import BaseHTTPMiddleware
import markdown

from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv, find_dotenv

from summarizer import Summarizer
# from settings import Settings
from auth import get_current_username
from distutils.util import strtobool
from summarizer import Summarizer # pylint: disable=import-error
from auth import get_current_username # pylint: disable=import-error


from settings import log
from settings import log # pylint: disable=import-error


# first get the env parameters
if not load_dotenv(find_dotenv(), verbose=True, override=False): # read local .env file
log.warning("Could not find .env file! Assuming ENV vars work")

try:
VERSION = open('../VERSION.txt', encoding='utf-8').readline().rstrip('\n')
with open('../VERSION.txt', encoding='utf-8') as _f:
VERSION = _f.readline().rstrip('\n')
except Exception as e:
log.error("could not find VERSION.txt, bailing out.")
sys.exit(-1)
Expand All @@ -40,20 +42,22 @@
templates = Jinja2Templates(directory="/templates")
app.mount("/static", StaticFiles(directory="/static"), name="static")
GO_AZURE = False # default
OUTPUT_JSON = bool(strtobool(os.getenv('OUTPUT_JSON', 'false')))
OUTPUT_JSON = bool(strtobool(os.getenv('OUTPUT_JSON', 'false')))
DRY_RUN = bool(strtobool(os.getenv('DRY_RUN', 'false')))
OPENAI_MODEL = os.getenv('OPENAI_MODEL')
OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')


# First detect if we should invoke OpenAI via MS Azure or directly
try:
GO_AZURE = bool(strtobool(os.getenv('USE_MS_AZURE', 'false')))
except Exception as e:
log.warning(f"Could not read 'USE_MS_AZURE' env var. Reason: '{str(e)}'. Reverting to false.")
log.warning(
f"Could not read 'USE_MS_AZURE' env var. Reason: '{str(e)}'. Reverting to false.")
GO_AZURE = False


class HTTPSRedirectMiddleware(BaseHTTPMiddleware):
"""HTTP to HTTPS redirection"""
async def dispatch(self, request: Request, call_next):
if 'X-Forwarded-Proto' in request.headers and request.headers['X-Forwarded-Proto'] == 'https':
request.scope['scheme'] = 'https'
Expand All @@ -63,7 +67,8 @@ async def dispatch(self, request: Request, call_next):

app.add_middleware(HTTPSRedirectMiddleware)

summarizer = Summarizer(go_azure=GO_AZURE, model=OPENAI_MODEL, max_tokens=8192, output_json=OUTPUT_JSON)
summarizer = Summarizer(go_azure=GO_AZURE, model=OPENAI_MODEL,
max_tokens=8192, output_json=OUTPUT_JSON)


async def fetch_text_from_url(url: str) -> str:
Expand All @@ -86,27 +91,85 @@ def get_index(request: Request, username: str = Depends(get_current_username)):
return templates.TemplateResponse("index.html", {"request": request, "system_prompt": os.environ['SYSTEM_PROMPT'], "username": username})


def convert_pdf_to_markdown(filename: str) -> str:
    """Extract the text of every page of a PDF into one markdown string.

    The text of each page is followed by a markdown horizontal rule
    ("---" surrounded by blank lines) that acts as a page break.

    Args:
        filename: path of the PDF file on the filesystem.

    Returns:
        The concatenated page texts, each followed by the page-break
        separator. A document without pages yields an empty string.

    Raises:
        Any exception raised by ``fitz`` while opening or reading the file
        is propagated unchanged; nothing is caught here.
    """
    page_separator = "\n\n---\n\n"

    doc = fitz.open(filename)
    try:
        # Build the result in one pass instead of repeated string "+=".
        return "".join(page.get_text() + page_separator for page in doc)
    finally:
        # Always release the file handle, even when extraction fails.
        doc.close()


# The main POST method. Input can either be a URL or a PDF file or a textarea text
@app.post("/", response_class=HTMLResponse)
async def index(request: Request, text: str = Form(None), url: str = Form(None),
async def index(request: Request, # request object
text: str = Form(None), # the text in the textarea
url: str = Form(None), # alternatively the URL
pdffile: UploadFile = File(None),
system_prompt: str = Form(None), model: str = Form('model'), token_count: int = Form(100),
username: str = Depends(get_current_username)):
"""HTTP POST method for the default page. This gets called when the user already HTTP POSTs a text which should be summarized."""

if not url and not text:
error = "Expected either url field or text field. Please specify one at least."
if url:
log.warning(f"Got request with url: {url[:20]}")
elif pdffile:
log.warning(f"Got request with pdffile: {pdffile.filename}")
elif text:
log.warning(f"Got request with text: {text[:100]}")
else:
log.error("no pdffile, no text, no url. Bailing out.")
error = "Expected either url field or text field or a PDF file. Please specify one at least."
result = None
return templates.TemplateResponse("index.html", {"request": request, "text": text, "system_prompt": system_prompt, "result": error, "success": False, "username": username}, status_code=400)

summarizer.model = model
summarizer.max_tokens = token_count

if url:
# go and fetch it
try:
text = await fetch_text_from_url(url)
except Exception as ex:
return templates.TemplateResponse("index.html", {"request": request, "text": url, "system_prompt": system_prompt, "result": f"Could not fetch URL. Reason {str(ex)}", "success": False}, status_code=400)

elif pdffile:
log.warning("we got a pdffile")
try:
suffix = ".pdf"
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
tmp.write(pdffile.file.read())
tmp_pdf_path = tmp.name # Temp file path
log.warning(f"stored as {tmp_pdf_path}")

# Convert PDF to Markdown
text = convert_pdf_to_markdown(tmp_pdf_path)
log.warning(f"converted as {text[:100]}")

# Cleanup the temporary file
os.unlink(tmp_pdf_path)
except Exception as ex:
return templates.TemplateResponse("index.html", {"request": request, "text": text, "system_prompt": system_prompt, "result": f"Could not process the PDF file. Reason {str(ex)}", "success": False}, status_code=400)

# we got the text from the URL or the pdffile was converted... now check if we should actually summarize
if DRY_RUN:
result = "This is a sample response, we are in dry-run mode. We don't want to waste money for querying the API."
error = None
Expand Down
2 changes: 2 additions & 0 deletions app/settings.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""General settings config."""

import logging


Expand Down
5 changes: 3 additions & 2 deletions app/summarizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""The summarizer class, abstracting away the LLM."""
import os
from typing import Tuple

import openai
from openai import AzureOpenAI

from settings import log
from settings import log # pylint: disable=import-error

# first get the env parameters
from dotenv import load_dotenv, find_dotenv
Expand Down Expand Up @@ -90,7 +91,7 @@ def summarize(self, text: str, system_prompt: str = "") -> Tuple[str, str]:
max_tokens=self.max_tokens,
response_format=response_format,
n=1)

log.debug(f"Full Response (OpenAI): {response}")
log.debug(f"response.choices[0].text: {response.choices[0].message}")
log.debug(response.model_dump_json(indent=2))
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ beautifulsoup4
python-dotenv
pytest
pytest-cov
pymupdf
python-multipart
39 changes: 35 additions & 4 deletions templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,8 @@ <h3>Summarize this</h3>
</center>
</div>
<div class="container is-fluid">
<form action="/" method="post">
<form action="/" method="post" enctype="multipart/form-data">
<!-- Limitations checkboxes -->
<div class="is-size-7">
<input type="checkbox" id="accepted_tlp" name="accepted_tlp" onclick="enableSubmit()" />
&nbsp;
Expand All @@ -150,12 +151,13 @@ <h3>Summarize this</h3>
<label for="accepted_need_to_fact_check">I know that I will need to <b>fact-check the generated
report</b></label>
</p>
<!-- system prompt -->
<hr />
<!-- / Limitations checkboxes -->
<!-- system prompt -->
<hr />
</div>
<div class="field">
<p class="control has-icons-right has-icons-right">
<label class="label" for="system-prompt">Base instructions: </label>
<label class="label" for="system-prompt">Base prompt: </label>
</p>
&nbsp;
<p />
Expand All @@ -178,13 +180,42 @@ <h3>Summarize this</h3>
</p>
&nbsp;
<p />
<!-- URL -->
<div class="field">
<input class="is-size-7" type="url" id="url" name="url" style="color: #4c516d; width: 100%;"
placeholder="Enter URL" oninput="clearTextarea()">{% if url %}{{ url }}{% endif %}</input>
<p />
&nbsp;
<p />
</div>
<!-- PDF upload -->
<div class="field">
<label class="label">... or upload a PDF</label>
<div class="file is-boxed">
<label class="file-label">
<input class="file-input" type="file" id="pdffile" name="pdffile" accept="application/pdf">
<span class="file-cta">
<span class="file-icon">
<i class="fas fa-upload"></i>
</span>
<span class="file-label">
Choose a PDF…
</span>
</span>
</label>
</div>
</div>
<!-- <div class="field">
<div class="control">
<button type="submit" class="button is-primary">Upload</button>
</div>
-->
<p />
&nbsp;
<p />
</div>
<!-- / PDF upload -->
<!-- or text... -->
<div class="field">
<!-- or ... -->
<label class="label" for="text">... or this text:</label>
Expand Down

0 comments on commit a8e3c32

Please sign in to comment.