From a8e3c323cfcfcb8146e103cafa2b6e109aabacbc Mon Sep 17 00:00:00 2001 From: Aaron Kaplan Date: Mon, 18 Mar 2024 23:02:48 +0000 Subject: [PATCH] Version 0.5 . See CHANGELOG.txt --- .vscode/settings.json | 4 +- CHANGELOG.txt | 6 +++ VERSION.txt | 2 +- app/auth.py | 6 ++- app/main.py | 101 ++++++++++++++++++++++++++++++++++-------- app/settings.py | 2 + app/summarizer.py | 5 ++- requirements.txt | 2 + templates/index.html | 39 ++++++++++++++-- 9 files changed, 137 insertions(+), 30 deletions(-) create mode 100644 CHANGELOG.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 9edd824..b4d46fc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,6 +3,6 @@ "python.linting.pylintEnabled": false, "python.linting.pycodestyleEnabled": true, "python.linting.enabled": true, - "python.linting.pycodestyleArgs": ["--max-line-length=128", "--ignore=E501", ] - + "python.linting.pycodestyleArgs": ["--max-line-length=128", "--ignore=E501", ], + "python.linting.pylintArgs": [ "--disable=F0401" ] } \ No newline at end of file diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 0000000..495b7d1 --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,6 @@ +Version 0.5 (204/3/19) +========================== + +* added PDF upload functionality +* cleaned up stuff pylint was complaining about + diff --git a/VERSION.txt b/VERSION.txt index bd73f47..2eb3c4f 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -0.4 +0.5 diff --git a/app/auth.py b/app/auth.py index 4f77046..ef95596 100644 --- a/app/auth.py +++ b/app/auth.py @@ -1,4 +1,6 @@ -import os +"""Authorization helper.""" + +import os from fastapi import Depends, HTTPException from fastapi.security import HTTPBasic, HTTPBasicCredentials @@ -14,9 +16,9 @@ # dependency to check if the credentials are valid def get_current_username(credentials: HTTPBasicCredentials = Depends(security)): + """Check if user in the allowed list""" username = credentials.username password = credentials.password if username in fake_users and password == fake_users[username]: return username raise HTTPException(status_code=401, detail="Invalid credentials") - diff --git a/app/main.py b/app/main.py index 7dc237e..eba2ec9 100644 --- a/app/main.py +++ b/app/main.py @@ -1,28 +1,29 @@ """Main FastAPI file. Provides the app WSGI entry point.""" import os import sys -import json +import tempfile +from urllib.parse import urlparse +from distutils.util import strtobool # pylint: disable=deprecated-module + +import requests + +import fitz # PyMuPDF import uvicorn -from fastapi import FastAPI, Request, Form, Depends +from fastapi import FastAPI, Request, Form, Depends, UploadFile, File from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates from fastapi.staticfiles import StaticFiles from starlette.middleware.base import BaseHTTPMiddleware import markdown -from urllib.parse import urlparse -import requests from bs4 import BeautifulSoup from dotenv import load_dotenv, find_dotenv -from summarizer import Summarizer -# from settings import Settings -from auth import get_current_username -from distutils.util import strtobool +from summarizer import Summarizer # pylint: ignore=import-error +from auth import get_current_username # pylint: ignore=import-error - -from settings import log +from settings import log # pylint: ignore=import-error # first get the env parametting @@ -30,7 +31,8 @@ log.warning("Could not find .env file! Assuming ENV vars work") try: - VERSION = open('../VERSION.txt', encoding='utf-8').readline().rstrip('\n') + with open('../VERSION.txt', encoding='utf-8') as _f: + VERSION = _f.readline().rstrip('\n') except Exception as e: log.error("could not find VERSION.txt, bailing out.") sys.exit(-1) @@ -40,20 +42,22 @@ templates = Jinja2Templates(directory="/templates") app.mount("/static", StaticFiles(directory="/static"), name="static") GO_AZURE = False # default -OUTPUT_JSON = bool(strtobool(os.getenv('OUTPUT_JSON', 'false'))) +OUTPUT_JSON = bool(strtobool(os.getenv('OUTPUT_JSON', 'false'))) DRY_RUN = bool(strtobool(os.getenv('DRY_RUN', 'false'))) -OPENAI_MODEL = os.getenv('OPENAI_MODEL') +OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo') # First detect if we should invoke OpenAI via MS Azure or directly try: GO_AZURE = bool(strtobool(os.getenv('USE_MS_AZURE', 'false'))) except Exception as e: - log.warning(f"Could not read 'USE_MS_AZURE' env var. Reason: '{str(e)}'. Reverting to false.") + log.warning( + f"Could not read 'USE_MS_AZURE' env var. Reason: '{str(e)}'. Reverting to false.") GO_AZURE = False class HTTPSRedirectMiddleware(BaseHTTPMiddleware): + """HTTP to HTTPS redirection""" async def dispatch(self, request: Request, call_next): if 'X-Forwarded-Proto' in request.headers and request.headers['X-Forwarded-Proto'] == 'https': request.scope['scheme'] = 'https' @@ -63,7 +67,8 @@ async def dispatch(self, request: Request, call_next): app.add_middleware(HTTPSRedirectMiddleware) -summarizer = Summarizer(go_azure=GO_AZURE, model=OPENAI_MODEL, max_tokens=8192, output_json=OUTPUT_JSON) +summarizer = Summarizer(go_azure=GO_AZURE, model=OPENAI_MODEL, + max_tokens=8192, output_json=OUTPUT_JSON) async def fetch_text_from_url(url: str) -> str: @@ -86,14 +91,54 @@ def get_index(request: Request, username: str = Depends(get_current_username)): return templates.TemplateResponse("index.html", {"request": request, "system_prompt": os.environ['SYSTEM_PROMPT'], "username": username}) +def convert_pdf_to_markdown(filename: str) -> str: + """Convert a PDF file given by to markdown. + + Args: + filename: str the file on the filesystem + + Returns: + markdown or "" empty string in case of error + """ + # Open the PDF file + doc = fitz.open(filename) + + # Initialize a variable to hold the text + markdown_content = "" + + # Iterate through each page of the PDF + for page_num in range(len(doc)): + # Get the page + page = doc.load_page(page_num) + + # Extract text from the page + text = page.get_text() + + # Add the text to our markdown content, followed by a page break + markdown_content += text + "\n\n---\n\n" + + return markdown_content + + +# The main POST method. Input can either be a URL or a PDF file or a textarea text @app.post("/", response_class=HTMLResponse) -async def index(request: Request, text: str = Form(None), url: str = Form(None), +async def index(request: Request, # request object + text: str = Form(None), # the text in the textarea + url: str = Form(None), # alternatively the URL + pdffile: UploadFile = File(None), system_prompt: str = Form(None), model: str = Form('model'), token_count: int = Form(100), username: str = Depends(get_current_username)): """HTTP POST method for the default page. This gets called when the user already HTTP POSTs a text which should be summarized.""" - if not url and not text: - error = "Expected either url field or text field. Please specify one at least." + if url: + log.warning(f"Got request with url: {url[:20]}") + elif pdffile: + log.warning(f"Got request with pdffile: {pdffile.filename}") + elif text: + log.warning(f"Got request with text: {text[:100]}") + else: + log.error("no pdffile, no text, no url. Bailing out.") + error = "Expected either url field or text field or a PDF file. Please specify one at least." result = None return templates.TemplateResponse("index.html", {"request": request, "text": text, "system_prompt": system_prompt, "result": error, "success": False, "username": username}, status_code=400) @@ -101,12 +146,30 @@ async def index(request: Request, text: str = Form(None), url: str = Form(None), summarizer.max_tokens = token_count if url: - # go and fetch it try: text = await fetch_text_from_url(url) except Exception as ex: return templates.TemplateResponse("index.html", {"request": request, "text": url, "system_prompt": system_prompt, "result": f"Could not fetch URL. Reason {str(ex)}", "success": False}, status_code=400) + elif pdffile: + log.warning("we got a pdffile") + try: + suffix = ".pdf" + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(pdffile.file.read()) + tmp_pdf_path = tmp.name # Temp file path + log.warning(f"stored as {tmp_pdf_path}") + + # Convert PDF to Markdown + text = convert_pdf_to_markdown(tmp_pdf_path) + log.warning(f"converted as {text[:100]}") + + # Cleanup the temporary file + os.unlink(tmp_pdf_path) + except Exception as ex: + return templates.TemplateResponse("index.html", {"request": request, "text": text, "system_prompt": system_prompt, "result": f"Could not process the PDF file. Reason {str(ex)}", "success": False}, status_code=400) + + # we got the text from the URL or the pdffile was converted... now check if we should actually summarize if DRY_RUN: result = "This is a sample response, we are in dry-run mode. We don't want to waste money for querying the API." error = None diff --git a/app/settings.py b/app/settings.py index 6746661..ac8ddf2 100644 --- a/app/settings.py +++ b/app/settings.py @@ -1,3 +1,5 @@ +"""General settings config.""" + import logging diff --git a/app/summarizer.py b/app/summarizer.py index 79f91da..099bdeb 100644 --- a/app/summarizer.py +++ b/app/summarizer.py @@ -1,10 +1,11 @@ +"""The summarizer class, abstracting away the LLM.""" import os from typing import Tuple import openai from openai import AzureOpenAI -from settings import log +from settings import log # pylint: ignore=import-error # first get the env parametting from dotenv import load_dotenv, find_dotenv @@ -90,7 +91,7 @@ def summarize(self, text: str, system_prompt: str = "") -> Tuple[str, str]: max_tokens=self.max_tokens, response_format=response_format, n=1) - + log.debug(f"Full Response (OpenAI): {response}") log.debug(f"response.choices[0].text: {response.choices[0].message}") log.debug(response.model_dump_json(indent=2)) diff --git a/requirements.txt b/requirements.txt index 658e387..4479e43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,5 @@ beautifulsoup4 python-dotenv pytest pytest-cov +pymupdf +python-multipart diff --git a/templates/index.html b/templates/index.html index ebc6c78..53a5f99 100644 --- a/templates/index.html +++ b/templates/index.html @@ -131,7 +131,8 @@

Summarize this

-
+ +
  @@ -150,12 +151,13 @@

Summarize this

- -
+ + +

- +

 

@@ -178,6 +180,7 @@

Summarize this

 

+

{% if url %}{{ url }}{% endif %} @@ -185,6 +188,34 @@

Summarize this

 

+ +
+ +
+ +
+
+ +

+   +

+

+ +