cyclotruc · Vyaas99 · Dec 30, 2024 · Dec 30, 2024 · Dec 31, 2024 · Dec 31, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ slowapi
 starlette
 tiktoken
 uvicorn
+PyPDF2
diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py
@@ -1,6 +1,7 @@
 import os
 from fnmatch import fnmatch
 from typing import Any
+from PyPDF2 import PdfReader
 
 import tiktoken
 
@@ -97,6 +98,21 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool:
         # If there's any error resolving the paths, consider it unsafe
         return False
 
+def _is_pdf_file(file_path: str) -> bool:
+    """
+    Tell whether a path names a PDF, judging only by its ".pdf" suffix.
+
+    Parameters
+    ----------
+    file_path : str
+        Path of the file to classify; only the string is inspected, the file is not opened.
+
+    Returns
+    -------
+    bool
+        `True` when the path ends with ".pdf" in any letter case, `False` otherwise.
+    """
+    return file_path[-4:].lower() == ".pdf"
 
 def _is_text_file(file_path: str) -> bool:
     """
@@ -123,14 +139,32 @@ def _is_text_file(file_path: str) -> bool:
     except OSError:
         return False
 
+def _read_pdf_content(file_path: str) -> str:
+    """
+    Extract text from a PDF file, running text extraction once per page.
+
+    Parameters
+    ----------
+    file_path : str
+        The path to the PDF file.
+
+    Returns
+    -------
+    str
+        Newline-joined text of the pages that yield text, or an error message if extraction fails.
+    """
+    try:
+        # Each page is extracted exactly once; filter(None, ...) drops pages that yield no text.
+        return "\n".join(filter(None, (p.extract_text() for p in PdfReader(file_path).pages)))
+    except Exception as e:
+        return f"Error reading PDF file: {str(e)}"
 
 def _read_file_content(file_path: str) -> str:
     """
     Reads the content of a file.
 
-    This function attempts to open a file and read its contents using UTF-8 encoding.
-    If an error occurs during reading (e.g., file is not found or permission error),
-    it returns an error message.
+    This function reads text files using UTF-8 encoding or extracts text from PDF files.
+    If an error occurs during reading, it returns an error message.
 
     Parameters
     ----------
@@ -142,6 +176,8 @@ def _read_file_content(file_path: str) -> str:
     str
         The content of the file, or an error message if the file could not be read.
     """
+    if _is_pdf_file(file_path):
+        return _read_pdf_content(file_path)
     try:
         with open(file_path, encoding="utf-8", errors="ignore") as f:
             return f.read()
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,3 +6,4 @@ slowapi @@
     starlette
     tiktoken
     uvicorn
+    PyPDF2