From ef1f40b1cc95ae1b5b6d0ff990e1d94aa5528538 Mon Sep 17 00:00:00 2001 From: Vasil Ivanov Date: Sun, 5 Nov 2023 11:03:18 +0200 Subject: [PATCH] Replace deprecated PdfFileReader with PdfReader PdfFileReader was deprecated by PyPDF2, leading to crashes/errors. pdf_to_text was updated to use the new PdfReader API. --- src/utils/reader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/utils/reader.py b/src/utils/reader.py index 929b46b..ecde35d 100644 --- a/src/utils/reader.py +++ b/src/utils/reader.py @@ -65,11 +65,10 @@ def pdf_to_text(input_file: Path) -> str: str, the plain text content of the PDF file. """ with open(input_file, "rb") as f: - reader = PyPDF2.PdfFileReader(f) + reader = PyPDF2.PdfReader(f) text = "" - for i in range(reader.getNumPages()): - page = reader.getPage(i) - text += page.extractText() + for page in reader.pages: + text += page.extract_text() return text