diff --git a/monopoly/examples/__init__.py b/monopoly/examples/__init__.py index bc76e6a2..e562e3e1 100644 --- a/monopoly/examples/__init__.py +++ b/monopoly/examples/__init__.py @@ -1,3 +1,3 @@ from .example_bank import MonopolyBank -__all__ = [MonopolyBank] +__all__ = ["MonopolyBank"] diff --git a/monopoly/examples/multiple_statements.py b/monopoly/examples/multiple_statements.py index e30c3be9..fe82d0fc 100644 --- a/monopoly/examples/multiple_statements.py +++ b/monopoly/examples/multiple_statements.py @@ -12,7 +12,8 @@ def ocbc_example(): bank = Ocbc( file_path=file_path, ) - statement = bank.extract() + pages = bank.get_pages() + statement = bank.extract(pages) transformed_df = bank.transform(statement) bank.load(transformed_df, statement) diff --git a/monopoly/examples/single_statement.py b/monopoly/examples/single_statement.py index 09b33fd3..53fd9c7b 100644 --- a/monopoly/examples/single_statement.py +++ b/monopoly/examples/single_statement.py @@ -11,7 +11,8 @@ def example(): # This runs Tesseract on the PDF and # extracts transactions as raw text - statement = bank.extract() + pages = bank.get_pages() + statement = bank.extract(pages) # Dates are converted into an ISO 8601 date format transformed_df = bank.transform(statement) diff --git a/monopoly/main.py b/monopoly/main.py index 4d19196f..cd230435 100644 --- a/monopoly/main.py +++ b/monopoly/main.py @@ -38,7 +38,8 @@ def process_bank_statement(message: Message, banks: dict): with message.save(attachment) as file: processor: StatementProcessor = bank_class(file_path=file) - statement = processor.extract() + pages = processor.get_pages() + statement = processor.extract(pages) transformed_df = processor.transform(statement) processor.load(transformed_df, statement, upload_to_cloud=True) diff --git a/monopoly/pdf.py b/monopoly/pdf.py index 719bbdef..2166d3b1 100644 --- a/monopoly/pdf.py +++ b/monopoly/pdf.py @@ -42,24 +42,29 @@ def __init__(self, file_path: str, config: PdfConfig = None): self.static_string = config.static_string self.remove_vertical_text = True - def open(self): + def open(self, password_override: str = None): + """ + Opens a PDF document. Accepts a manual password override, + if the user does not want to set passwords in the .env file. + """ logger.info("Opening pdf from path %s", self.file_path) document = fitz.Document(self.file_path) + password = self.password or password_override if not document.is_encrypted: return document - if self.password and not self.brute_force_mask: - document.authenticate(self.password) + if password: + document.authenticate(password) if document.is_encrypted: - raise ValueError("Wrong password - document is encrypted") + raise ValueError("Wrong password - unable to open document") return document # This attempts to unlock statements based on a common password, # followed by the last few digits of a card - if document.is_encrypted and self.brute_force_mask: + if not password and self.brute_force_mask and self.static_string: logger.info("Unlocking PDF using a string prefix with mask") password = self.unlock_pdf( pdf_file_path=self.file_path, diff --git a/monopoly/processor.py b/monopoly/processor.py index 57b03470..419c31f3 100644 --- a/monopoly/processor.py +++ b/monopoly/processor.py @@ -1,5 +1,4 @@ import logging -from dataclasses import dataclass from datetime import datetime from typing import Optional @@ -7,23 +6,25 @@ from monopoly.config import PdfConfig, StatementConfig, settings from monopoly.constants import StatementFields -from monopoly.pdf import PdfParser +from monopoly.pdf import PdfPage, PdfParser from monopoly.statement import Statement from monopoly.storage import upload_to_cloud_storage, write_to_csv logger = logging.getLogger(__name__) -@dataclass -class StatementProcessor: - statement_config: StatementConfig - file_path: str - pdf_config: Optional[PdfConfig] = None - transform_dates: bool = True +class StatementProcessor(PdfParser): + def __init__( + self, statement_config, file_path, pdf_config=None, transform_dates=True + ): + self.statement_config: StatementConfig = statement_config + self.file_path: str = file_path + self.pdf_config: Optional[PdfConfig] = pdf_config + self.transform_dates: bool = transform_dates + + super().__init__(file_path=self.file_path, config=pdf_config) - def extract(self) -> Statement: - parser = PdfParser(self.file_path, self.pdf_config) - pages = parser.get_pages() + def extract(self, pages: list[PdfPage]) -> Statement: statement = Statement(pages, self.statement_config) if not statement.transactions: diff --git a/tests/integration/banks/citibank/test_citibank_extract.py b/tests/integration/banks/citibank/test_citibank_extract.py index 360af79c..657f4802 100644 --- a/tests/integration/banks/citibank/test_citibank_extract.py +++ b/tests/integration/banks/citibank/test_citibank_extract.py @@ -6,7 +6,8 @@ def test_citibank_extract_unprotected_pdf(citibank: Citibank): - raw_df = citibank.extract().df + pages = citibank.get_pages() + raw_df = citibank.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/citibank/expected.csv") assert_frame_equal(raw_df, expected_df) diff --git a/tests/integration/banks/hsbc/test_hsbc_extract.py b/tests/integration/banks/hsbc/test_hsbc_extract.py index 25e0d29a..8a461266 100644 --- a/tests/integration/banks/hsbc/test_hsbc_extract.py +++ b/tests/integration/banks/hsbc/test_hsbc_extract.py @@ -6,7 +6,8 @@ def test_hsbc_extract_unprotected_pdf(hsbc: Hsbc): - raw_df = hsbc.extract().df + pages = hsbc.get_pages() + raw_df = hsbc.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/hsbc/expected.csv") assert_frame_equal(raw_df, expected_df) diff --git a/tests/integration/banks/ocbc/test_ocbc_extract.py b/tests/integration/banks/ocbc/test_ocbc_extract.py index acf3be83..bbfe3c07 100644 --- a/tests/integration/banks/ocbc/test_ocbc_extract.py +++ b/tests/integration/banks/ocbc/test_ocbc_extract.py @@ -6,7 +6,8 @@ def test_ocbc_extract_unprotected_pdf(ocbc: Ocbc): - raw_df = ocbc.extract().df + pages = ocbc.get_pages() + raw_df = ocbc.extract(pages).df expected_df = pd.read_csv("tests/integration/fixtures/ocbc/expected.csv") diff --git a/tests/integration/test_parser.py b/tests/integration/test_parser.py index 7c61470a..ada643b6 100644 --- a/tests/integration/test_parser.py +++ b/tests/integration/test_parser.py @@ -1,5 +1,6 @@ -import pytest +from pytest import raises +from monopoly.banks import Hsbc from monopoly.pdf import PdfParser @@ -14,7 +15,7 @@ def test_wrong_password_raises_error(parser: PdfParser): parser.file_path = "tests/integration/fixtures/protected.pdf" parser.password = "wrong_pw" - with pytest.raises(ValueError, match="document is encrypted"): + with raises(ValueError, match="Wrong password"): parser.open() @@ -30,7 +31,7 @@ def test_get_pages_invalid_returns_error(parser: PdfParser): parser.file_path = "tests/integration/fixtures/4_pages_blank.pdf" parser.page_range = slice(99, -99) - with pytest.raises(ValueError, match="bad page number"): + with raises(ValueError, match="bad page number"): parser.get_pages() @@ -42,3 +43,17 @@ def test_pdf_unlock(parser: PdfParser): ) assert password == "foobar123" + + +def test_override_password(hsbc: Hsbc): + hsbc = Hsbc("tests/integration/fixtures/protected.pdf") + + document = hsbc.open(password_override="foobar123") + assert not document.is_encrypted + + +def test_error_raised_if_override_is_wrong(hsbc: Hsbc): + hsbc = Hsbc("tests/integration/fixtures/protected.pdf") + + with raises(ValueError, match="Wrong password"): + hsbc.open(password_override="wrongpw")