Skip to content

Commit

Permalink
feat(banks): allow for manual password override
Browse files (browse the repository at this point in the history)
  • Loading branch information
benjamin-awd committed Oct 18, 2023
1 parent de12113 commit e32b2b7
Show file tree
Hide file tree
Showing 10 changed files with 53 additions and 26 deletions.
2 changes: 1 addition & 1 deletion monopoly/examples/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .example_bank import MonopolyBank

__all__ = [MonopolyBank]
__all__ = ["MonopolyBank"]
3 changes: 2 additions & 1 deletion monopoly/examples/multiple_statements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def ocbc_example():
bank = Ocbc(
file_path=file_path,
)
statement = bank.extract()
pages = bank.get_pages()
statement = bank.extract(pages)
transformed_df = bank.transform(statement)
bank.load(transformed_df, statement)

Expand Down
3 changes: 2 additions & 1 deletion monopoly/examples/single_statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def example():

# This runs Tesseract on the PDF and
# extracts transactions as raw text
statement = bank.extract()
pages = bank.get_pages()
statement = bank.extract(pages)

# Dates are converted into an ISO 8601 date format
transformed_df = bank.transform(statement)
Expand Down
3 changes: 2 additions & 1 deletion monopoly/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def process_bank_statement(message: Message, banks: dict):

with message.save(attachment) as file:
processor: StatementProcessor = bank_class(file_path=file)
statement = processor.extract()
pages = processor.get_pages()
statement = processor.extract(pages)
transformed_df = processor.transform(statement)
processor.load(transformed_df, statement, upload_to_cloud=True)

Expand Down
15 changes: 10 additions & 5 deletions monopoly/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,29 @@ def __init__(self, file_path: str, config: PdfConfig = None):
self.static_string = config.static_string
self.remove_vertical_text = True

def open(self):
def open(self, password_override: str = None):
"""
Opens a PDF document. Accepts a manual password override,
if the user does not want to set passwords in the .env file.
"""
logger.info("Opening pdf from path %s", self.file_path)
document = fitz.Document(self.file_path)
password = self.password or password_override

if not document.is_encrypted:
return document

if self.password and not self.brute_force_mask:
document.authenticate(self.password)
if password:
document.authenticate(password)

if document.is_encrypted:
raise ValueError("Wrong password - document is encrypted")
raise ValueError("Wrong password - unable to open document")

return document

# This attempts to unlock statements based on a common password,
# followed by the last few digits of a card
if document.is_encrypted and self.brute_force_mask:
if not password and self.brute_force_mask and self.static_string:
logger.info("Unlocking PDF using a string prefix with mask")
password = self.unlock_pdf(
pdf_file_path=self.file_path,
Expand Down
23 changes: 12 additions & 11 deletions monopoly/processor.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,30 @@
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

from pandas import DataFrame

from monopoly.config import PdfConfig, StatementConfig, settings
from monopoly.constants import StatementFields
from monopoly.pdf import PdfParser
from monopoly.pdf import PdfPage, PdfParser
from monopoly.statement import Statement
from monopoly.storage import upload_to_cloud_storage, write_to_csv

logger = logging.getLogger(__name__)


@dataclass
class StatementProcessor:
statement_config: StatementConfig
file_path: str
pdf_config: Optional[PdfConfig] = None
transform_dates: bool = True
class StatementProcessor(PdfParser):
def __init__(
self, statement_config, file_path, pdf_config=None, transform_dates=True
):
self.statement_config: StatementConfig = statement_config
self.file_path: str = file_path
self.pdf_config: Optional[PdfConfig] = pdf_config
self.transform_dates: bool = transform_dates

super().__init__(file_path=self.file_path, config=pdf_config)

def extract(self) -> Statement:
parser = PdfParser(self.file_path, self.pdf_config)
pages = parser.get_pages()
def extract(self, pages: list[PdfPage]) -> Statement:
statement = Statement(pages, self.statement_config)

if not statement.transactions:
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/citibank/test_citibank_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_citibank_extract_unprotected_pdf(citibank: Citibank):
raw_df = citibank.extract().df
pages = citibank.get_pages()
raw_df = citibank.extract(pages).df
expected_df = pd.read_csv("tests/integration/fixtures/citibank/expected.csv")

assert_frame_equal(raw_df, expected_df)
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/hsbc/test_hsbc_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_hsbc_extract_unprotected_pdf(hsbc: Hsbc):
raw_df = hsbc.extract().df
pages = hsbc.get_pages()
raw_df = hsbc.extract(pages).df
expected_df = pd.read_csv("tests/integration/fixtures/hsbc/expected.csv")

assert_frame_equal(raw_df, expected_df)
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/banks/ocbc/test_ocbc_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@


def test_ocbc_extract_unprotected_pdf(ocbc: Ocbc):
raw_df = ocbc.extract().df
pages = ocbc.get_pages()
raw_df = ocbc.extract(pages).df

expected_df = pd.read_csv("tests/integration/fixtures/ocbc/expected.csv")

Expand Down
21 changes: 18 additions & 3 deletions tests/integration/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
from pytest import raises

from monopoly.banks import Hsbc
from monopoly.pdf import PdfParser


Expand All @@ -14,7 +15,7 @@ def test_wrong_password_raises_error(parser: PdfParser):
parser.file_path = "tests/integration/fixtures/protected.pdf"
parser.password = "wrong_pw"

with pytest.raises(ValueError, match="document is encrypted"):
with raises(ValueError, match="Wrong password"):
parser.open()


Expand All @@ -30,7 +31,7 @@ def test_get_pages_invalid_returns_error(parser: PdfParser):
parser.file_path = "tests/integration/fixtures/4_pages_blank.pdf"
parser.page_range = slice(99, -99)

with pytest.raises(ValueError, match="bad page number"):
with raises(ValueError, match="bad page number"):
parser.get_pages()


Expand All @@ -42,3 +43,17 @@ def test_pdf_unlock(parser: PdfParser):
)

assert password == "foobar123"


def test_override_password(hsbc: Hsbc):
    """A protected PDF opens when the correct password is passed as an override."""
    # A fresh processor is built for the protected fixture file; the injected
    # `hsbc` fixture instance itself is not used by this test.
    protected_path = "tests/integration/fixtures/protected.pdf"
    processor = Hsbc(protected_path)

    opened = processor.open(password_override="foobar123")

    assert not opened.is_encrypted


def test_error_raised_if_override_is_wrong(hsbc: Hsbc):
    """An incorrect override password makes `open` raise a ValueError."""
    # A fresh processor is built for the protected fixture file; the injected
    # `hsbc` fixture instance itself is not used by this test.
    protected_path = "tests/integration/fixtures/protected.pdf"
    processor = Hsbc(protected_path)

    expected_error = raises(ValueError, match="Wrong password")
    with expected_error:
        processor.open(password_override="wrongpw")

0 comments on commit e32b2b7

Please sign in to comment.