From a61edc651d8603f74899d15c68cdb2779e96f2af Mon Sep 17 00:00:00 2001
From: Benjamin Dornel
Date: Wed, 8 Nov 2023 00:07:32 +0800
Subject: [PATCH] feat: add barebones cli

---
 .gitignore                                    |   1 +
 poetry.lock                                   |   2 +-
 pyproject.toml                                |   4 ++
 src/monopoly/banks/__init__.py                |   3 +-
 src/monopoly/banks/base.py                    |   3 +-
 src/monopoly/cli.py                           |  55 +++++++++++++++++
 src/monopoly/pdf.py                           |   3 +-
 src/monopoly/processor.py                     |   3 +-
 tests/unit/test_cli.py                        |  56 ++++++++++++++++++
 .../unit/test_cli/nested_directory/nested.pdf | Bin
 tests/unit/test_cli/top_level.pdf             | Bin
 tests/unit/test_cli/top_level_2.pdf           | Bin
 12 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 src/monopoly/cli.py
 create mode 100644 tests/unit/test_cli.py
 create mode 100644 tests/unit/test_cli/nested_directory/nested.pdf
 create mode 100644 tests/unit/test_cli/top_level.pdf
 create mode 100644 tests/unit/test_cli/top_level_2.pdf

diff --git a/.gitignore b/.gitignore
index c1c74b3a..27cea874 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ dist
 # allowed pdf files/fixtures
 !tests/integration/banks/**/*.csv
 !tests/integration/banks/**/*.pdf
+!tests/unit/test_cli/**/*.pdf
 !monopoly/examples/*.pdf
 
 # john files
diff --git a/poetry.lock b/poetry.lock
index 974161e3..4d332ac9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1447,4 +1447,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "cb38432ddd384590cea1eae6f8729eb2f0eed49c6638f0af9626ef74b90e2fcd"
+content-hash = "0a2b45eb629f092557d82babf0a925a07e1b78e226447cfabe9490d17b3e4ab6"
diff --git a/pyproject.toml b/pyproject.toml
index 8d83f57e..11555d2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ pymupdf = "^1.23.3"
 pydantic = "^2.4.2"
 pdf2john = "^0.1.8"
 pdftotext = "^2.2.2"
+click = "^8.1.7"
 
 
 [tool.poetry.group.dev.dependencies]
@@ -84,3 +85,6 @@ ignore_missing_imports = true
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+monopoly = "monopoly.cli:monopoly"
diff --git a/src/monopoly/banks/__init__.py b/src/monopoly/banks/__init__.py
index 50f6d81b..fff09cf9 100644
--- a/src/monopoly/banks/__init__.py
+++ b/src/monopoly/banks/__init__.py
@@ -1,4 +1,5 @@
 from dataclasses import Field, fields
+from pathlib import Path
 from typing import Type
 
 from monopoly.constants import EncryptionIdentifier, MetadataIdentifier
@@ -14,7 +15,7 @@
 banks: list[Type[BankBase]] = [Citibank, Dbs, Hsbc, Ocbc, StandardChartered]
 
 
-def auto_detect_bank(file_path: str) -> BankBase:
+def auto_detect_bank(file_path: Path) -> BankBase:
     """
     Reads the encryption metadata or actual metadata (if the PDF is not encrypted),
     and checks for a bank based on unique identifiers.
diff --git a/src/monopoly/banks/base.py b/src/monopoly/banks/base.py
index 40e84dc4..675cdc1a 100644
--- a/src/monopoly/banks/base.py
+++ b/src/monopoly/banks/base.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional
 
 from monopoly.constants import EncryptionIdentifier, MetadataIdentifier
@@ -11,7 +12,7 @@
 class BankBase(StatementProcessor):
     def __init__(
         self,
-        file_path: str,
+        file_path: Path,
         identifiers: Optional[list[EncryptionIdentifier | MetadataIdentifier]] = None,
         password: Optional[str] = None,
         parser: Optional[PdfParser] = None,
diff --git a/src/monopoly/cli.py b/src/monopoly/cli.py
new file mode 100644
index 00000000..fe418da2
--- /dev/null
+++ b/src/monopoly/cli.py
@@ -0,0 +1,55 @@
+from pathlib import Path
+from typing import Collection, Iterable
+
+import click
+
+from monopoly.banks import auto_detect_bank
+
+
+def run(files: Collection[Path]):
+    """Runs the extract, transform and load steps for each statement in `files`"""
+    for file in files:
+        bank = auto_detect_bank(file)
+        statement = bank.extract()
+        transformed_df = bank.transform(statement)
+        bank.load(transformed_df, statement)
+
+
+def get_statement_paths(files: Iterable[Path]) -> set[Path]:
+    """
+    Recursively collects PDF files from the given paths. Files are matched on a
+    case-insensitive `.pdf` extension, and directories are searched at any depth.
+    """
+    matched_files: set[Path] = set()
+    for path in files:
+        if path.is_dir():
+            matched_files |= get_statement_paths(path.iterdir())
+        elif path.is_file() and path.name.lower().endswith(".pdf"):
+            matched_files.add(path)
+
+    return matched_files
+
+
+@click.command()
+@click.argument(
+    "files",
+    nargs=-1,
+    type=click.Path(exists=True, allow_dash=True, resolve_path=True, path_type=Path),
+)
+def monopoly(files: tuple[Path, ...]):
+    """
+    Monopoly helps convert your bank statements from PDF to CSV.
+
+    A file or directory can be passed in via the FILES argument
+    """
+    if not files:
+        click.echo("No command received")
+        return
+
+    matched_files = get_statement_paths(files)
+    if not matched_files:
+        click.echo("No PDF statements found in the given paths")
+        return
+
+    # sort so that statements are processed in a stable order
+    run(sorted(matched_files))
diff --git a/src/monopoly/pdf.py b/src/monopoly/pdf.py
index ebefef42..b7efb021 100644
--- a/src/monopoly/pdf.py
+++ b/src/monopoly/pdf.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from functools import cached_property
 from io import BytesIO
+from pathlib import Path
 from typing import Optional
 
 import fitz
@@ -32,7 +33,7 @@ def lines(self) -> list[str]:
 class PdfParser:
     def __init__(
         self,
-        file_path: str,
+        file_path: Path,
         brute_force_config: Optional[BruteForceConfig] = None,
         pdf_config: Optional[PdfConfig] = None,
     ):
diff --git a/src/monopoly/processor.py b/src/monopoly/processor.py
index a2b72a6c..d1c1a928 100644
--- a/src/monopoly/processor.py
+++ b/src/monopoly/processor.py
@@ -1,5 +1,6 @@
 import logging
 from datetime import datetime
+from pathlib import Path
 from typing import Optional
 
 from pandas import DataFrame
@@ -37,7 +38,7 @@ class StatementProcessor(PdfParser):
     allows for the parser to be reused and avoid re-opening the PDF.
     """
 
-    def __init__(self, file_path: str, parser: Optional[PdfParser] = None, **kwargs):
+    def __init__(self, file_path: Path, parser: Optional[PdfParser] = None, **kwargs):
         keys = [
             "statement_config",
             "transaction_config",
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
new file mode 100644
index 00000000..e2b5e7e7
--- /dev/null
+++ b/tests/unit/test_cli.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+from unittest.mock import DEFAULT, MagicMock, patch
+
+import pytest
+
+from monopoly.cli import get_statement_paths, run
+
+
+@pytest.fixture
+def test_directory() -> Path:
+    return Path("tests/unit/test_cli").resolve()
+
+
+class MockBank(MagicMock):
+    def extract(self):
+        pass
+
+    def transform(self):
+        pass
+
+    def load(self):
+        pass
+
+
+def test_run(monkeypatch):
+    def mock_auto_detect_bank(file_path: Path):
+        assert "input.pdf" in str(file_path)
+        return MockBank()
+
+    monkeypatch.setattr("monopoly.cli.auto_detect_bank", mock_auto_detect_bank)
+
+    # Mock paths
+    files = [Path("tests/integration/banks/example/input.pdf").resolve()]
+
+    with patch.multiple(MockBank, extract=DEFAULT, transform=DEFAULT, load=DEFAULT):
+        run(files)
+
+        assert isinstance(MockBank.extract, MagicMock)
+        assert isinstance(MockBank.transform, MagicMock)
+        assert isinstance(MockBank.load, MagicMock)
+
+        # Assertions
+        MockBank.extract.assert_called_once()
+        MockBank.transform.assert_called_once()
+        MockBank.load.assert_called_once()
+
+
+def test_get_statement_paths(test_directory: Path) -> None:
+    path = test_directory
+    expected = {
+        path / "top_level.pdf",
+        path / "top_level_2.pdf",
+        path / "nested_directory/nested.pdf",
+    }
+    res = get_statement_paths(test_directory.iterdir())
+    assert res == expected
diff --git a/tests/unit/test_cli/nested_directory/nested.pdf b/tests/unit/test_cli/nested_directory/nested.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/unit/test_cli/top_level.pdf b/tests/unit/test_cli/top_level.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/unit/test_cli/top_level_2.pdf b/tests/unit/test_cli/top_level_2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391