From 586b862442c1035e23e36c1fe43288a2128b239e Mon Sep 17 00:00:00 2001 From: PetJer Date: Mon, 1 Jan 2024 11:26:13 +0100 Subject: [PATCH] Brought back the pdf menu parser --- API/gimvicurnik/updaters/eclassroom.py | 6 ++- API/gimvicurnik/updaters/menu.py | 73 ++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/API/gimvicurnik/updaters/eclassroom.py b/API/gimvicurnik/updaters/eclassroom.py index 9aec83d..db6f493 100644 --- a/API/gimvicurnik/updaters/eclassroom.py +++ b/API/gimvicurnik/updaters/eclassroom.py @@ -656,7 +656,7 @@ def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None: """ # Extract workbook from an XLSX stream - wb = with_span(op="extract")(load_workbook)(stream, read_only=True, data_only=True) + wb = with_span(op="extract")(load_workbook)(stream, data_only=True) lunch_schedule = [] @@ -665,6 +665,9 @@ def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None: if ws.title != "kosilo": continue + while not ws["A1"].value: + ws.delete_cols(1) + for wr in ws.iter_rows(min_row=3, max_col=5): if not wr[3].value: break @@ -674,7 +677,6 @@ def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None: assert isinstance(wr[0].value, datetime) assert isinstance(wr[1].value, str) assert isinstance(wr[2].value, str) - assert isinstance(wr[3].value, int) assert isinstance(wr[4].value, str) # Ignore rows that do not contain a class name diff --git a/API/gimvicurnik/updaters/menu.py b/API/gimvicurnik/updaters/menu.py index c50b449..e7e455b 100644 --- a/API/gimvicurnik/updaters/menu.py +++ b/API/gimvicurnik/updaters/menu.py @@ -14,6 +14,7 @@ from .base import BaseMultiUpdater, DocumentInfo from ..database import DocumentType, LunchMenu, SnackMenu from ..errors import MenuApiError, MenuDateError, MenuFormatError +from ..utils.pdf import extract_tables from ..utils.sentry import with_span if typing.TYPE_CHECKING: @@ -120,6 +121,10 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat span.set_tag("document.format", document.extension) match (document.type, document.extension): + case (DocumentType.SNACK_MENU, "pdf"): + self._parse_snack_menu_pdf(stream, effective) + case (DocumentType.LUNCH_MENU, "pdf"): + self._parse_lunch_menu_pdf(stream, effective) case (DocumentType.SNACK_MENU, "xlsx"): self._parse_snack_menu_xlsx(stream, effective) case (DocumentType.LUNCH_MENU, "xlsx"): @@ -131,6 +136,41 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat case _: raise KeyError("Unknown document type for menu") + def _parse_snack_menu_pdf(self, stream: BytesIO, effective: datetime.date) -> None: + """Parse the snack menu PDF document.""" + + # Extract all tables from a PDF stream + tables = with_span(op="extract")(extract_tables)(stream) + + days = 0 + + # Parse tables into menus and store them + for table in tables: + for row in table: + if not row[1] or "NV in N" in row[1]: + continue + + current = effective + datetime.timedelta(days=days) + days += 1 + + menu = { + "date": current, + "normal": row[1], + "poultry": row[2], + "vegetarian": row[3], + "fruitvegetable": row[4], + } + + model = self.session.query(SnackMenu).filter(SnackMenu.date == current).first() + + if not model: + model = SnackMenu() + + for key, value in menu.items(): + setattr(model, key, value) + + self.session.add(model) + def _parse_snack_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> None: """Parse the snack menu XLSX document.""" @@ -201,6 +241,39 @@ def _parse_snack_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> N wb.close() + def _parse_lunch_menu_pdf(self, stream: BytesIO, effective: datetime.date) -> None: + """Parse the lunch menu PDF document.""" + + # Extract all tables from a PDF stream + tables = with_span(op="extract")(extract_tables)(stream) + + days = 0 + + # Parse tables into menus and store them + for table in tables: + for row in table: + if not row[1] or "N KOSILO" in row[1]: + continue + + current = effective + datetime.timedelta(days=days) + days += 1 + + menu = { + "date": current, + "normal": row[1], + "vegetarian": row[2], + } + + model = self.session.query(LunchMenu).filter(LunchMenu.date == current).first() + + if not model: + model = LunchMenu() + + for key, value in menu.items(): + setattr(model, key, value) + + self.session.add(model) + def _parse_lunch_menu_xlsx(self, stream: BytesIO, effective: datetime.date) -> None: """Parse the lunch menu XLSX document."""