Merge pull request #83 from filips123/xlsx-parsing

Xlsx parsing
filips123 · Sep 11, 2023 · 44ec009 · 44ec009
2 parents 0f67fcd + 19cb7b7
commit 44ec009
Show file tree

Hide file tree

Showing 3 changed files with 125 additions and 17 deletions.
diff --git a/API/gimvicurnik/errors/__init__.py b/API/gimvicurnik/errors/__init__.py
@@ -1,6 +1,13 @@
 from .base import GimVicUrnikError
 
 from .config import ConfigError, ConfigParseError, ConfigReadError, ConfigValidationError
-from .eclassroom import ClassroomApiError, ClassroomError, InvalidRecordError, InvalidTokenError
+from .eclassroom import (
+    ClassroomApiError,
+    ClassroomError,
+    InvalidRecordError,
+    InvalidTokenError,
+    SubstitutionsFormatError,
+    LunchScheduleFormatError,
+)
 from .menu import MenuApiError, MenuDateError, MenuFormatError
 from .timetable import TimetableApiError
diff --git a/API/gimvicurnik/errors/eclassroom.py b/API/gimvicurnik/errors/eclassroom.py
@@ -15,3 +15,11 @@ class InvalidTokenError(ClassroomApiError):
 
 class InvalidRecordError(ClassroomApiError):
     pass
+
+
+class SubstitutionsFormatError(ClassroomError):
+    """Raised when a substitutions document has a format that cannot be parsed."""
+
+
+class LunchScheduleFormatError(ClassroomError):
+    """Raised when a lunch schedule document has a format that cannot be parsed."""
diff --git a/API/gimvicurnik/updaters/eclassroom.py b/API/gimvicurnik/updaters/eclassroom.py
@@ -11,10 +11,17 @@
 
 from mammoth import convert_to_html  # type: ignore
 from sqlalchemy import insert
+from openpyxl import load_workbook
 
 from .base import BaseMultiUpdater, DocumentInfo
 from ..database import Class, Classroom, DocumentType, LunchSchedule, Substitution, Teacher
-from ..errors import ClassroomApiError, InvalidRecordError, InvalidTokenError
+from ..errors import (
+    ClassroomApiError,
+    InvalidRecordError,
+    InvalidTokenError,
+    SubstitutionsFormatError,
+    LunchScheduleFormatError,
+)
 from ..utils.database import get_or_create
 from ..utils.pdf import extract_tables
 from ..utils.sentry import with_span
@@ -186,7 +193,7 @@ def _get_document_type(url: str) -> DocumentType:
 
         if "www.dropbox.com" in url:
             return DocumentType.SUBSTITUTIONS
-        elif "delitevKosila" in url:
+        elif "delitev-kosila" in url:
             return DocumentType.LUNCH_SCHEDULE
         elif "okroznica" in url.lower() or "okrožnica" in url.lower():
             return DocumentType.CIRCULAR
@@ -260,16 +267,30 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat
         span.set_tag("document.type", document.type.value)
         span.set_tag("document.format", document.extension)
 
-        # Extract all tables from a PDF stream
-        tables = with_span(op="extract")(extract_tables)(stream)
-
-        if document.type == DocumentType.SUBSTITUTIONS:
-            self._parse_substitutions(tables, effective)
-        elif document.type == DocumentType.LUNCH_SCHEDULE:
-            self._parse_lunch_schedule(tables, effective)
-        else:
-            # This cannot happen because only menus are provided by the API
-            raise KeyError("Unknown parsable document type from the e-classroom")
+        # Only parse xlsx lunch schedules - a guard for now
+        if document.type == DocumentType.LUNCH_SCHEDULE and document.extension != "xlsx":
+            return
+
+        match (document.type, document.extension):
+            case (DocumentType.SUBSTITUTIONS, "pdf"):
+                self._parse_substitutions_pdf(stream, effective)
+            case (DocumentType.LUNCH_SCHEDULE, "pdf"):
+                self._parse_lunch_schedule_pdf(stream, effective)
+            case (DocumentType.SUBSTITUTIONS, "xlsx"):
+                self._parse_substitutions_xlsx(stream, effective)
+            case (DocumentType.LUNCH_SCHEDULE, "xlsx"):
+                self._parse_lunch_schedule_xlsx(stream, effective)
+            case (DocumentType.SUBSTITUTIONS, _):
+                raise SubstitutionsFormatError(
+                    "Unknown substitutions document format: " + str(document.extension)
+                )
+            case (DocumentType.LUNCH_SCHEDULE, _):
+                raise LunchScheduleFormatError(
+                    "Unknown lunch schedule document format: " + str(document.extension)
+                )
+            case _:
+                # This cannot happen because only menus are provided by the API
+                raise KeyError("Unknown parsable document type from the e-classroom")
 
     def document_needs_extraction(self, document: DocumentInfo) -> bool:
         """Return whether the document content needs to be extracted."""
@@ -415,8 +436,8 @@ def _format_substitution(
         }
         # fmt: on
 
-    def _parse_substitutions(self, tables: Tables, effective: date) -> None:
-        """Parse the substitutions document."""
+    def _parse_substitutions_pdf(self, stream: BytesIO, effective: date) -> None:
+        """Parse the substitutions pdf document."""
 
         # fmt: off
         header_substitutions = ["ODSOTNI UČITELJ/ICA", "URA", "RAZRED", "UČILNICA", "NADOMEŠČA", "PREDMET", "OPOMBA"]
@@ -433,6 +454,9 @@ def _parse_substitutions(self, tables: Tables, effective: date) -> None:
         parser_type = None
         last_original_teacher = None
 
+        # Extract all tables from a PDF stream
+        tables = with_span(op="extract")(extract_tables)(stream)
+
         # Parse tables into substitutions
         for table in tables:
             for row0 in table:
@@ -620,11 +644,19 @@ def _parse_substitutions(self, tables: Tables, effective: date) -> None:
         if substitutions:
             self.session.execute(insert(Substitution), substitutions)
 
-    def _parse_lunch_schedule(self, tables: Tables, effective: date) -> None:
-        """Parse the lunch schedule document."""
+    def _parse_substitutions_xlsx(self, stream: BytesIO, effective: date) -> None:
+        """Accept an xlsx substitutions document without parsing it."""
+        # Deliberate no-op: nothing is read from the stream and nothing is stored.
+        return None
+
+    def _parse_lunch_schedule_pdf(self, stream: BytesIO, effective: date) -> None:
+        """Parse the lunch schedule pdf document."""
 
         schedule = []
 
+        # Extract all tables from a PDF stream
+        tables = with_span(op="extract")(extract_tables)(stream)
+
         for table in tables:
             # Skip instructions
             if not table[0][0] or "Dijaki prihajate v jedilnico" in table[0][0]:
@@ -699,3 +731,64 @@ def _parse_lunch_schedule(self, tables: Tables, effective: date) -> None:
         # Store schedule to a database
         self.session.query(LunchSchedule).filter(LunchSchedule.date == effective).delete()
         self.session.execute(insert(LunchSchedule), schedule)
+
+    def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None:
+        """
+        Parse the lunch schedule xlsx document and store it to the database.
+
+        Only worksheets titled "kosilo" are read, starting with the third row.
+        Reading of a worksheet stops at the first row with an empty time cell.
+
+        Columns should be:
+        - Time (Ura)
+        - Notes (Opombe/Prilagoditev)
+        - Class (Razred)
+        * number of students (stevilo dijakov) [ignored]
+        - Location (Prostor)
+
+        Raises:
+            LunchScheduleFormatError: If a notes, class or location cell
+                holds a non-empty value that is not text.
+        """
+
+        def cell_text(value: Any, column: str) -> str | None:
+            """Return the stripped text of a cell, or None if it is empty."""
+
+            if not value:
+                return None
+
+            # Validate at runtime; checks guarded by `typing.TYPE_CHECKING`
+            # never execute, so a non-text cell would only fail on `.strip()`
+            if not isinstance(value, str):
+                raise LunchScheduleFormatError(
+                    f"Unexpected {column} value in the lunch schedule: {value!r}"
+                )
+
+            return value.strip()
+
+        # Extract workbook from an XLSX stream
+        wb = with_span(op="extract")(load_workbook)(stream, read_only=True, data_only=True)
+
+        lunch_schedule: list[dict[str, Any]] = []
+
+        # A read-only workbook has to be closed explicitly, also after an error
+        try:
+            # Parse lunch schedule
+            for ws in wb:
+                if ws.title != "kosilo":
+                    continue
+
+                for wr in ws.iter_rows(min_row=3, max_col=5):
+                    # The first row without a time ends the schedule
+                    if not wr[0].value:
+                        break
+
+                    # Schedule for specific class (time is stored as read from the cell)
+                    class_schedule: dict[str, Any] = {
+                        "time": wr[0].value,
+                        "notes": cell_text(wr[1].value, "notes"),
+                    }
+
+                    # Class name (class id); the key is left out for an empty cell
+                    class_name = cell_text(wr[2].value, "class")
+                    if class_name is not None:
+                        class_schedule["class_id"] = get_or_create(
+                            self.session, model=Class, name=class_name
+                        )[0].id
+
+                    # Location
+                    class_schedule["location"] = cell_text(wr[4].value, "location")
+
+                    # Effective date
+                    class_schedule["date"] = effective
+                    lunch_schedule.append(class_schedule)
+        finally:
+            wb.close()
+
+        # Store schedule to a database
+        self.session.query(LunchSchedule).filter(LunchSchedule.date == effective).delete()
+
+        # An insert with an empty parameter list must not be executed
+        if lunch_schedule:
+            self.session.execute(insert(LunchSchedule), lunch_schedule)