diff --git a/API/gimvicurnik/errors/__init__.py b/API/gimvicurnik/errors/__init__.py index 60e410a..908d20d 100644 --- a/API/gimvicurnik/errors/__init__.py +++ b/API/gimvicurnik/errors/__init__.py @@ -1,6 +1,13 @@ from .base import GimVicUrnikError from .config import ConfigError, ConfigParseError, ConfigReadError, ConfigValidationError -from .eclassroom import ClassroomApiError, ClassroomError, InvalidRecordError, InvalidTokenError +from .eclassroom import ( + ClassroomApiError, + ClassroomError, + InvalidRecordError, + InvalidTokenError, + SubstitutionsFormatError, + LunchScheduleFormatError, +) from .menu import MenuApiError, MenuDateError, MenuFormatError from .timetable import TimetableApiError diff --git a/API/gimvicurnik/errors/eclassroom.py b/API/gimvicurnik/errors/eclassroom.py index 519c77b..65428c7 100644 --- a/API/gimvicurnik/errors/eclassroom.py +++ b/API/gimvicurnik/errors/eclassroom.py @@ -15,3 +15,11 @@ class InvalidTokenError(ClassroomApiError): class InvalidRecordError(ClassroomApiError): pass + + +class SubstitutionsFormatError(ClassroomError): + pass + + +class LunchScheduleFormatError(ClassroomError): + pass diff --git a/API/gimvicurnik/updaters/eclassroom.py b/API/gimvicurnik/updaters/eclassroom.py index 523937b..0be0ce5 100644 --- a/API/gimvicurnik/updaters/eclassroom.py +++ b/API/gimvicurnik/updaters/eclassroom.py @@ -11,10 +11,17 @@ from mammoth import convert_to_html # type: ignore from sqlalchemy import insert +from openpyxl import load_workbook from .base import BaseMultiUpdater, DocumentInfo from ..database import Class, Classroom, DocumentType, LunchSchedule, Substitution, Teacher -from ..errors import ClassroomApiError, InvalidRecordError, InvalidTokenError +from ..errors import ( + ClassroomApiError, + InvalidRecordError, + InvalidTokenError, + SubstitutionsFormatError, + LunchScheduleFormatError, +) from ..utils.database import get_or_create from ..utils.pdf import extract_tables from ..utils.sentry import with_span @@ -186,7 +193,7 @@ def _get_document_type(url: str) -> DocumentType: if "www.dropbox.com" in url: return DocumentType.SUBSTITUTIONS - elif "delitevKosila" in url: + elif "delitev-kosila" in url: return DocumentType.LUNCH_SCHEDULE elif "okroznica" in url.lower() or "okrožnica" in url.lower(): return DocumentType.CIRCULAR @@ -260,16 +267,30 @@ def parse_document(self, document: DocumentInfo, stream: BytesIO, effective: dat span.set_tag("document.type", document.type.value) span.set_tag("document.format", document.extension) - # Extract all tables from a PDF stream - tables = with_span(op="extract")(extract_tables)(stream) - - if document.type == DocumentType.SUBSTITUTIONS: - self._parse_substitutions(tables, effective) - elif document.type == DocumentType.LUNCH_SCHEDULE: - self._parse_lunch_schedule(tables, effective) - else: - # This cannot happen because only menus are provided by the API - raise KeyError("Unknown parsable document type from the e-classroom") + # Only parse xlsx lunch schedules - a guard for now + if document.type == DocumentType.LUNCH_SCHEDULE and document.extension != "xlsx": + return + + match (document.type, document.extension): + case (DocumentType.SUBSTITUTIONS, "pdf"): + self._parse_substitutions_pdf(stream, effective) + case (DocumentType.LUNCH_SCHEDULE, "pdf"): + self._parse_lunch_schedule_pdf(stream, effective) + case (DocumentType.SUBSTITUTIONS, "xlsx"): + self._parse_substitutions_xlsx(stream, effective) + case (DocumentType.LUNCH_SCHEDULE, "xlsx"): + self._parse_lunch_schedule_xlsx(stream, effective) + case (DocumentType.SUBSTITUTIONS, _): + raise SubstitutionsFormatError( + "Unknown substitutions document format: " + str(document.extension) + ) + case (DocumentType.LUNCH_SCHEDULE, _): + raise LunchScheduleFormatError( + "Unknown lunch schedule document format: " + str(document.extension) + ) + case _: + # This cannot happen because only menus are provided by the API + raise KeyError("Unknown parsable document type from the e-classroom") def document_needs_extraction(self, document: DocumentInfo) -> bool: """Return whether the document content needs to be extracted.""" @@ -415,8 +436,8 @@ def _format_substitution( } # fmt: on - def _parse_substitutions(self, tables: Tables, effective: date) -> None: - """Parse the substitutions document.""" + def _parse_substitutions_pdf(self, stream: BytesIO, effective: date) -> None: + """Parse the substitutions pdf document.""" # fmt: off header_substitutions = ["ODSOTNI UČITELJ/ICA", "URA", "RAZRED", "UČILNICA", "NADOMEŠČA", "PREDMET", "OPOMBA"] @@ -433,6 +454,9 @@ def _parse_substitutions(self, tables: Tables, effective: date) -> None: parser_type = None last_original_teacher = None + # Extract all tables from a PDF stream + tables = with_span(op="extract")(extract_tables)(stream) + # Parse tables into substitutions for table in tables: for row0 in table: @@ -620,11 +644,19 @@ def _parse_substitutions(self, tables: Tables, effective: date) -> None: if substitutions: self.session.execute(insert(Substitution), substitutions) - def _parse_lunch_schedule(self, tables: Tables, effective: date) -> None: - """Parse the lunch schedule document.""" + def _parse_substitutions_xlsx(self, stream: BytesIO, effective: date) -> None: + """Parse the substitutions xlsx document.""" + # Currently not useful. + pass + + def _parse_lunch_schedule_pdf(self, stream: BytesIO, effective: date) -> None: + """Parse the lunch schedule pdf document.""" schedule = [] + # Extract all tables from a PDF stream + tables = with_span(op="extract")(extract_tables)(stream) + for table in tables: # Skip instructions if not table[0][0] or "Dijaki prihajate v jedilnico" in table[0][0]: @@ -699,3 +731,64 @@ def _parse_lunch_schedule(self, tables: Tables, effective: date) -> None: # Store schedule to a database self.session.query(LunchSchedule).filter(LunchSchedule.date == effective).delete() self.session.execute(insert(LunchSchedule), schedule) + + def _parse_lunch_schedule_xlsx(self, stream: BytesIO, effective: date) -> None: + """ + Parse the lunch schedule xlsx document. + + Columns should be: + - Time (Ura) + - Notes (Opombe/Prilagoditev) + - Class (Razred) + * number of students (stevilo dijakov) [ignored] + - Location (Prostor) + """ + + # Extract workbook from an XLSX stream + wb = with_span(op="extract")(load_workbook)(stream, read_only=True, data_only=True) + + lunch_schedule = [] + + # Parse lunch schedule + for ws in wb: + if ws.title != "kosilo": + continue + + for wr in ws.iter_rows(min_row=3, max_col=5): + if not wr[0].value: + break + + # Check for correct cell value type + if typing.TYPE_CHECKING: + assert isinstance(wr[0].value, datetime) + assert isinstance(wr[1].value, str) + assert isinstance(wr[2].value, str) + assert isinstance(wr[4].value, str) + + # Schedule for specific class + class_schedule: dict[str, Any] = {} + + # Time in format H:M + class_schedule["time"] = wr[0].value + + # Notes + class_schedule["notes"] = wr[1].value.strip() if wr[1].value else None + + # Class name (class id) + if wr[2].value: + class_schedule["class_id"] = get_or_create( + self.session, model=Class, name=wr[2].value.strip() + )[0].id + + # Location + class_schedule["location"] = wr[4].value.strip() if wr[4].value else None + + # Effective date + class_schedule["date"] = effective + lunch_schedule.append(class_schedule) + + wb.close() + + # Store schedule to a database + self.session.query(LunchSchedule).filter(LunchSchedule.date == effective).delete() + self.session.execute(insert(LunchSchedule), lunch_schedule)