Skip to content

Commit

Permalink
Use mediumtext for document content & Do not inline images when parsing content
Browse files Browse the repository at this point in the history
  • Loading branch information
filips123 committed Aug 31, 2022
1 parent e5d0b62 commit b378454
Showing 2 changed files with 7 additions and 2 deletions.
3 changes: 2 additions & 1 deletion API/gimvicurnik/database/__init__.py
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@
Index,
Integer,
SmallInteger,
String,
Text,
Time,
func,
@@ -66,7 +67,7 @@ class Document(Base):
title = Column(Text, nullable=True)
hash = Column(Text, nullable=True)
parsed = Column(Boolean, nullable=True)
content = Column(Text, nullable=True)
content = Column(String(70000), nullable=True)


class Entity:
6 changes: 5 additions & 1 deletion API/gimvicurnik/updaters/eclassroom.py
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@

if typing.TYPE_CHECKING:
from typing import Any, Dict, Iterator, List, Optional
from mammoth.documents import Image # type: ignore
from sqlalchemy.orm import Session
from sentry_sdk.tracing import Span
from ..config import ConfigSourcesEClassroom
@@ -213,7 +214,10 @@ def document_has_content(self, document: DocumentInfo) -> bool:
def get_content(self, document: DocumentInfo, content: bytes) -> Optional[str]:
    """Get file content of docx circulars.

    Converts the raw bytes of a .docx document to HTML. Embedded images are
    handed to a converter that produces nothing for them, so they are not
    inlined into the generated HTML.

    Args:
        document: Metadata of the document being parsed (not used here).
        content: Raw bytes of the .docx file.

    Returns:
        The HTML generated by the converter.
    """

    def ignore_images(_image: Image) -> Dict:
        # Return an empty result for every embedded image so nothing is
        # emitted for it.
        # NOTE(review): presumably any empty iterable works here; confirm
        # against the converter's `convert_image` contract.
        return {}

    # Convert exactly once, with the image-ignoring converter; a separate
    # conversion with inlined images would be discarded anyway.
    result = convert_to_html(io.BytesIO(content), convert_image=ignore_images)
    return typing.cast(str, result.value)  # The generated HTML

@with_span(op="parse", pass_span=True)

0 comments on commit b378454

Please sign in to comment.