Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cog 685 more document types #269

Merged
merged 11 commits into from
Dec 9, 2024
Merged
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_11.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test_python_3_9.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
installer-parallel: true

- name: Install dependencies
run: poetry install --no-interaction
run: poetry install --no-interaction -E docs
dexters1 marked this conversation as resolved.
Show resolved Hide resolved

- name: Run unit tests
run: poetry run pytest cognee/tests/unit/
Expand Down
9 changes: 9 additions & 0 deletions cognee/modules/data/exceptions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
Custom exceptions for the Cognee API.

This module defines a set of exceptions for handling various data errors
"""

from .exceptions import (
UnstructuredLibraryImportError,
)
11 changes: 11 additions & 0 deletions cognee/modules/data/exceptions/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from cognee.exceptions import CogneeApiError
from fastapi import status

class UnstructuredLibraryImportError(CogneeApiError):
    """Signal that the ``unstructured`` library could not be imported.

    All constructor arguments are optional; the defaults describe the
    missing-library case and report it with HTTP status 422.
    """

    def __init__(
        self,
        message: str = "Import error. Unstructured library is not installed.",
        name: str = "UnstructuredModuleImportError",
        status_code: int = status.HTTP_422_UNPROCESSABLE_ENTITY,
    ):
        # Arguments are forwarded positionally, in the order the base class
        # is called with in the original implementation.
        super().__init__(message, name, status_code)
1 change: 1 addition & 0 deletions cognee/modules/data/processing/document_types/Document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ class Document(DataPoint):
name: str
raw_data_location: str
metadata_id: UUID
mime_type: str

def read(self, chunk_size: int) -> str:
pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from io import StringIO

from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from cognee.modules.data.exceptions import UnstructuredLibraryImportError


class UnstructuredDocument(Document):
    """Document whose text is extracted with the ``unstructured`` library."""

    type: str = "unstructured"

    def read(self, chunk_size: int):
        """Yield the chunks produced by ``TextChunker`` for this document.

        The file at ``self.raw_data_location`` is partitioned by
        ``unstructured`` (using ``self.mime_type`` as the content type), the
        resulting elements are joined with blank lines, and the text is fed
        to ``TextChunker`` in windows of 1024 characters.

        Args:
            chunk_size: Forwarded unchanged to ``TextChunker``.

        Yields:
            Whatever ``TextChunker.read()`` yields.

        Raises:
            UnstructuredLibraryImportError: If ``unstructured`` is not
                installed. Raised lazily, when the text is first requested.
        """

        def get_text():
            # Imported lazily so the module can be loaded without the
            # library; the failure only surfaces when a document is read.
            try:
                from unstructured.partition.auto import partition
            except ModuleNotFoundError as error:
                # Chain the original error so the missing module name is
                # preserved in the traceback.
                raise UnstructuredLibraryImportError from error

            elements = partition(self.raw_data_location, content_type=self.mime_type)
            in_memory_file = StringIO("\n\n".join(str(el) for el in elements))

            while True:
                text = in_memory_file.read(1024)

                # Only an empty read means the buffer is exhausted. Stopping
                # on a whitespace-only window would silently drop everything
                # after a long run of blank space.
                if not text:
                    break

                # Whitespace-only windows carry no content; skip them so the
                # chunker never receives a blank piece of text.
                if text.strip():
                    yield text

        chunker = TextChunker(self, chunk_size=chunk_size, get_text=get_text)

        yield from chunker.read()
1 change: 1 addition & 0 deletions cognee/modules/data/processing/document_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
from .TextDocument import TextDocument
from .ImageDocument import ImageDocument
from .AudioDocument import AudioDocument
from .UnstructuredDocument import UnstructuredDocument
11 changes: 11 additions & 0 deletions cognee/tasks/documents/classify_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,22 @@
AudioDocument,
ImageDocument,
TextDocument,
UnstructuredDocument,
)
from cognee.modules.data.operations.get_metadata import get_metadata

EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
"txt": TextDocument,
"docx": UnstructuredDocument,
"doc": UnstructuredDocument,
"odt": UnstructuredDocument,
"xls": UnstructuredDocument,
"xlsx": UnstructuredDocument,
"ppt": UnstructuredDocument,
"pptx": UnstructuredDocument,
"odp": UnstructuredDocument,
"ods": UnstructuredDocument,
"png": ImageDocument, # Image documents
"dwg": ImageDocument,
"xcf": ImageDocument,
Expand Down Expand Up @@ -48,6 +58,7 @@ async def classify_documents(data_documents: list[Data]) -> list[Document]:
title = f"{data_item.name}.{data_item.extension}",
raw_data_location = data_item.raw_data_location,
name = data_item.name,
mime_type = data_item.mime_type,
metadata_id = metadata.id
)
documents.append(document)
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/AudioDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
def test_AudioDocument():

document = AudioDocument(
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="audio-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/ImageDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_ImageDocument():

document = ImageDocument(
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="image-dummy-test", raw_data_location="", metadata_id=uuid.uuid4(), mime_type="",
)
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):

Expand Down
3 changes: 2 additions & 1 deletion cognee/tests/integration/documents/PdfDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def test_PdfDocument():
"artificial-intelligence.pdf",
)
document = PdfDocument(
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name="Test document.pdf", raw_data_location=test_file_path, metadata_id=uuid.uuid4(),
mime_type="",
)

for ground_truth, paragraph_data in zip(
Expand Down
2 changes: 1 addition & 1 deletion cognee/tests/integration/documents/TextDocument_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_TextDocument(input_file, chunk_size):
input_file,
)
document = TextDocument(
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4()
id=uuid.uuid4(), name=input_file, raw_data_location=test_file_path, metadata_id=uuid.uuid4(), mime_type="",
)

for ground_truth, paragraph_data in zip(
Expand Down
80 changes: 80 additions & 0 deletions cognee/tests/integration/documents/UnstructuredDocument_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
import uuid

from cognee.modules.data.processing.document_types.UnstructuredDocument import UnstructuredDocument

def test_UnstructuredDocument():
# Define file paths of test data
pptx_file_path = os.path.join(
os.sep,
*(os.path.dirname(__file__).split(os.sep)[:-2]),
"test_data",
"example.pptx",
)

docx_file_path = os.path.join(
os.sep,
*(os.path.dirname(__file__).split(os.sep)[:-2]),
"test_data",
"example.docx",
)

csv_file_path = os.path.join(
os.sep,
*(os.path.dirname(__file__).split(os.sep)[:-2]),
"test_data",
"example.csv",
)

xlsx_file_path = os.path.join(
os.sep,
*(os.path.dirname(__file__).split(os.sep)[:-2]),
"test_data",
"example.xlsx",
)

# Define test documents
pptx_document = UnstructuredDocument(
id=uuid.uuid4(), name="example.pptx", raw_data_location=pptx_file_path, metadata_id=uuid.uuid4(),
mime_type="application/vnd.openxmlformats-officedocument.presentationml.presentation"
)

docx_document = UnstructuredDocument(
id=uuid.uuid4(), name="example.docx", raw_data_location=docx_file_path, metadata_id=uuid.uuid4(),
mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

csv_document = UnstructuredDocument(
id=uuid.uuid4(), name="example.csv", raw_data_location=csv_file_path, metadata_id=uuid.uuid4(),
mime_type="text/csv"
)

xlsx_document = UnstructuredDocument(
id=uuid.uuid4(), name="example.xlsx", raw_data_location=xlsx_file_path, metadata_id=uuid.uuid4(),
mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)

# Test PPTX
for paragraph_data in pptx_document.read(chunk_size=1024):
assert 19 == paragraph_data.word_count, f' 19 != {paragraph_data.word_count = }'
assert 104 == len(paragraph_data.text), f' 104 != {len(paragraph_data.text) = }'
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
Comment on lines +57 to +61
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Improve test structure and data organization

The test could benefit from:

  1. Moving expected values to constants
  2. Adding file existence checks
  3. Creating separate test functions for each file type
  4. Using parametrized tests

Example refactor:

import pytest
from pathlib import Path

TEST_CASES = [
    {
        'file': 'example.pptx',
        'mime_type': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'expected': {'word_count': 19, 'text_length': 104, 'cut_type': 'sentence_cut'}
    },
    # ... other test cases
]

@pytest.mark.parametrize("test_case", TEST_CASES)
def test_unstructured_document(test_case):
    file_path = Path(__file__).parent.parent.parent / 'test_data' / test_case['file']
    assert file_path.exists(), f"Test file {file_path} not found"
    
    document = UnstructuredDocument(
        id=uuid.uuid4(),
        name=test_case['file'],
        raw_data_location=str(file_path),
        metadata_id=uuid.uuid4(),
        mime_type=test_case['mime_type']
    )
    
    for paragraph_data in document.read(chunk_size=1024):
        assert test_case['expected']['word_count'] == paragraph_data.word_count
        assert test_case['expected']['text_length'] == len(paragraph_data.text)
        assert test_case['expected']['cut_type'] == paragraph_data.cut_type

Also applies to: 63-67, 69-74, 76-80


# Test DOCX
for paragraph_data in docx_document.read(chunk_size=1024):
assert 16 == paragraph_data.word_count, f' 16 != {paragraph_data.word_count = }'
assert 145 == len(paragraph_data.text), f' 145 != {len(paragraph_data.text) = }'
assert 'sentence_end' == paragraph_data.cut_type, f' sentence_end != {paragraph_data.cut_type = }'

# TEST CSV
for paragraph_data in csv_document.read(chunk_size=1024):
assert 15 == paragraph_data.word_count, f' 15 != {paragraph_data.word_count = }'
assert 'A A A A A A A A A,A A A A A A,A A' == paragraph_data.text, \
f'Read text doesn\'t match expected text: {paragraph_data.text}'
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'

# Test XLSX
for paragraph_data in xlsx_document.read(chunk_size=1024):
assert 36 == paragraph_data.word_count, f' 36 != {paragraph_data.word_count = }'
assert 171 == len(paragraph_data.text), f' 171 != {len(paragraph_data.text) = }'
assert 'sentence_cut' == paragraph_data.cut_type, f' sentence_cut != {paragraph_data.cut_type = }'
3 changes: 3 additions & 0 deletions cognee/tests/test_data/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
A,A,A,A,A
A,A,A,"A,A",A
A,A,A,"A,A",A
Binary file added cognee/tests/test_data/example.docx
Binary file not shown.
Binary file added cognee/tests/test_data/example.pptx
Binary file not shown.
Binary file added cognee/tests/test_data/example.xlsx
Binary file not shown.
Loading
Loading