Cog 793 metadata rework #460
Changes from 7 commits
Commits: 0c7c1d7, 49ad292, ab8d95c, 5c17501, 4196a4c, 77f0b45, bd3a5a7, 655ab0b, 80e67b0, 40c0279
```diff
@@ -54,7 +54,6 @@ def read(self):
                 contains=[],
                 _metadata={
                     "index_fields": ["text"],
-                    "metadata_id": self.document.metadata_id,
                 },
             )
             paragraph_chunks = []
@@ -74,7 +73,6 @@ def read(self):
                     contains=[],
                     _metadata={
                         "index_fields": ["text"],
-                        "metadata_id": self.document.metadata_id,
                     },
                 )
             except Exception as e:
@@ -95,7 +93,7 @@ def read(self):
                     chunk_index=self.chunk_index,
                     cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                     contains=[],
-                    _metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id},
+                    _metadata={"index_fields": ["text"]},
                 )
             except Exception as e:
                 print(e)
```
Comment on lines +96 to 99

🛠️ Refactor suggestion

**Improve error handling in DocumentChunk creation.**

Replace the print statement with proper logging:

```diff
-            except Exception as e:
-                print(e)
+            except Exception as e:
+                logger.error("Failed to create DocumentChunk: %s", str(e))
```
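The suggestion above assumes a `logger` is already in scope, which the shown hunk does not define. A minimal sketch of the usual module-level setup; `build_document_chunk` is a hypothetical stand-in for the `DocumentChunk(...)` construction:

```python
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def build_document_chunk():
    # Hypothetical stand-in for the DocumentChunk(...) call in the hunk above.
    raise ValueError("simulated chunker failure")

try:
    chunk = build_document_chunk()
except Exception as e:
    # exc_info=True keeps the traceback, which a bare print(e) drops.
    logger.error("Failed to create DocumentChunk: %s", str(e), exc_info=True)
```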
cognee/modules/data/models/Data.py

```diff
@@ -1,13 +1,11 @@
 from datetime import datetime, timezone
 from typing import List
 from uuid import uuid4
-from sqlalchemy import UUID, Column, DateTime, String
-from sqlalchemy.orm import Mapped, relationship
+from sqlalchemy import UUID, Column, DateTime, String, JSON
+from sqlalchemy.orm import relationship

 from cognee.infrastructure.databases.relational import Base

 from .DatasetData import DatasetData
-from .Metadata import Metadata


 class Data(Base):
@@ -21,6 +19,7 @@ class Data(Base):
     raw_data_location = Column(String)
     owner_id = Column(UUID, index=True)
     content_hash = Column(String)
+    foreign_metadata = Column(JSON)
```
💡 Codebase verification

**Consider adding schema validation for foreign_metadata - Confirmed**

The codebase shows that `foreign_metadata` is written and read without any validation of its structure.

🔗 Analysis chain

While the JSON type provides flexibility, consider validating the shape of `foreign_metadata` and documenting its expected structure.

Let's check for any existing schema validation or documentation:

🏁 Scripts executed

```bash
#!/bin/bash
# Search for schema validation or documentation
rg -g '!*.pyc' -A 5 'foreign_metadata.*schema|metadata.*schema'
# Check for migration files
fd -g 'migrations/*.py' -x cat {}
```

Length of output: 2493

```bash
#!/bin/bash
# Get the complete Data.py file
cat cognee/modules/data/models/Data.py
# Search for any validation logic or usage of foreign_metadata
rg -g '!*.pyc' -A 5 'foreign_metadata'
# Look for any documentation files
fd -e md -e rst -x grep -l -i "metadata" {} \;
```

Length of output: 9265
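One way to add the validation this comment asks for: a hedged sketch using SQLAlchemy's `validates` hook. The dict requirement and the size cap are illustrative assumptions, not project conventions:

```python
import json

from sqlalchemy import JSON, Column, Integer
from sqlalchemy.orm import declarative_base, validates

Base = declarative_base()  # simplified; the project uses its own relational Base

# Assumed cap; tune to the target database's practical JSON limits.
MAX_FOREIGN_METADATA_BYTES = 64 * 1024

class Data(Base):
    __tablename__ = "data"
    id = Column(Integer, primary_key=True)  # simplified; the real model uses UUID
    foreign_metadata = Column(JSON)

    @validates("foreign_metadata")
    def validate_foreign_metadata(self, key, value):
        # Reject anything that is not a JSON object, and anything oversized.
        if value is None:
            return value
        if not isinstance(value, dict):
            raise ValueError("foreign_metadata must be a JSON object (dict)")
        if len(json.dumps(value).encode("utf-8")) > MAX_FOREIGN_METADATA_BYTES:
            raise ValueError("foreign_metadata exceeds the allowed size")
        return value
```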
```diff
     created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
     updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))
@@ -32,13 +31,6 @@ class Data(Base):
         cascade="all, delete",
     )

-    metadata_relationship = relationship(
-        "Metadata",
-        back_populates="data",
-        lazy="noload",
-        cascade="all, delete",
-    )
-
     def to_json(self) -> dict:
         return {
             "id": str(self.id),
```
Four files were deleted.
```diff
@@ -7,7 +7,7 @@
 class Document(DataPoint):
     name: str
     raw_data_location: str
-    metadata_id: UUID
+    foreign_metadata: Optional[str]
```
**Potential type mismatch for `foreign_metadata`.**

On line 10, `foreign_metadata` is declared as a string:

```python
foreign_metadata: Optional[str]
```

However, elsewhere in the code (e.g., the `Data` model, where `foreign_metadata` is a JSON column) the value is a dictionary. Consider changing the type annotation to accept a dictionary:

```diff
- foreign_metadata: Optional[str]
+ from typing import Optional, Dict, Any
+ foreign_metadata: Optional[Dict[str, Any]]
```

Or, if a string is intended, ensure that callers serialize the value first:

```python
foreign_metadata=json.dumps(get_foreign_metadata_dict(data_item))
```
```diff
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
```
```diff
@@ -1,4 +1,5 @@
 from cognee.modules.data.models import Data
+import json
```
🛠️ Refactor suggestion

**Add error handling for JSON serialization and remove unnecessary indentation.**

The current implementation has two potential issues: `json.dumps` can raise `TypeError`/`ValueError` on non-serializable values, and `indent=4` bloats the stored string with whitespace that serves no purpose here.

Consider this implementation:

```diff
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     documents = []
     for data_item in data_documents:
+        try:
+            metadata_json = json.dumps(data_item.foreign_metadata)
+        except (TypeError, ValueError) as e:
+            logger.error(f"Failed to serialize foreign_metadata for {data_item.id}: {e}")
+            metadata_json = "{}"
+
         document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
             id=data_item.id,
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
             mime_type=data_item.mime_type,
-            foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
+            foreign_metadata=metadata_json,
         )
```

Also applies to: 61-61
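To make the failure mode concrete: `json.dumps` raises `TypeError` on values it cannot represent. A small self-contained demonstration (the UUID value is just one example of a non-serializable type):

```python
import json
from uuid import uuid4

good = {"pages": 3, "source": "upload"}
bad = {"trace_id": uuid4()}  # UUID objects are not JSON-serializable by default

print(json.dumps(good))  # {"pages": 3, "source": "upload"}

try:
    json.dumps(bad)
except TypeError as e:
    print(f"serialization failed: {e}")  # the suggested code falls back to "{}"

# One common mitigation: default=str stringifies otherwise unsupported values.
print(json.dumps(bad, default=str))
```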
```diff
 from cognee.modules.data.processing.document_types import (
     Document,
     PdfDocument,
@@ -7,7 +8,6 @@
     TextDocument,
     UnstructuredDocument,
 )
-from cognee.modules.data.operations.get_metadata import get_metadata

 EXTENSION_TO_DOCUMENT_CLASS = {
     "pdf": PdfDocument,  # Text documents
@@ -52,14 +52,13 @@
 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     documents = []
     for data_item in data_documents:
-        metadata = await get_metadata(data_item.id)
         document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
             id=data_item.id,
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
             mime_type=data_item.mime_type,
-            metadata_id=metadata.id,
+            foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
         )
         documents.append(document)
```
```diff
@@ -1,6 +1,3 @@
-from .ingest_data import ingest_data
-from .save_data_to_storage import save_data_to_storage
-from .save_data_item_to_storage import save_data_item_to_storage
-from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage
-from .ingest_data_with_metadata import ingest_data_with_metadata
+from .ingest_data import ingest_data
 from .resolve_data_directories import resolve_data_directories
```
🛠️ Refactor suggestion

**Enhance error handling granularity.**

The current implementation might miss important OpenAI-specific errors.
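A hedged sketch of what more granular handling could look like with the `openai` v1 Python client; the `embed_text` helper and the model name are illustrative assumptions, not the project's code:

```python
import logging

import openai

logger = logging.getLogger(__name__)

async def embed_text(client: openai.AsyncOpenAI, text: str) -> list[float]:
    # Distinguish retryable from non-retryable OpenAI failures
    # instead of catching a bare Exception.
    try:
        response = await client.embeddings.create(
            model="text-embedding-3-small", input=text
        )
        return response.data[0].embedding
    except openai.RateLimitError:
        logger.warning("OpenAI rate limit hit; back off and retry")
        raise
    except openai.APIConnectionError as e:
        logger.error("Could not reach OpenAI: %s", e)
        raise
    except openai.APIStatusError as e:
        logger.error("OpenAI returned HTTP %s: %s", e.status_code, e)
        raise
```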