Cog 793 metadata rework #460

Merged: 10 commits, Jan 23, 2025
4 changes: 2 additions & 2 deletions cognee/api/v1/add/add_v2.py
@@ -2,7 +2,7 @@
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines import run_tasks, Task
-from cognee.tasks.ingestion import ingest_data_with_metadata, resolve_data_directories
+from cognee.tasks.ingestion import ingest_data, resolve_data_directories
from cognee.infrastructure.databases.relational import (
    create_db_and_tables as create_relational_db_and_tables,
)
@@ -22,7 +22,7 @@ async def add(
    if user is None:
        user = await get_default_user()

-    tasks = [Task(resolve_data_directories), Task(ingest_data_with_metadata, dataset_name, user)]
+    tasks = [Task(resolve_data_directories), Task(ingest_data, dataset_name, user)]

    pipeline = run_tasks(tasks, data, "add_pipeline")

4 changes: 2 additions & 2 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -10,7 +10,7 @@
from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
from cognee.tasks.documents import classify_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_data
-from cognee.tasks.ingestion import ingest_data_with_metadata
+from cognee.tasks.ingestion import ingest_data
from cognee.tasks.repo_processor import (
    enrich_dependency_graph,
    expand_dependency_graph,
@@ -68,7 +68,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
    if include_docs:
        non_code_tasks = [
            Task(get_non_py_files, task_config={"batch_size": 50}),
-            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
+            Task(ingest_data, dataset_name="repo_docs", user=user),
            Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
            Task(classify_documents),
            Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
@@ -95,7 +95,7 @@ async def exponential_backoff(attempt):

            return await self.embed_text(text)

-        except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
+        except litellm.exceptions.BadRequestError:
            raise EmbeddingException("Failed to index data points.")

Comment on lines +98 to 100
🛠️ Refactor suggestion

Enhance error handling granularity.

The current implementation might miss important OpenAI-specific errors. Consider:

  1. Restoring specific handling for OpenAIError
  2. Adding specific error messages for different failure scenarios
-        except litellm.exceptions.BadRequestError:
-            raise EmbeddingException("Failed to index data points.")
+        except litellm.exceptions.BadRequestError as e:
+            raise EmbeddingException(f"Bad request while indexing data points: {str(e)}")
+        except litellm.llms.OpenAI.openai.OpenAIError as e:
+            raise EmbeddingException(f"OpenAI service error: {str(e)}")
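
For readers outside the diff, a self-contained sketch of the suggested mapping (EmbeddingException stands in for cognee's own exception class; the litellm exception paths are taken from the removed line above and may differ across litellm versions):

import litellm


class EmbeddingException(Exception):
    """Stand-in for cognee's exception class, which lives in the codebase."""


async def embed_with_mapped_errors(engine, text: str) -> list[float]:
    # Map low-level provider errors onto the domain-level EmbeddingException,
    # preserving the original message for debugging.
    try:
        return await engine.embed_text(text)
    except litellm.exceptions.BadRequestError as error:
        raise EmbeddingException(f"Bad request while indexing data points: {error}") from error
    except litellm.llms.OpenAI.openai.OpenAIError as error:
        # Exception path as written in the PR; newer litellm releases may
        # expose this class elsewhere.
        raise EmbeddingException(f"OpenAI service error: {error}") from error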

        except Exception as error:
4 changes: 1 addition & 3 deletions cognee/modules/chunking/TextChunker.py
@@ -54,7 +54,6 @@ def read(self):
                    contains=[],
                    _metadata={
                        "index_fields": ["text"],
-                        "metadata_id": self.document.metadata_id,
                    },
                )
paragraph_chunks = []
@@ -74,7 +73,6 @@ def read(self):
                    contains=[],
                    _metadata={
                        "index_fields": ["text"],
-                        "metadata_id": self.document.metadata_id,
                    },
                )
            except Exception as e:
@@ -95,7 +93,7 @@ def read(self):
                    chunk_index=self.chunk_index,
                    cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                    contains=[],
-                    _metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id},
+                    _metadata={"index_fields": ["text"]},
                )
            except Exception as e:
                print(e)
Comment on lines +96 to 99
🛠️ Refactor suggestion

Improve error handling in DocumentChunk creation.

Replace print statements with proper logging:

-            except Exception as e:
-                print(e)
+            except Exception as e:
+                logger.error("Failed to create DocumentChunk: %s", str(e))

Also consider (combined in the sketch below):

  1. Adding more specific exception types
  2. Including context information in the error message
  3. Potentially propagating the error up the stack

Committable suggestion skipped: line range outside the PR's diff.
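
A minimal sketch combining these points (build_chunk, chunk_factory, and the logger setup are illustrative names, not part of the PR):

import logging
from typing import Any, Callable

logger = logging.getLogger(__name__)


def build_chunk(chunk_factory: Callable[..., Any], **chunk_kwargs: Any) -> Any:
    """Create a chunk, logging failures with context instead of print()."""
    try:
        return chunk_factory(**chunk_kwargs)
    except (TypeError, ValueError) as error:
        # Narrower exception types plus contextual logging; re-raise so the
        # caller can decide whether to skip the chunk or abort the run.
        logger.error("Failed to create chunk (index=%s): %s", chunk_kwargs.get("chunk_index"), error)
        raise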

14 changes: 3 additions & 11 deletions cognee/modules/data/models/Data.py
@@ -1,13 +1,11 @@
from datetime import datetime, timezone
from typing import List
from uuid import uuid4
-from sqlalchemy import UUID, Column, DateTime, String
-from sqlalchemy.orm import Mapped, relationship
+from sqlalchemy import UUID, Column, DateTime, String, JSON
+from sqlalchemy.orm import relationship

from cognee.infrastructure.databases.relational import Base

from .DatasetData import DatasetData
-from .Metadata import Metadata


class Data(Base):
@@ -21,6 +19,7 @@ class Data(Base):
    raw_data_location = Column(String)
    owner_id = Column(UUID, index=True)
    content_hash = Column(String)
+    foreign_metadata = Column(JSON)
💡 Codebase verification

Consider adding schema validation for foreign_metadata - Confirmed

The codebase shows that foreign_metadata follows an implicit schema {"metadata": dict, "origin": str} but lacks validation. The current implementation silently accepts invalid data, which could lead to inconsistencies. Recommended changes (a sketch follows this list):

  • Add schema validation using Pydantic or similar
  • Document the expected format in the model's docstring
  • Replace the warning with proper error handling for invalid metadata
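
A minimal sketch of what that validation could look like, assuming the implicit {"metadata": dict, "origin": str} shape described above (Pydantic v2 API; the ForeignMetadata model name is hypothetical):

from typing import Any

from pydantic import BaseModel, ValidationError


class ForeignMetadata(BaseModel):
    """Expected shape of Data.foreign_metadata: {"metadata": dict, "origin": str}."""

    metadata: dict[str, Any]
    origin: str


def validate_foreign_metadata(raw: Any) -> dict[str, Any]:
    # Validate at the write boundary and fail loudly rather than silently
    # accepting malformed payloads into the JSON column.
    try:
        return ForeignMetadata.model_validate(raw).model_dump()
    except ValidationError as error:
        raise ValueError(f"Invalid foreign_metadata payload: {error}") from error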
🔗 Analysis chain

Consider adding schema validation for foreign_metadata.

While using JSON type provides flexibility, consider:

  1. Adding schema validation to ensure consistent metadata structure
  2. Documenting the expected format
  3. Adding a migration strategy for existing metadata (see the sketch below)
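
For the migration point, a minimal Alembic sketch (assuming Alembic as the migration tool and a data table name; cognee's actual migration setup may differ):

import sqlalchemy as sa
from alembic import op


def upgrade() -> None:
    # Add the new JSON column; existing rows default to NULL.
    op.add_column("data", sa.Column("foreign_metadata", sa.JSON(), nullable=True))


def downgrade() -> None:
    op.drop_column("data", "foreign_metadata")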

Let's check for any existing schema validation or documentation:

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Search for schema validation or documentation
rg -g '!*.pyc' -A 5 'foreign_metadata.*schema|metadata.*schema'

# Check for migration files
fd -g 'migrations/*.py' -x cat {}

Length of output: 2493


Script:

#!/bin/bash
# Get the complete Data.py file
cat cognee/modules/data/models/Data.py

# Search for any validation logic or usage of foreign_metadata
rg -g '!*.pyc' -A 5 'foreign_metadata'

# Look for any documentation files
fd -e md -e rst -x grep -l -i "metadata" {} \;

Length of output: 9265

    created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
    updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

@@ -32,13 +31,6 @@ class Data(Base):
        cascade="all, delete",
    )

-    metadata_relationship = relationship(
-        "Metadata",
-        back_populates="data",
-        lazy="noload",
-        cascade="all, delete",
-    )

    def to_json(self) -> dict:
        return {
            "id": str(self.id),
21 changes: 0 additions & 21 deletions cognee/modules/data/models/Metadata.py

This file was deleted.

19 changes: 0 additions & 19 deletions cognee/modules/data/operations/delete_metadata.py

This file was deleted.

17 changes: 0 additions & 17 deletions cognee/modules/data/operations/get_metadata.py

This file was deleted.

65 changes: 0 additions & 65 deletions cognee/modules/data/operations/write_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
@@ -7,7 +7,7 @@
class Document(DataPoint):
    name: str
    raw_data_location: str
-    metadata_id: UUID
+    foreign_metadata: Optional[str]
⚠️ Potential issue

Potential type mismatch for foreign_metadata attribute

On line 10, foreign_metadata is defined as:

foreign_metadata: Optional[str]

However, elsewhere in the code (e.g., ingest_data.py), foreign_metadata is assigned a dictionary, which may cause type inconsistencies.

Consider changing the type annotation to accept a dictionary or ensure that foreign_metadata is always a string (e.g., by serializing the dictionary to JSON):

- foreign_metadata: Optional[str]
+ from typing import Optional, Dict, Any
+ foreign_metadata: Optional[Dict[str, Any]]

Or, if foreign_metadata should remain a string, ensure all assignments serialize the data properly:

foreign_metadata=json.dumps(get_foreign_metadata_dict(data_item))
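
If the string annotation is kept, a pair of boundary helpers keeps the two representations from drifting (a sketch, not part of the PR):

import json
from typing import Any, Optional


def serialize_foreign_metadata(value: Optional[dict[str, Any]]) -> Optional[str]:
    """Serialize the metadata dict once, at ingestion time, so that
    Document.foreign_metadata only ever holds a JSON string."""
    return None if value is None else json.dumps(value)


def deserialize_foreign_metadata(value: Optional[str]) -> Optional[dict[str, Any]]:
    """Inverse helper for consumers that need the dict back."""
    return None if value is None else json.loads(value)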

    mime_type: str
    _metadata: dict = {"index_fields": ["name"], "type": "Document"}

5 changes: 2 additions & 3 deletions cognee/tasks/documents/classify_documents.py
@@ -1,4 +1,5 @@
from cognee.modules.data.models import Data
+import json
🛠️ Refactor suggestion

Add error handling for JSON serialization and remove unnecessary indentation.

The current implementation has two potential issues:

  1. No error handling for JSON serialization failures
  2. Unnecessary overhead from pretty-printing JSON with indent=4

Consider this implementation:

 async def classify_documents(data_documents: list[Data]) -> list[Document]:
     documents = []
     for data_item in data_documents:
+        try:
+            metadata_json = json.dumps(data_item.foreign_metadata)
+        except (TypeError, ValueError) as e:
+            logger.error(f"Failed to serialize foreign_metadata for {data_item.id}: {e}")
+            metadata_json = "{}"
+
         document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
             id=data_item.id,
             title=f"{data_item.name}.{data_item.extension}",
             raw_data_location=data_item.raw_data_location,
             name=data_item.name,
             mime_type=data_item.mime_type,
-            foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
+            foreign_metadata=metadata_json,
         )

Also applies to: 61-61
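
Note that the suggested code references a logger the excerpt never defines; it would need the usual module-level setup:

import logging

logger = logging.getLogger(__name__)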

from cognee.modules.data.processing.document_types import (
    Document,
    PdfDocument,
@@ -7,7 +8,6 @@
    TextDocument,
    UnstructuredDocument,
)
-from cognee.modules.data.operations.get_metadata import get_metadata

EXTENSION_TO_DOCUMENT_CLASS = {
"pdf": PdfDocument, # Text documents
@@ -52,14 +52,13 @@
async def classify_documents(data_documents: list[Data]) -> list[Document]:
    documents = []
    for data_item in data_documents:
-        metadata = await get_metadata(data_item.id)
        document = EXTENSION_TO_DOCUMENT_CLASS[data_item.extension](
            id=data_item.id,
            title=f"{data_item.name}.{data_item.extension}",
            raw_data_location=data_item.raw_data_location,
            name=data_item.name,
            mime_type=data_item.mime_type,
-            metadata_id=metadata.id,
+            foreign_metadata=json.dumps(data_item.foreign_metadata, indent=4),
        )
        documents.append(document)

5 changes: 1 addition & 4 deletions cognee/tasks/ingestion/__init__.py
@@ -1,6 +1,3 @@
from .ingest_data import ingest_data
-from .save_data_to_storage import save_data_to_storage
-from .save_data_item_to_storage import save_data_item_to_storage
-from .save_data_item_with_metadata_to_storage import save_data_item_with_metadata_to_storage
-from .ingest_data_with_metadata import ingest_data_with_metadata
from .resolve_data_directories import resolve_data_directories