
Commit

Merge remote-tracking branch 'origin/dev' into feat/COG-1058-fastmcp
borisarzentar committed Jan 23, 2025
2 parents 00f302c + 90657a2 commit e577276
Showing 40 changed files with 361 additions and 434 deletions.
4 changes: 2 additions & 2 deletions cognee/api/v1/add/add_v2.py
@@ -2,7 +2,7 @@
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines import run_tasks, Task
from cognee.tasks.ingestion import ingest_data_with_metadata, resolve_data_directories
from cognee.tasks.ingestion import ingest_data, resolve_data_directories
from cognee.infrastructure.databases.relational import (
create_db_and_tables as create_relational_db_and_tables,
)
@@ -22,7 +22,7 @@ async def add(
if user is None:
user = await get_default_user()

tasks = [Task(resolve_data_directories), Task(ingest_data_with_metadata, dataset_name, user)]
tasks = [Task(resolve_data_directories), Task(ingest_data, dataset_name, user)]

pipeline = run_tasks(tasks, data, "add_pipeline")

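For orientation, a minimal usage sketch of the updated add pipeline follows. Only the module path, the add coroutine, and the dataset_name/user parameters are visible in the hunk above; passing data as the first positional argument, the keyword form of dataset_name, and the sample inputs are assumptions.

    import asyncio

    from cognee.api.v1.add.add_v2 import add  # module path taken from the diff header above


    async def main():
        # With user=None, add() falls back to get_default_user() and then runs
        # resolve_data_directories -> ingest_data inside the "add_pipeline".
        await add(["Natural language processing is a field of AI."], dataset_name="demo")


    if __name__ == "__main__":
        asyncio.run(main())
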
4 changes: 2 additions & 2 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -10,7 +10,7 @@
from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
from cognee.tasks.documents import classify_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.ingestion import ingest_data_with_metadata
from cognee.tasks.ingestion import ingest_data
from cognee.tasks.repo_processor import (
enrich_dependency_graph,
expand_dependency_graph,
@@ -68,7 +68,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
if include_docs:
non_code_tasks = [
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(ingest_data, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
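A similarly hedged sketch for the code-graph pipeline, whose docs branch now ingests via ingest_data. Only the signature run_code_graph_pipeline(repo_path, include_docs=True) and the module path appear in this diff; consuming the pipeline as an async generator of run statuses and the "." repo path are assumptions.

    import asyncio

    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


    async def main():
        # include_docs=True also runs the non-code tasks shown above
        # (get_non_py_files -> ingest_data -> document classification and chunking).
        async for status in run_code_graph_pipeline(".", include_docs=True):
            print(status)


    if __name__ == "__main__":
        asyncio.run(main())
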
@@ -95,7 +95,7 @@ async def exponential_backoff(attempt):

return await self.embed_text(text)

except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
except litellm.exceptions.BadRequestError:
raise EmbeddingException("Failed to index data points.")

except Exception as error:
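The hunk above narrows the wrapped exception to litellm's BadRequestError. As a generic illustration of the surrounding pattern (retry with exponential backoff, then translate a provider error into the domain-level EmbeddingException), here is a self-contained sketch; ProviderBadRequestError, embed_call, and the loop-based retry are placeholders, not cognee or litellm APIs.

    import asyncio
    import random


    class ProviderBadRequestError(Exception):
        """Stands in for a provider error such as litellm.exceptions.BadRequestError."""


    class EmbeddingException(Exception):
        """Domain-level error surfaced to callers of the embedding engine."""


    async def exponential_backoff(attempt: int) -> None:
        # Sleep 2^attempt seconds plus jitter before the next retry.
        await asyncio.sleep(2**attempt + random.random())


    async def embed_with_retries(embed_call, text: str, max_attempts: int = 3):
        for attempt in range(max_attempts):
            try:
                return await embed_call(text)
            except ProviderBadRequestError:
                # Malformed input will not succeed on retry: fail fast with a domain error.
                raise EmbeddingException("Failed to index data points.")
            except Exception:
                # Transient failure: back off and retry, re-raising on the final attempt.
                if attempt == max_attempts - 1:
                    raise
                await exponential_backoff(attempt)
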
4 changes: 1 addition & 3 deletions cognee/modules/chunking/TextChunker.py
@@ -54,7 +54,6 @@ def read(self):
contains=[],
_metadata={
"index_fields": ["text"],
"metadata_id": self.document.metadata_id,
},
)
paragraph_chunks = []
@@ -74,7 +73,6 @@ def read(self):
contains=[],
_metadata={
"index_fields": ["text"],
"metadata_id": self.document.metadata_id,
},
)
except Exception as e:
@@ -95,7 +93,7 @@ def read(self):
chunk_index=self.chunk_index,
cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
contains=[],
_metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id},
_metadata={"index_fields": ["text"]},
)
except Exception as e:
print(e)
14 changes: 3 additions & 11 deletions cognee/modules/data/models/Data.py
@@ -1,13 +1,11 @@
from datetime import datetime, timezone
from typing import List
from uuid import uuid4
from sqlalchemy import UUID, Column, DateTime, String
from sqlalchemy.orm import Mapped, relationship
from sqlalchemy import UUID, Column, DateTime, String, JSON
from sqlalchemy.orm import relationship

from cognee.infrastructure.databases.relational import Base

from .DatasetData import DatasetData
from .Metadata import Metadata


class Data(Base):
@@ -21,6 +19,7 @@ class Data(Base):
raw_data_location = Column(String)
owner_id = Column(UUID, index=True)
content_hash = Column(String)
external_metadata = Column(JSON)
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

@@ -32,13 +31,6 @@ class Data(Base):
cascade="all, delete",
)

metadata_relationship = relationship(
"Metadata",
back_populates="data",
lazy="noload",
cascade="all, delete",
)

def to_json(self) -> dict:
return {
"id": str(self.id),
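With the separate Metadata model removed, source-provided metadata now lives in the external_metadata JSON column on Data itself. Below is a minimal, self-contained sketch of that storage pattern; the in-memory SQLite engine, table name, and sample values are illustrative, not cognee's actual configuration.

    from datetime import datetime, timezone
    from uuid import uuid4

    from sqlalchemy import JSON, UUID, Column, DateTime, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()


    class Data(Base):
        __tablename__ = "data"

        id = Column(UUID, primary_key=True, default=uuid4)
        raw_data_location = Column(String)
        content_hash = Column(String)
        external_metadata = Column(JSON)  # free-form, source-provided metadata
        created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))


    engine = create_engine("sqlite+pysqlite:///:memory:")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(
            Data(
                raw_data_location="/tmp/report.pdf",
                content_hash="abc123",
                external_metadata={"source": "upload", "tags": ["finance"]},
            )
        )
        session.commit()
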
21 changes: 0 additions & 21 deletions cognee/modules/data/models/Metadata.py

This file was deleted.

19 changes: 0 additions & 19 deletions cognee/modules/data/operations/delete_metadata.py

This file was deleted.

17 changes: 0 additions & 17 deletions cognee/modules/data/operations/get_metadata.py

This file was deleted.

65 changes: 0 additions & 65 deletions cognee/modules/data/operations/write_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
@@ -7,7 +7,7 @@
class Document(DataPoint):
name: str
raw_data_location: str
metadata_id: UUID
external_metadata: Optional[str]
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}

@@ -10,7 +10,29 @@

async def chunk_naive_llm_classifier(
data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]
):
) -> list[DocumentChunk]:
"""
Classifies a list of document chunks using a specified classification model and updates vector and graph databases with the classification results.
Vector Database Structure:
- Collection Name: `classification`
- Payload Schema:
- uuid (str): Unique identifier for the classification.
- text (str): Text label of the classification.
- chunk_id (str): Identifier of the chunk associated with this classification.
- document_id (str): Identifier of the document associated with this classification.
Graph Database Structure:
- Nodes:
- Represent document chunks, classification types, and classification subtypes.
- Edges:
- `is_media_type`: Links document chunks to their classification type.
- `is_subtype_of`: Links classification subtypes to their parent type.
- `is_classified_as`: Links document chunks to their classification subtypes.
Notes:
- The function assumes that vector and graph database engines (`get_vector_engine` and `get_graph_engine`) are properly initialized and accessible.
- Classification labels are processed to ensure uniqueness using UUIDs based on their values.
"""
if len(data_chunks) == 0:
return data_chunks

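The new docstring documents a `classification` collection whose payloads carry uuid, text, chunk_id, and document_id, with label UUIDs derived deterministically from the label values. A hedged, database-free sketch of building such payloads follows; the uuid5 namespace and the dict layout are illustrative, and the real task constructs these inside the vector and graph engines.

    from uuid import NAMESPACE_OID, uuid5


    def build_classification_payloads(chunks, labels_per_chunk):
        # chunks: dicts with "chunk_id" and "document_id"; labels_per_chunk: one label list per chunk.
        payloads = []
        for chunk, labels in zip(chunks, labels_per_chunk):
            for label in labels:
                payloads.append(
                    {
                        # uuid5 over the label text keeps repeated labels deduplicated.
                        "uuid": str(uuid5(NAMESPACE_OID, label)),
                        "text": label,
                        "chunk_id": str(chunk["chunk_id"]),
                        "document_id": str(chunk["document_id"]),
                    }
                )
        return payloads


    print(
        build_classification_payloads(
            [{"chunk_id": "c-1", "document_id": "d-1"}],
            [["text document", "report"]],
        )
    )
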
6 changes: 6 additions & 0 deletions cognee/tasks/chunks/chunk_by_paragraph.py
@@ -17,6 +17,12 @@ def chunk_by_paragraph(
"""
Chunks text by paragraph while preserving exact text reconstruction capability.
When chunks are joined with empty string "", they reproduce the original text exactly.
Notes:
- Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
- If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
- Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
- Remaining text at the end of the input will be yielded as a final chunk.
"""
current_chunk = ""
current_word_count = 0
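Because the docstring guarantees that joining chunks with an empty string reproduces the original text, a round-trip check makes a natural usage example. This sketch assumes each yielded chunk is a dict with a "text" field and that chunk_by_paragraph can be called with the input text alone; any required token/word-limit arguments (their names are not visible in this diff) would have to be supplied as well.

    from cognee.tasks.chunks.chunk_by_paragraph import chunk_by_paragraph

    text = "First paragraph about chunking.\n\nSecond paragraph with more detail."
    # Assumption: remaining parameters (token/word limits, batch_paragraphs) have defaults.
    chunks = list(chunk_by_paragraph(text))

    # Documented property: concatenating chunk texts reconstructs the input exactly.
    assert "".join(chunk["text"] for chunk in chunks) == text
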
16 changes: 13 additions & 3 deletions cognee/tasks/chunks/chunk_by_sentence.py
@@ -1,9 +1,19 @@
from uuid import uuid4
from typing import Optional
from uuid import uuid4, UUID
from typing import Optional, Iterator, Tuple
from .chunk_by_word import chunk_by_word


def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
def chunk_by_sentence(
data: str, maximum_length: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
"""
Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
Notes:
- Relies on the `chunk_by_word` function for word-level tokenization and classification.
- Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
- Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
"""
sentence = ""
paragraph_id = uuid4()
word_count = 0
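A hedged usage sketch for the updated chunk_by_sentence signature. The tuple field order (paragraph_id, sentence, word_count, end_type) is inferred from the Iterator[Tuple[UUID, str, int, Optional[str]]] annotation and the function body, not confirmed by this diff; the maximum_length value is arbitrary.

    from cognee.tasks.chunks.chunk_by_sentence import chunk_by_sentence

    text = "Sentences are produced by word-level processing. Short ones pass through intact."
    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(text, maximum_length=50):
        # end_type marks how the sentence ended; the docstring mentions a special "sentence_cut" value.
        print(paragraph_id, repr(sentence), word_count, end_type)
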
4 changes: 3 additions & 1 deletion cognee/tasks/chunks/chunk_by_word.py
@@ -1,4 +1,6 @@
import re
from typing import Iterator, Tuple


SENTENCE_ENDINGS = r"[.;!?…]"
PARAGRAPH_ENDINGS = r"[\n\r]"
@@ -34,7 +36,7 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
return False


def chunk_by_word(data: str):
def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
"""
Chunks text into words and endings while preserving whitespace.
Whitespace is included with the preceding word.
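Since chunk_by_word now advertises Iterator[Tuple[str, str]] and keeps whitespace attached to the preceding word, concatenating the yielded tokens should rebuild the input. A hedged sketch; the meaning of the second tuple element (a token type such as a word or ending marker) is inferred, not confirmed.

    from cognee.tasks.chunks.chunk_by_word import chunk_by_word

    text = "Words and endings. Across two sentences!"
    pairs = list(chunk_by_word(text))

    # Documented property: whitespace travels with the preceding word, so the
    # concatenation of all tokens should reproduce the original text.
    assert "".join(token for token, _ in pairs) == text
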
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/query_chunks.py
@@ -3,11 +3,19 @@

async def query_chunks(query: str) -> list[dict]:
"""
Queries the vector database to retrieve chunks related to the given query string.
Parameters:
- query (str): The query string to filter nodes by.
Returns:
- list[dict]: A list of objects providing information about the chunks related to the query.
Notes:
- The function uses the `search` method of the vector engine to find matches.
- Limits the results to the top 5 matching chunks to balance performance and relevance.
- Ensure that the vector database is properly initialized and contains the "document_chunk_text" collection.
"""
vector_engine = get_vector_engine()

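A hedged usage sketch for query_chunks. It assumes a vector engine is already configured and that the "document_chunk_text" collection has been populated by an earlier add/cognify run; this snippet does neither.

    import asyncio

    from cognee.tasks.chunks.query_chunks import query_chunks


    async def main():
        # Returns at most the top 5 matching chunks, per the notes above.
        for chunk in await query_chunks("How does paragraph chunking work?"):
            print(chunk)


    if __name__ == "__main__":
        asyncio.run(main())
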
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/remove_disconnected_chunks.py
@@ -3,6 +3,14 @@


async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]:
"""
Removes disconnected or obsolete chunks from the graph database.
Notes:
- Obsolete chunks are defined as chunks with no "next_chunk" predecessor.
- Fully disconnected nodes are identified and deleted separately.
- This function assumes that the graph database is properly initialized and accessible.
"""
graph_engine = await get_graph_engine()

document_ids = set((data_chunk.document_id for data_chunk in data_chunks))
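An illustrative sketch only (networkx instead of cognee's graph engine) of the two pruning rules documented above: chunks with no "next_chunk" predecessor are obsolete, and fully disconnected nodes are removed outright. The edge direction, the relationship_name attribute, and the node naming are assumptions made for the illustration.

    import networkx as nx

    graph = nx.MultiDiGraph()
    graph.add_edge("chunk_1", "chunk_2", relationship_name="next_chunk")
    graph.add_node("chunk_3")      # chunk with no "next_chunk" predecessor
    graph.add_node("orphan_node")  # fully disconnected node

    # Rule 1: chunks with no incoming "next_chunk" edge are considered obsolete.
    obsolete = [
        node
        for node in graph.nodes
        if node.startswith("chunk_")
        and not any(
            data.get("relationship_name") == "next_chunk"
            for _, _, data in graph.in_edges(node, data=True)
        )
    ]

    # Rule 2: fully disconnected nodes are deleted separately.
    disconnected = list(nx.isolates(graph))

    graph.remove_nodes_from(set(obsolete) | set(disconnected))
    print(sorted(obsolete), sorted(disconnected))
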
12 changes: 12 additions & 0 deletions cognee/tasks/completion/graph_query_completion.py
@@ -6,6 +6,10 @@


def retrieved_edges_to_string(retrieved_edges: list) -> str:
"""
Converts a list of retrieved graph edges into a human-readable string format.
"""
edge_strings = []
for edge in retrieved_edges:
node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name")
@@ -18,11 +22,19 @@ def retrieved_edges_to_string(retrieved_edges: list) -> str:

async def graph_query_completion(query: str) -> list:
"""
Executes a query on the graph database and retrieves a relevant completion based on the found data.
Parameters:
- query (str): The query string to answer.
Returns:
- list: Answer to the query.
Notes:
- The `brute_force_triplet_search` is used to retrieve relevant graph data.
- Prompts are dynamically rendered and provided to the LLM for contextual understanding.
- Ensure that the LLM client and graph database are properly configured and accessible.
"""
found_triplets = await brute_force_triplet_search(query, top_k=5)

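A hedged usage sketch for graph_query_completion. It assumes an LLM client and a populated graph database are already configured (see the notes above); the question text is illustrative.

    import asyncio

    from cognee.tasks.completion.graph_query_completion import graph_query_completion


    async def main():
        # Internally retrieves the top 5 triplets via brute_force_triplet_search
        # and asks the LLM to answer using that context.
        answers = await graph_query_completion("What topics do the ingested documents cover?")
        for answer in answers:
            print(answer)


    if __name__ == "__main__":
        asyncio.run(main())
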