
Commit

Merge remote-tracking branch 'origin/dev' into feat/COG-1058-fastmcp
borisarzentar committed Jan 23, 2025
2 parents 00f302c + 90657a2 commit e577276
Showing 40 changed files with 361 additions and 434 deletions.
4 changes: 2 additions & 2 deletions cognee/api/v1/add/add_v2.py
@@ -2,7 +2,7 @@
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines import run_tasks, Task
from cognee.tasks.ingestion import ingest_data_with_metadata, resolve_data_directories
from cognee.tasks.ingestion import ingest_data, resolve_data_directories
from cognee.infrastructure.databases.relational import (
create_db_and_tables as create_relational_db_and_tables,
)
@@ -22,7 +22,7 @@ async def add(
if user is None:
user = await get_default_user()

tasks = [Task(resolve_data_directories), Task(ingest_data_with_metadata, dataset_name, user)]
tasks = [Task(resolve_data_directories), Task(ingest_data, dataset_name, user)]

pipeline = run_tasks(tasks, data, "add_pipeline")

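For orientation, a minimal usage sketch of the updated add pipeline follows. Only the module path, the add coroutine, and the dataset_name/user parameters are visible in the hunk above; passing data as the first positional argument, the keyword form of dataset_name, and the sample inputs are assumptions.

    import asyncio

    from cognee.api.v1.add.add_v2 import add  # module path taken from the diff header above


    async def main():
        # With user=None, add() falls back to get_default_user() and then runs
        # resolve_data_directories -> ingest_data inside the "add_pipeline".
        await add(["Natural language processing is a field of AI."], dataset_name="demo")


    if __name__ == "__main__":
        asyncio.run(main())
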
4 changes: 2 additions & 2 deletions cognee/api/v1/cognify/code_graph_pipeline.py
@@ -10,7 +10,7 @@
from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
from cognee.tasks.documents import classify_documents, extract_chunks_from_documents
from cognee.tasks.graph import extract_graph_from_data
from cognee.tasks.ingestion import ingest_data_with_metadata
from cognee.tasks.ingestion import ingest_data
from cognee.tasks.repo_processor import (
enrich_dependency_graph,
expand_dependency_graph,
@@ -68,7 +68,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
if include_docs:
non_code_tasks = [
Task(get_non_py_files, task_config={"batch_size": 50}),
Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
Task(ingest_data, dataset_name="repo_docs", user=user),
Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
Task(classify_documents),
Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
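A similarly hedged sketch for the code-graph pipeline, whose docs branch now ingests via ingest_data. Only the signature run_code_graph_pipeline(repo_path, include_docs=True) and the module path appear in this diff; consuming the pipeline as an async generator of run statuses and the "." repo path are assumptions.

    import asyncio

    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline


    async def main():
        # include_docs=True also runs the non-code tasks shown above
        # (get_non_py_files -> ingest_data -> document classification and chunking).
        async for status in run_code_graph_pipeline(".", include_docs=True):
            print(status)


    if __name__ == "__main__":
        asyncio.run(main())
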
@@ -95,7 +95,7 @@ async def exponential_backoff(attempt):

return await self.embed_text(text)

except (litellm.exceptions.BadRequestError, litellm.llms.OpenAI.openai.OpenAIError):
except litellm.exceptions.BadRequestError:
raise EmbeddingException("Failed to index data points.")

except Exception as error:
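The hunk above narrows the wrapped exception to litellm's BadRequestError. As a generic illustration of the surrounding pattern (retry with exponential backoff, then translate a provider error into the domain-level EmbeddingException), here is a self-contained sketch; ProviderBadRequestError, embed_call, and the loop-based retry are placeholders, not cognee or litellm APIs.

    import asyncio
    import random


    class ProviderBadRequestError(Exception):
        """Stands in for a provider error such as litellm.exceptions.BadRequestError."""


    class EmbeddingException(Exception):
        """Domain-level error surfaced to callers of the embedding engine."""


    async def exponential_backoff(attempt: int) -> None:
        # Sleep 2^attempt seconds plus jitter before the next retry.
        await asyncio.sleep(2**attempt + random.random())


    async def embed_with_retries(embed_call, text: str, max_attempts: int = 3):
        for attempt in range(max_attempts):
            try:
                return await embed_call(text)
            except ProviderBadRequestError:
                # Malformed input will not succeed on retry: fail fast with a domain error.
                raise EmbeddingException("Failed to index data points.")
            except Exception:
                # Transient failure: back off and retry, re-raising on the final attempt.
                if attempt == max_attempts - 1:
                    raise
                await exponential_backoff(attempt)
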
4 changes: 1 addition & 3 deletions cognee/modules/chunking/TextChunker.py
@@ -54,7 +54,6 @@ def read(self):
contains=[],
_metadata={
"index_fields": ["text"],
"metadata_id": self.document.metadata_id,
},
)
paragraph_chunks = []
@@ -74,7 +73,6 @@ def read(self):
contains=[],
_metadata={
"index_fields": ["text"],
"metadata_id": self.document.metadata_id,
},
)
except Exception as e:
@@ -95,7 +93,7 @@ def read(self):
chunk_index=self.chunk_index,
cut_type=paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
contains=[],
_metadata={"index_fields": ["text"], "metadata_id": self.document.metadata_id},
_metadata={"index_fields": ["text"]},
)
except Exception as e:
print(e)
14 changes: 3 additions & 11 deletions cognee/modules/data/models/Data.py
@@ -1,13 +1,11 @@
from datetime import datetime, timezone
from typing import List
from uuid import uuid4
from sqlalchemy import UUID, Column, DateTime, String
from sqlalchemy.orm import Mapped, relationship
from sqlalchemy import UUID, Column, DateTime, String, JSON
from sqlalchemy.orm import relationship

from cognee.infrastructure.databases.relational import Base

from .DatasetData import DatasetData
from .Metadata import Metadata


class Data(Base):
@@ -21,6 +19,7 @@ class Data(Base):
raw_data_location = Column(String)
owner_id = Column(UUID, index=True)
content_hash = Column(String)
external_metadata = Column(JSON)
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))
updated_at = Column(DateTime(timezone=True), onupdate=lambda: datetime.now(timezone.utc))

@@ -32,13 +31,6 @@ class Data(Base):
cascade="all, delete",
)

metadata_relationship = relationship(
"Metadata",
back_populates="data",
lazy="noload",
cascade="all, delete",
)

def to_json(self) -> dict:
return {
"id": str(self.id),
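With the separate Metadata model removed, source-provided metadata now lives in the external_metadata JSON column on Data itself. Below is a minimal, self-contained sketch of that storage pattern; the in-memory SQLite engine, table name, and sample values are illustrative, not cognee's actual configuration.

    from datetime import datetime, timezone
    from uuid import uuid4

    from sqlalchemy import JSON, UUID, Column, DateTime, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()


    class Data(Base):
        __tablename__ = "data"

        id = Column(UUID, primary_key=True, default=uuid4)
        raw_data_location = Column(String)
        content_hash = Column(String)
        external_metadata = Column(JSON)  # free-form, source-provided metadata
        created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(timezone.utc))


    engine = create_engine("sqlite+pysqlite:///:memory:")
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(
            Data(
                raw_data_location="/tmp/report.pdf",
                content_hash="abc123",
                external_metadata={"source": "upload", "tags": ["finance"]},
            )
        )
        session.commit()
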
21 changes: 0 additions & 21 deletions cognee/modules/data/models/Metadata.py

This file was deleted.

19 changes: 0 additions & 19 deletions cognee/modules/data/operations/delete_metadata.py

This file was deleted.

17 changes: 0 additions & 17 deletions cognee/modules/data/operations/get_metadata.py

This file was deleted.

65 changes: 0 additions & 65 deletions cognee/modules/data/operations/write_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion cognee/modules/data/processing/document_types/Document.py
@@ -7,7 +7,7 @@
class Document(DataPoint):
name: str
raw_data_location: str
metadata_id: UUID
external_metadata: Optional[str]
mime_type: str
_metadata: dict = {"index_fields": ["name"], "type": "Document"}

@@ -10,7 +10,29 @@

async def chunk_naive_llm_classifier(
data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]
):
) -> list[DocumentChunk]:
"""
Classifies a list of document chunks using a specified classification model and updates vector and graph databases with the classification results.
Vector Database Structure:
- Collection Name: `classification`
- Payload Schema:
- uuid (str): Unique identifier for the classification.
- text (str): Text label of the classification.
- chunk_id (str): Identifier of the chunk associated with this classification.
- document_id (str): Identifier of the document associated with this classification.
Graph Database Structure:
- Nodes:
- Represent document chunks, classification types, and classification subtypes.
- Edges:
- `is_media_type`: Links document chunks to their classification type.
- `is_subtype_of`: Links classification subtypes to their parent type.
- `is_classified_as`: Links document chunks to their classification subtypes.
Notes:
- The function assumes that vector and graph database engines (`get_vector_engine` and `get_graph_engine`) are properly initialized and accessible.
- Classification labels are processed to ensure uniqueness using UUIDs based on their values.
"""
if len(data_chunks) == 0:
return data_chunks

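The new docstring documents a `classification` collection whose payloads carry uuid, text, chunk_id, and document_id, with label UUIDs derived deterministically from the label values. A hedged, database-free sketch of building such payloads follows; the uuid5 namespace and the dict layout are illustrative, and the real task constructs these inside the vector and graph engines.

    from uuid import NAMESPACE_OID, uuid5


    def build_classification_payloads(chunks, labels_per_chunk):
        # chunks: dicts with "chunk_id" and "document_id"; labels_per_chunk: one label list per chunk.
        payloads = []
        for chunk, labels in zip(chunks, labels_per_chunk):
            for label in labels:
                payloads.append(
                    {
                        # uuid5 over the label text keeps repeated labels deduplicated.
                        "uuid": str(uuid5(NAMESPACE_OID, label)),
                        "text": label,
                        "chunk_id": str(chunk["chunk_id"]),
                        "document_id": str(chunk["document_id"]),
                    }
                )
        return payloads


    print(
        build_classification_payloads(
            [{"chunk_id": "c-1", "document_id": "d-1"}],
            [["text document", "report"]],
        )
    )
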
6 changes: 6 additions & 0 deletions cognee/tasks/chunks/chunk_by_paragraph.py
@@ -17,6 +17,12 @@ def chunk_by_paragraph(
"""
Chunks text by paragraph while preserving exact text reconstruction capability.
When chunks are joined with empty string "", they reproduce the original text exactly.
Notes:
- Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
- If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
- Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
- Remaining text at the end of the input will be yielded as a final chunk.
"""
current_chunk = ""
current_word_count = 0
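Because the docstring guarantees that joining chunks with an empty string reproduces the original text, a round-trip check makes a natural usage example. This sketch assumes each yielded chunk is a dict with a "text" field and that chunk_by_paragraph can be called with the input text alone; any required token/word-limit arguments (their names are not visible in this diff) would have to be supplied as well.

    from cognee.tasks.chunks.chunk_by_paragraph import chunk_by_paragraph

    text = "First paragraph about chunking.\n\nSecond paragraph with more detail."
    # Assumption: remaining parameters (token/word limits, batch_paragraphs) have defaults.
    chunks = list(chunk_by_paragraph(text))

    # Documented property: concatenating chunk texts reconstructs the input exactly.
    assert "".join(chunk["text"] for chunk in chunks) == text
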
16 changes: 13 additions & 3 deletions cognee/tasks/chunks/chunk_by_sentence.py
@@ -1,9 +1,19 @@
from uuid import uuid4
from typing import Optional
from uuid import uuid4, UUID
from typing import Optional, Iterator, Tuple
from .chunk_by_word import chunk_by_word


def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
def chunk_by_sentence(
data: str, maximum_length: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
"""
Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
Notes:
- Relies on the `chunk_by_word` function for word-level tokenization and classification.
- Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
- Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
"""
sentence = ""
paragraph_id = uuid4()
word_count = 0
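A hedged usage sketch for the updated chunk_by_sentence signature. The tuple field order (paragraph_id, sentence, word_count, end_type) is inferred from the Iterator[Tuple[UUID, str, int, Optional[str]]] annotation and the function body, not confirmed by this diff; the maximum_length value is arbitrary.

    from cognee.tasks.chunks.chunk_by_sentence import chunk_by_sentence

    text = "Sentences are produced by word-level processing. Short ones pass through intact."
    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(text, maximum_length=50):
        # end_type marks how the sentence ended; the docstring mentions a special "sentence_cut" value.
        print(paragraph_id, repr(sentence), word_count, end_type)
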
4 changes: 3 additions & 1 deletion cognee/tasks/chunks/chunk_by_word.py
@@ -1,4 +1,6 @@
import re
from typing import Iterator, Tuple


SENTENCE_ENDINGS = r"[.;!?…]"
PARAGRAPH_ENDINGS = r"[\n\r]"
@@ -34,7 +36,7 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
return False


def chunk_by_word(data: str):
def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
"""
Chunks text into words and endings while preserving whitespace.
Whitespace is included with the preceding word.
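Since chunk_by_word now advertises Iterator[Tuple[str, str]] and keeps whitespace attached to the preceding word, concatenating the yielded tokens should rebuild the input. A hedged sketch; the meaning of the second tuple element (a token type such as a word or ending marker) is inferred, not confirmed.

    from cognee.tasks.chunks.chunk_by_word import chunk_by_word

    text = "Words and endings. Across two sentences!"
    pairs = list(chunk_by_word(text))

    # Documented property: whitespace travels with the preceding word, so the
    # concatenation of all tokens should reproduce the original text.
    assert "".join(token for token, _ in pairs) == text
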
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/query_chunks.py
@@ -3,11 +3,19 @@

async def query_chunks(query: str) -> list[dict]:
"""
Queries the vector database to retrieve chunks related to the given query string.
Parameters:
- query (str): The query string to filter nodes by.
Returns:
- list[dict]: A list of objects providing information about the chunks related to the query.
Notes:
- The function uses the `search` method of the vector engine to find matches.
- Limits the results to the top 5 matching chunks to balance performance and relevance.
- Ensure that the vector database is properly initialized and contains the "document_chunk_text" collection.
"""
vector_engine = get_vector_engine()

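A hedged usage sketch for query_chunks. It assumes a vector engine is already configured and that the "document_chunk_text" collection has been populated by an earlier add/cognify run; this snippet does neither.

    import asyncio

    from cognee.tasks.chunks.query_chunks import query_chunks


    async def main():
        # Returns at most the top 5 matching chunks, per the notes above.
        for chunk in await query_chunks("How does paragraph chunking work?"):
            print(chunk)


    if __name__ == "__main__":
        asyncio.run(main())
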
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/remove_disconnected_chunks.py
@@ -3,6 +3,14 @@


async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]:
"""
Removes disconnected or obsolete chunks from the graph database.
Notes:
- Obsolete chunks are defined as chunks with no "next_chunk" predecessor.
- Fully disconnected nodes are identified and deleted separately.
- This function assumes that the graph database is properly initialized and accessible.
"""
graph_engine = await get_graph_engine()

document_ids = set((data_chunk.document_id for data_chunk in data_chunks))
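An illustrative sketch only (networkx instead of cognee's graph engine) of the two pruning rules documented above: chunks with no "next_chunk" predecessor are obsolete, and fully disconnected nodes are removed outright. The edge direction, the relationship_name attribute, and the node naming are assumptions made for the illustration.

    import networkx as nx

    graph = nx.MultiDiGraph()
    graph.add_edge("chunk_1", "chunk_2", relationship_name="next_chunk")
    graph.add_node("chunk_3")      # chunk with no "next_chunk" predecessor
    graph.add_node("orphan_node")  # fully disconnected node

    # Rule 1: chunks with no incoming "next_chunk" edge are considered obsolete.
    obsolete = [
        node
        for node in graph.nodes
        if node.startswith("chunk_")
        and not any(
            data.get("relationship_name") == "next_chunk"
            for _, _, data in graph.in_edges(node, data=True)
        )
    ]

    # Rule 2: fully disconnected nodes are deleted separately.
    disconnected = list(nx.isolates(graph))

    graph.remove_nodes_from(set(obsolete) | set(disconnected))
    print(sorted(obsolete), sorted(disconnected))
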
12 changes: 12 additions & 0 deletions cognee/tasks/completion/graph_query_completion.py
@@ -6,6 +6,10 @@


def retrieved_edges_to_string(retrieved_edges: list) -> str:
"""
Converts a list of retrieved graph edges into a human-readable string format.
"""
edge_strings = []
for edge in retrieved_edges:
node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name")
@@ -18,11 +22,19 @@ def retrieved_edges_to_string(retrieved_edges: list) -> str:

async def graph_query_completion(query: str) -> list:
"""
Executes a query on the graph database and retrieves a relevant completion based on the found data.
Parameters:
- query (str): The query string to answer.
Returns:
- list: Answer to the query.
Notes:
- The `brute_force_triplet_search` is used to retrieve relevant graph data.
- Prompts are dynamically rendered and provided to the LLM for contextual understanding.
- Ensure that the LLM client and graph database are properly configured and accessible.
"""
found_triplets = await brute_force_triplet_search(query, top_k=5)

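A hedged usage sketch for graph_query_completion. It assumes an LLM client and a populated graph database are already configured (see the notes above); the question text is illustrative.

    import asyncio

    from cognee.tasks.completion.graph_query_completion import graph_query_completion


    async def main():
        # Internally retrieves the top 5 triplets via brute_force_triplet_search
        # and asks the LLM to answer using that context.
        answers = await graph_query_completion("What topics do the ingested documents cover?")
        for answer in answers:
            print(answer)


    if __name__ == "__main__":
        asyncio.run(main())
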