Merge branch 'dev' into add-gemini
dexters1 authored Jan 21, 2025
2 parents 1840f9d + c9536f9 commit 7a9a770
Showing 13 changed files with 130 additions and 12 deletions.
@@ -10,7 +10,29 @@

async def chunk_naive_llm_classifier(
data_chunks: list[DocumentChunk], classification_model: Type[BaseModel]
):
) -> list[DocumentChunk]:
"""
Classifies a list of document chunks using a specified classification model and updates vector and graph databases with the classification results.
Vector Database Structure:
- Collection Name: `classification`
- Payload Schema:
- uuid (str): Unique identifier for the classification.
- text (str): Text label of the classification.
- chunk_id (str): Identifier of the chunk associated with this classification.
- document_id (str): Identifier of the document associated with this classification.
Graph Database Structure:
- Nodes:
- Represent document chunks, classification types, and classification subtypes.
- Edges:
- `is_media_type`: Links document chunks to their classification type.
- `is_subtype_of`: Links classification subtypes to their parent type.
- `is_classified_as`: Links document chunks to their classification subtypes.
Notes:
- The function assumes that vector and graph database engines (`get_vector_engine` and `get_graph_engine`) are properly initialized and accessible.
- Classification labels are processed to ensure uniqueness using UUIDs based on their values.
"""
if len(data_chunks) == 0:
return data_chunks

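A minimal usage sketch for this classifier task, assuming the chunks come from an upstream chunking step; `TextClassification` is a hypothetical Pydantic schema, and the import path is assumed because this hunk's file header is not shown above.

from pydantic import BaseModel
# Import path assumed; the classifier's module is not named in this hunk, e.g.:
# from cognee.tasks.chunk_naive_llm_classifier import chunk_naive_llm_classifier

# Hypothetical label schema; any Pydantic model the LLM classifier can fill works here.
class TextClassification(BaseModel):
    label: str
    subtype: str

async def classify_chunks(chunks):
    # `chunks` is a list[DocumentChunk] produced by an upstream chunking task.
    # The same chunks are returned after classification payloads and graph edges are written.
    return await chunk_naive_llm_classifier(chunks, TextClassification)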
6 changes: 6 additions & 0 deletions cognee/tasks/chunks/chunk_by_paragraph.py
@@ -17,6 +17,12 @@ def chunk_by_paragraph(
"""
Chunks text by paragraph while preserving exact text reconstruction capability.
When chunks are joined with empty string "", they reproduce the original text exactly.
Notes:
- Tokenization is handled using the `tiktoken` library, ensuring compatibility with the vector engine's embedding model.
- If `batch_paragraphs` is False, each paragraph will be yielded as a separate chunk.
- Handles cases where paragraphs exceed the specified token or word limits by splitting them as needed.
- Remaining text at the end of the input will be yielded as a final chunk.
"""
current_chunk = ""
current_word_count = 0
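A small sketch of the reconstruction guarantee stated in the docstring; it assumes each yielded chunk is a mapping with a "text" field and that the size and token parameters can be left at their defaults.

from cognee.tasks.chunks.chunk_by_paragraph import chunk_by_paragraph

text = "First paragraph.\n\nSecond paragraph."
chunks = list(chunk_by_paragraph(text))  # size/token arguments omitted; defaults assumed
# Joining chunk texts with "" should reproduce the original input exactly.
assert "".join(chunk["text"] for chunk in chunks) == text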
16 changes: 13 additions & 3 deletions cognee/tasks/chunks/chunk_by_sentence.py
@@ -1,9 +1,19 @@
from uuid import uuid4
from typing import Optional
from uuid import uuid4, UUID
from typing import Optional, Iterator, Tuple
from .chunk_by_word import chunk_by_word


def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
def chunk_by_sentence(
data: str, maximum_length: Optional[int] = None
) -> Iterator[Tuple[UUID, str, int, Optional[str]]]:
"""
Splits the input text into sentences based on word-level processing, with optional sentence length constraints.
Notes:
- Relies on the `chunk_by_word` function for word-level tokenization and classification.
- Ensures sentences within paragraphs are uniquely identifiable using UUIDs.
- Handles cases where the text ends mid-sentence by appending a special "sentence_cut" type.
"""
sentence = ""
paragraph_id = uuid4()
word_count = 0
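Given the new return annotation, each yielded item is a (paragraph_id, sentence, word_count, end_type) tuple; a minimal iteration sketch, with field names inferred from the docstring.

from cognee.tasks.chunks.chunk_by_sentence import chunk_by_sentence

for paragraph_id, sentence, word_count, end_type in chunk_by_sentence("One. Two? Thr"):
    # end_type flags how the sentence terminated, e.g. "sentence_cut" when the text stops mid-sentence.
    print(paragraph_id, repr(sentence), word_count, end_type)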
4 changes: 3 additions & 1 deletion cognee/tasks/chunks/chunk_by_word.py
@@ -1,4 +1,6 @@
import re
from typing import Iterator, Tuple


SENTENCE_ENDINGS = r"[.;!?…]"
PARAGRAPH_ENDINGS = r"[\n\r]"
@@ -34,7 +36,7 @@ def is_real_paragraph_end(last_char: str, current_pos: int, text: str) -> bool:
return False


def chunk_by_word(data: str):
def chunk_by_word(data: str) -> Iterator[Tuple[str, str]]:
"""
Chunks text into words and endings while preserving whitespace.
Whitespace is included with the preceding word.
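A quick check of the (word, word_type) tuples, assuming that concatenating the first elements restores the input because whitespace travels with the preceding word.

from cognee.tasks.chunks.chunk_by_word import chunk_by_word

text = "Hello world. Next sentence."
pieces = list(chunk_by_word(text))
# Each item pairs a token with its type (e.g. a word or a sentence/paragraph ending).
assert "".join(token for token, word_type in pieces) == text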
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/query_chunks.py
@@ -3,11 +3,19 @@

async def query_chunks(query: str) -> list[dict]:
"""
Queries the vector database to retrieve chunks related to the given query string.
Parameters:
- query (str): The query string to match chunks against.
Returns:
- list[dict]: A list of payloads describing the chunks relevant to the query.
Notes:
- The function uses the `search` method of the vector engine to find matches.
- Limits the results to the top 5 matching chunks to balance performance and relevance.
- Ensure that the vector database is properly initialized and contains the "document_chunk_text" collection.
"""
vector_engine = get_vector_engine()

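A minimal async usage sketch, assuming cognee's vector database has already been initialized and populated with the "document_chunk_text" collection.

import asyncio
from cognee.tasks.chunks.query_chunks import query_chunks

async def main():
    # Returns at most the top 5 matching chunk payloads.
    for chunk in await query_chunks("How does chunking work?"):
        print(chunk)

asyncio.run(main())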
8 changes: 8 additions & 0 deletions cognee/tasks/chunks/remove_disconnected_chunks.py
@@ -3,6 +3,14 @@


async def remove_disconnected_chunks(data_chunks: list[DocumentChunk]) -> list[DocumentChunk]:
"""
Removes disconnected or obsolete chunks from the graph database.
Notes:
- Obsolete chunks are defined as chunks with no "next_chunk" predecessor.
- Fully disconnected nodes are identified and deleted separately.
- This function assumes that the graph database is properly initialized and accessible.
"""
graph_engine = await get_graph_engine()

document_ids = set((data_chunk.document_id for data_chunk in data_chunks))
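A brief invocation sketch, assuming `chunks` were produced by earlier pipeline tasks and the graph engine is configured.

from cognee.tasks.chunks.remove_disconnected_chunks import remove_disconnected_chunks

async def prune(chunks):
    # `chunks` is a list[DocumentChunk]; the same list is returned after obsolete
    # and fully disconnected graph nodes have been deleted.
    return await remove_disconnected_chunks(chunks)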
12 changes: 12 additions & 0 deletions cognee/tasks/completion/graph_query_completion.py
@@ -6,6 +6,10 @@


def retrieved_edges_to_string(retrieved_edges: list) -> str:
"""
Converts a list of retrieved graph edges into a human-readable string format.
"""
edge_strings = []
for edge in retrieved_edges:
node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name")
@@ -18,11 +22,19 @@ def retrieved_edges_to_string(retrieved_edges: list) -> str:

async def graph_query_completion(query: str) -> list:
"""
Executes a query on the graph database and retrieves a relevant completion based on the found data.
Parameters:
- query (str): The query string to answer.
Returns:
- list: Answer to the query.
Notes:
- The `brute_force_triplet_search` is used to retrieve relevant graph data.
- Prompts are dynamically rendered and provided to the LLM for contextual understanding.
- Ensure that the LLM client and graph database are properly configured and accessible.
"""
found_triplets = await brute_force_triplet_search(query, top_k=5)

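A usage sketch, assuming the graph store already contains extracted triplets and an LLM provider is configured.

import asyncio
from cognee.tasks.completion.graph_query_completion import graph_query_completion

async def main():
    # Retrieves the top-5 related triplets and asks the LLM to answer with them as context.
    answers = await graph_query_completion("Who founded the company?")
    print(answers)

asyncio.run(main())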
9 changes: 9 additions & 0 deletions cognee/tasks/completion/query_completion.py
Expand Up @@ -6,11 +6,20 @@

async def query_completion(query: str) -> list:
"""
Executes a query against a vector database and computes a relevant response using an LLM.
Parameters:
- query (str): The query string to answer.
Returns:
- list: Answer to the query.
Notes:
- Limits the search to the top 1 matching chunk for simplicity and relevance.
- Ensure that the vector database and LLM client are properly configured and accessible.
- The response model used for the LLM output is expected to be a string.
"""
vector_engine = get_vector_engine()

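The vector-only counterpart follows the same pattern, assuming the vector database and LLM client are configured.

import asyncio
from cognee.tasks.completion.query_completion import query_completion

async def main():
    # Uses only the single best-matching chunk as context for the answer.
    answers = await query_completion("What is this document about?")
    print(answers)

asyncio.run(main())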
13 changes: 12 additions & 1 deletion cognee/tasks/documents/check_permissions_on_documents.py
@@ -1,8 +1,19 @@
from cognee.modules.data.processing.document_types import Document
from cognee.modules.users.permissions.methods import check_permission_on_documents
from typing import List


async def check_permissions_on_documents(documents: list[Document], user, permissions):
async def check_permissions_on_documents(
documents: list[Document], user, permissions
) -> List[Document]:
"""
Validates a user's permissions on a list of documents.
Notes:
- This function assumes that `check_permission_on_documents` raises an exception if the permission check fails.
- It is designed to validate multiple permissions in a sequential manner for the same set of documents.
- Ensure that the `Document` and `user` objects conform to the expected structure and interfaces.
"""
document_ids = [document.id for document in documents]

for permission in permissions:
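A guard-style usage sketch; the permission name is illustrative, and `user` is whatever user object cognee's permission methods expect.

from cognee.tasks.documents.check_permissions_on_documents import check_permissions_on_documents

async def ensure_access(documents, user):
    # Raises (via check_permission_on_documents) if the user lacks any listed permission;
    # otherwise the same documents are returned for the next pipeline step.
    return await check_permissions_on_documents(documents, user, ["read"])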
7 changes: 7 additions & 0 deletions cognee/tasks/documents/classify_documents.py
@@ -50,6 +50,13 @@


async def classify_documents(data_documents: list[Data]) -> list[Document]:
"""
Classifies a list of data items into specific document types based on file extensions.
Notes:
- The function relies on `get_metadata` to retrieve metadata information for each data item.
- Ensure the `Data` objects and their attributes (e.g., `extension`, `id`) are valid before calling this function.
"""
documents = []
for data_item in data_documents:
metadata = await get_metadata(data_item.id)
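A minimal sketch of turning ingested `Data` rows into typed documents; the concrete Document subtype for each item is chosen from its file extension.

from cognee.tasks.documents.classify_documents import classify_documents

async def to_documents(data_items):
    # `data_items` is a list[Data] previously added to cognee; metadata is fetched
    # per item and each is mapped to a Document subtype based on its extension.
    return await classify_documents(data_items)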
11 changes: 9 additions & 2 deletions cognee/tasks/documents/extract_chunks_from_documents.py
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, AsyncGenerator

from cognee.modules.data.processing.document_types.Document import Document

@@ -8,7 +8,14 @@ async def extract_chunks_from_documents(
chunk_size: int = 1024,
chunker="text_chunker",
max_tokens: Optional[int] = None,
):
) -> AsyncGenerator:
"""
Extracts chunks of data from a list of documents based on the specified chunking parameters.
Notes:
- The `read` method of the `Document` class must be implemented to support the chunking operation.
- The `chunker` parameter determines the chunking logic and should align with the document type.
"""
for document in documents:
for document_chunk in document.read(
chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens
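Because the task is now annotated as an AsyncGenerator, chunks are consumed with `async for`; a minimal collection sketch using the parameters shown above.

from cognee.tasks.documents.extract_chunks_from_documents import extract_chunks_from_documents

async def collect_chunks(documents):
    chunks = []
    # `documents` is a list of Document objects whose read() supports chunking.
    async for chunk in extract_chunks_from_documents(documents, chunk_size=1024, chunker="text_chunker"):
        chunks.append(chunk)
    return chunks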
13 changes: 11 additions & 2 deletions cognee/tasks/graph/extract_graph_from_code.py
@@ -1,12 +1,21 @@
import asyncio
from typing import Type
from typing import Type, List
from pydantic import BaseModel
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.tasks.storage import add_data_points


async def extract_graph_from_code(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]):
async def extract_graph_from_code(
data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
) -> List[DocumentChunk]:
"""
Extracts a knowledge graph from the text content of document chunks using a specified graph model.
Notes:
- The `extract_content_graph` function processes each chunk's text to extract graph information.
- Graph nodes are stored using the `add_data_points` function for later retrieval or analysis.
"""
chunk_graphs = await asyncio.gather(
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
)
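A usage sketch with a deliberately tiny, hypothetical graph schema; cognee ships richer graph models, so treat `CodeGraph` purely as an illustration of the `graph_model` argument.

from pydantic import BaseModel
from cognee.tasks.graph.extract_graph_from_code import extract_graph_from_code

class CodeGraph(BaseModel):
    nodes: list[str]
    edges: list[tuple[str, str, str]]  # (source, relation, target)

async def build_code_graph(chunks):
    # `chunks` is a list[DocumentChunk] whose text holds source code.
    return await extract_graph_from_code(chunks, CodeGraph)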
11 changes: 9 additions & 2 deletions cognee/tasks/graph/extract_graph_from_data.py
@@ -1,5 +1,5 @@
import asyncio
from typing import Type
from typing import Type, List

from pydantic import BaseModel

@@ -13,7 +13,14 @@
from cognee.tasks.storage import add_data_points


async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]):
async def extract_graph_from_data(
data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
) -> List[DocumentChunk]:
"""
Extracts and integrates a knowledge graph from the text content of document chunks using a specified graph model.
"""

chunk_graphs = await asyncio.gather(
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
)
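Finally, a rough sketch of how the document and graph tasks above might be chained by hand; in practice they are composed inside a cognee pipeline, and `graph_model` is whatever Pydantic graph schema you pass in.

from cognee.tasks.documents.classify_documents import classify_documents
from cognee.tasks.documents.extract_chunks_from_documents import extract_chunks_from_documents
from cognee.tasks.graph.extract_graph_from_data import extract_graph_from_data

async def ingest(data_items, graph_model):
    documents = await classify_documents(data_items)
    chunks = [chunk async for chunk in extract_chunks_from_documents(documents)]
    # Extract a knowledge graph from the chunk texts and persist its data points.
    return await extract_graph_from_data(chunks, graph_model)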
