Feature/cog 971 preparing swe bench run #424

Merged · 8 commits · Jan 10, 2025
2 changes: 1 addition & 1 deletion .github/workflows/profiling.yaml
@@ -94,7 +94,7 @@ jobs:
# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
# # Run Scalene
# poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
#

# # Compare profiling results
# - name: Compare profiling results
# run: |
12 changes: 12 additions & 0 deletions cognee/api/v1/config/config.py
@@ -131,6 +131,18 @@ def set_relational_db_config(config_dict: dict):
message=f"'{key}' is not a valid attribute of the config."
)

@staticmethod
def set_graph_db_config(config_dict: dict) -> None:
"""
Updates the graph db config with values from config_dict.
"""
graph_db_config = get_graph_config()
for key, value in config_dict.items():
if hasattr(graph_db_config, key):
object.__setattr__(graph_db_config, key, value)
else:
raise AttributeError(f"'{key}' is not a valid attribute of the config.")

@staticmethod
def set_vector_db_config(config_dict: dict):
"""
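The new setter mirrors the existing relational and vector variants. A minimal usage sketch, assuming the class is exposed as cognee.config like the other setters; the keys shown are illustrative and must match attributes that get_graph_config() actually defines, otherwise AttributeError is raised:

import cognee

# Hypothetical keys -- unknown keys raise AttributeError by design.
cognee.config.set_graph_db_config(
    {
        "graph_database_provider": "neo4j",
        "graph_database_url": "bolt://localhost:7687",
    }
)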
3 changes: 3 additions & 0 deletions cognee/api/v1/search/search_v2.py
@@ -15,13 +15,15 @@
from cognee.tasks.graph import query_graph_connections
from cognee.tasks.summarization import query_summaries
from cognee.tasks.completion import query_completion
from cognee.tasks.completion import graph_query_completion


class SearchType(Enum):
SUMMARIES = "SUMMARIES"
INSIGHTS = "INSIGHTS"
CHUNKS = "CHUNKS"
COMPLETION = "COMPLETION"
GRAPH_COMPLETION = "GRAPH_COMPLETION"


async def search(
@@ -65,6 +67,7 @@ async def specific_search(query_type: SearchType, query: str, user) -> list:
SearchType.INSIGHTS: query_graph_connections,
SearchType.CHUNKS: query_chunks,
SearchType.COMPLETION: query_completion,
SearchType.GRAPH_COMPLETION: graph_query_completion,
}

search_task = search_tasks.get(query_type)
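A hedged sketch of invoking the new search type; the query_text keyword follows the call shape used in description_to_codepart_search.py below, and user/dataset plumbing is left at its defaults:

import asyncio

from cognee.api.v1.search import SearchType
from cognee.api.v1.search.search_v2 import search


async def main():
    # GRAPH_COMPLETION retrieves graph triplets and answers strictly from them.
    answers = await search(SearchType.GRAPH_COMPLETION, query_text="How are chunks linked to documents?")
    print(answers[0])


asyncio.run(main())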
1 change: 1 addition & 0 deletions answer_simple_question_restricted.txt
@@ -0,0 +1 @@
Answer the question using the provided context. If the provided context is not connected to the question, just answer "The provided knowledge base does not contain the answer to the question". Be as brief as possible.
2 changes: 2 additions & 0 deletions graph_context_for_question.txt
@@ -0,0 +1,2 @@
The question is: `{{ question }}`
and here is the context provided with a set of relationships from a knowledge graph separated by \n---\n each represented as node1 -- relation -- node2 triplet: `{{ context }}`
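For illustration, the template can be rendered directly; the question and context values below are made up, and the context string mirrors the format produced by retrieved_edges_to_string() in graph_query_completion.py further down:

from cognee.infrastructure.llm.prompts import render_prompt

prompt = render_prompt(
    "graph_context_for_question.txt",
    {
        "question": "Where are document chunks defined?",
        "context": "DocumentChunk -- is_part_of -- Document",
    },
)
print(prompt)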
@@ -1,3 +1,6 @@
-I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
-generate a single patch file that I can apply directly to this repository using git apply.
-Please respond with a single patch file in the following format.
+You are a senior software engineer. I need you to solve this issue by looking at the provided context and
+generate a single patch file that I can apply directly to this repository using git apply.
+Additionally, please make sure that you provide code only with correct syntax and
+you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing
+functions or classes, as they may be referenced from other code.
+Please respond only with a single patch file in the following format without adding any additional context or string.
1 change: 1 addition & 0 deletions cognee/modules/chunking/models/DocumentChunk.py
@@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
chunk_index: int
cut_type: str
is_part_of: Document
pydantic_type: str = "DocumentChunk"
contains: List[Entity] = None

_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
1 change: 1 addition & 0 deletions cognee/modules/engine/models/Entity.py
@@ -7,5 +7,6 @@ class Entity(DataPoint):
name: str
is_a: EntityType
description: str
pydantic_type: str = "Entity"

_metadata: dict = {"index_fields": ["name"], "type": "Entity"}
1 change: 1 addition & 0 deletions cognee/modules/engine/models/EntityType.py
@@ -5,5 +5,6 @@ class EntityType(DataPoint):
__tablename__ = "entity_type"
name: str
description: str
pydantic_type: str = "EntityType"

_metadata: dict = {"index_fields": ["name"], "type": "EntityType"}
72 changes: 61 additions & 11 deletions cognee/modules/retrieval/description_to_codepart_search.py
@@ -8,27 +8,35 @@
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.utils import send_telemetry
from cognee.api.v1.search import SearchType
from cognee.api.v1.search.search_v2 import search
from cognee.infrastructure.llm.get_llm_client import get_llm_client


-async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list:
+async def code_description_to_code_part_search(
+    query: str, include_docs=False, user: User = None, top_k=5
+) -> list:
if user is None:
user = await get_default_user()

if user is None:
raise PermissionError("No user found in the system. Please create a user.")

-    retrieved_codeparts = await code_description_to_code_part(query, user, top_k)
+    retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs)
return retrieved_codeparts


-async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
+async def code_description_to_code_part(
+    query: str, user: User, top_k: int, include_docs: bool = False
+) -> List[str]:
"""
Maps a code description query to relevant code parts using a CodeGraph pipeline.

Args:
query (str): The search query describing the code parts.
user (User): The user performing the search.
top_k (int): Number of code-graph descriptions to match (the number of corresponding code parts returned will be higher).
include_docs (bool): Whether document-derived insights exist in the graph; if True, a summary of them is prepended to the returned context.

Returns:
Set[str]: A set of unique code parts matching the query.
@@ -55,21 +63,49 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
)

try:
results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k)
if not results:
if include_docs:
search_results = await search(SearchType.INSIGHTS, query_text=query)

concatenated_descriptions = " ".join(
obj["description"]
for tpl in search_results
for obj in tpl
if isinstance(obj, dict) and "description" in obj
)
Comment on lines +66 to +74
🛠️ Refactor suggestion

Add error handling for search results

The search results processing lacks error handling and validation. Consider adding checks for empty results and proper error handling.

         if include_docs:
             search_results = await search(SearchType.INSIGHTS, query_text=query)
+            if not search_results:
+                logging.warning("No document insights found for query: '%s'", query)
+                context_from_documents = ""
+                return context_from_documents

             concatenated_descriptions = " ".join(
                 obj["description"]
                 for tpl in search_results
                 for obj in tpl
                 if isinstance(obj, dict) and "description" in obj
             )



llm_client = get_llm_client()
context_from_documents = await llm_client.acreate_structured_output(
text_input=f"The retrieved context from documents"
f" is {concatenated_descriptions}.",
system_prompt="You are a Senior Software Engineer, summarize the context from documents"
f" in a way that it is gonna be provided next to codeparts as context"
f" while trying to solve this github issue connected to the project: {query}]",
Comment on lines +80 to +82

⚠️ Potential issue

Fix the stray bracket and improve the system prompt

There is a stray closing bracket ] at the end of the system_prompt string. Because it sits inside the string literal, it leaks into the prompt text rather than causing a Python syntax error. Additionally, consider rephrasing the prompt for clarity and grammatical correctness.

Apply this diff to remove the stray bracket:

-                    f" while trying to solve this github issue connected to the project: {query}]",
+                    f" while trying to solve this GitHub issue connected to the project: {query}",

Revised system_prompt for clarity:

system_prompt = (
    "You are a Senior Software Engineer. "
    "Summarize the context from the documents so it can be provided "
    "as context alongside code parts while trying to solve "
    f"the following GitHub issue connected to the project: {query}"
)

response_model=str,
)

code_summaries = await vector_engine.search(
"code_summary_text", query_text=query, limit=top_k
)
if not code_summaries:
logging.warning("No results found for query: '%s' by user: %s", query, user.id)
return []

memory_fragment = CogneeGraph()
await memory_fragment.project_graph_from_db(
graph_engine,
-            node_properties_to_project=["id", "type", "text", "source_code"],
+            node_properties_to_project=[
+                "id",
+                "type",
+                "text",
+                "source_code",
+                "pydantic_type",
+            ],
edge_properties_to_project=["relationship_name"],
)

code_pieces_to_return = set()

-        for node in results:
+        for node in code_summaries:
node_id = str(node.id)
node_to_search_from = memory_fragment.get_node(node_id)

@@ -78,9 +114,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
continue

        for code_file in node_to_search_from.get_skeleton_neighbours():
-            for code_file_edge in code_file.get_skeleton_edges():
-                if code_file_edge.get_attribute("relationship_name") == "contains":
-                    code_pieces_to_return.add(code_file_edge.get_destination_node())
+            if code_file.get_attribute("pydantic_type") == "SourceCodeChunk":
+                for code_file_edge in code_file.get_skeleton_edges():
+                    if code_file_edge.get_attribute("relationship_name") == "code_chunk_of":
+                        code_pieces_to_return.add(code_file_edge.get_destination_node())
+            elif code_file.get_attribute("pydantic_type") == "CodePart":
+                code_pieces_to_return.add(code_file)
+            elif code_file.get_attribute("pydantic_type") == "CodeFile":
+                for code_file_edge in code_file.get_skeleton_edges():
+                    if code_file_edge.get_attribute("relationship_name") == "contains":
+                        code_pieces_to_return.add(code_file_edge.get_destination_node())

logging.info(
"Search completed for user: %s, query: '%s'. Found %d code pieces.",
@@ -89,7 +132,14 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
len(code_pieces_to_return),
)

-        return list(code_pieces_to_return)
+        context = ""
+        for code_piece in code_pieces_to_return:
+            context = context + code_piece.get_attribute("source_code")
+
+        if include_docs:
+            context = context_from_documents + context
+
+        return context
Comment on lines +135 to +142

🛠️ Refactor suggestion

Improve code concatenation for better memory efficiency and readability

The current string concatenation approach could be inefficient for large code bases and lacks separation between different code pieces.

-        context = ""
-        for code_piece in code_pieces_to_return:
-            context = context + code_piece.get_attribute("source_code")
+        code_pieces_content = [
+            code_piece.get_attribute("source_code")
+            for code_piece in code_pieces_to_return
+        ]
+        
+        # Add separator between code pieces for better readability
+        context = "\n\n---\n\n".join(code_pieces_content)

         if include_docs:
-            context = context_from_documents + context
+            context = f"{context_from_documents}\n\n---\n\n{context}"


except Exception as exec_error:
logging.error(
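A usage sketch for the reworked retriever; note that despite the -> list annotation it now returns a single concatenated context string, and include_docs=True prepends the LLM-summarized document insights. The query text is illustrative:

import asyncio

from cognee.modules.retrieval.description_to_codepart_search import (
    code_description_to_code_part_search,
)


async def main():
    context = await code_description_to_code_part_search(
        "function that chunks source code for embedding",
        include_docs=True,
        top_k=5,
    )
    print(context)


asyncio.run(main())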
4 changes: 4 additions & 0 deletions cognee/shared/CodeGraphEntities.py
@@ -5,12 +5,14 @@
class Repository(DataPoint):
__tablename__ = "Repository"
path: str
pydantic_type: str = "Repository"
_metadata: dict = {"index_fields": [], "type": "Repository"}


class CodeFile(DataPoint):
__tablename__ = "codefile"
extracted_id: str # actually file path
pydantic_type: str = "CodeFile"
source_code: Optional[str] = None
part_of: Optional[Repository] = None
depends_on: Optional[List["CodeFile"]] = None
@@ -22,6 +24,7 @@ class CodeFile(DataPoint):
class CodePart(DataPoint):
__tablename__ = "codepart"
# part_of: Optional[CodeFile] = None
pydantic_type: str = "CodePart"
source_code: Optional[str] = None
_metadata: dict = {"index_fields": [], "type": "CodePart"}

@@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
pydantic_type: str = "SourceCodeChunk"
previous_chunk: Optional["SourceCodeChunk"] = None

_metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"}
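The discriminator exists because graph projection flattens these models into generic nodes, where isinstance checks no longer work. A minimal sketch, assuming DataPoint accepts keyword construction like a regular pydantic model:

from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk

part = CodePart(source_code="def foo(): ...")
chunk = SourceCodeChunk(code_chunk_of=part, source_code="def foo(): ...")

# After projection, only flat attributes survive, so the retriever
# branches on pydantic_type instead of the Python class.
print(chunk.pydantic_type)  # SourceCodeChunk
print(chunk.code_chunk_of.pydantic_type)  # CodePart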
4 changes: 4 additions & 0 deletions cognee/shared/data_models.py
@@ -231,6 +231,7 @@ class SummarizedContent(BaseModel):

summary: str
description: str
pydantic_type: str = "SummarizedContent"


class SummarizedFunction(BaseModel):
@@ -239,13 +240,15 @@ class SummarizedFunction(BaseModel):
inputs: Optional[List[str]] = None
outputs: Optional[List[str]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedFunction"


class SummarizedClass(BaseModel):
name: str
description: str
methods: Optional[List[SummarizedFunction]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedClass"


class SummarizedCode(BaseModel):
@@ -256,6 +259,7 @@ class SummarizedCode(BaseModel):
classes: List[SummarizedClass] = []
functions: List[SummarizedFunction] = []
workflow_description: Optional[str] = None
pydantic_type: str = "SummarizedCode"


class GraphDBType(Enum):
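Because these BaseModel schemas are passed as response_model to the LLM client, every generated summary now carries its pydantic_type tag. A hedged sketch of the call shape, mirroring the acreate_structured_output usage elsewhere in this PR (the prompt text is illustrative):

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.shared.data_models import SummarizedCode


async def summarize(source_code: str) -> SummarizedCode:
    llm_client = get_llm_client()
    return await llm_client.acreate_structured_output(
        text_input=source_code,
        system_prompt="Summarize this source code file.",
        response_model=SummarizedCode,
    )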
1 change: 1 addition & 0 deletions cognee/tasks/completion/__init__.py
@@ -1 +1,2 @@
from .query_completion import query_completion
from .graph_query_completion import graph_query_completion
46 changes: 46 additions & 0 deletions cognee/tasks/completion/graph_query_completion.py
@@ -0,0 +1,46 @@
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.tasks.completion.exceptions import NoRelevantDataFound
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search


def retrieved_edges_to_string(retrieved_edges: list) -> str:
    edge_strings = []
    for edge in retrieved_edges:
        node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name")
        node2_string = edge.node2.attributes.get("text") or edge.node2.attributes.get("name")
        edge_string = edge.attributes["relationship_type"]
        edge_str = f"{node1_string} -- {edge_string} -- {node2_string}"
        edge_strings.append(edge_str)
    return "\n---\n".join(edge_strings)


async def graph_query_completion(query: str) -> list:
    """
    Parameters:
    - query (str): The query string to compute.

    Returns:
    - list: Answer to the query.
    """
    found_triplets = await brute_force_triplet_search(query, top_k=5)

    if len(found_triplets) == 0:
        raise NoRelevantDataFound

    args = {
        "question": query,
        "context": retrieved_edges_to_string(found_triplets),
    }
    user_prompt = render_prompt("graph_context_for_question.txt", args)
    system_prompt = read_query_prompt("answer_simple_question_restricted.txt")

    llm_client = get_llm_client()
    computed_answer = await llm_client.acreate_structured_output(
        text_input=user_prompt,
        system_prompt=system_prompt,
        response_model=str,
    )

    return [computed_answer]
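To make the context format concrete, here is retrieved_edges_to_string applied to a mock edge shaped like the graph objects this module consumes (SimpleNamespace stands in for the real node and edge classes):

from types import SimpleNamespace

from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string

edge = SimpleNamespace(
    node1=SimpleNamespace(attributes={"name": "DocumentChunk"}),
    node2=SimpleNamespace(attributes={"name": "Document"}),
    attributes={"relationship_type": "is_part_of"},
)

print(retrieved_edges_to_string([edge]))
# DocumentChunk -- is_part_of -- Document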
4 changes: 2 additions & 2 deletions cognee/tasks/repo_processor/expand_dependency_graph.py
@@ -5,10 +5,10 @@
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, CodePart
from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)



def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None:
3 changes: 1 addition & 2 deletions cognee/tasks/repo_processor/extract_code_parts.py
@@ -1,9 +1,8 @@
from typing import Dict, List
import parso

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)


def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
3 changes: 1 addition & 2 deletions cognee/tasks/repo_processor/get_local_dependencies.py
@@ -9,10 +9,9 @@
import jedi
import parso
from parso.tree import BaseNode

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)


@contextmanager
2 changes: 1 addition & 1 deletion cognee/tasks/repo_processor/get_source_code_chunks.py
@@ -9,7 +9,7 @@
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk

-logger = logging.getLogger("task:get_source_code_chunks")
+logger = logging.getLogger(__name__)


def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
4 changes: 3 additions & 1 deletion cognee/tasks/repo_processor/top_down_repo_parse.py
@@ -4,7 +4,9 @@
import parso
from tqdm import tqdm

-from . import logger
+import logging
+
+logger = logging.getLogger(__name__)

_NODE_TYPE_MAP = {
"funcdef": "func_def",
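Replacing the shared "task:..." names with __name__ yields standard hierarchical logger names such as cognee.tasks.repo_processor.extract_code_parts, so one configuration line now covers the whole subtree:

import logging

logging.basicConfig(level=logging.WARNING)
# Adjust every repo_processor module logger at once.
logging.getLogger("cognee.tasks.repo_processor").setLevel(logging.DEBUG)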
1 change: 1 addition & 0 deletions cognee/tasks/summarization/models.py
@@ -17,5 +17,6 @@ class CodeSummary(DataPoint):
__tablename__ = "code_summary"
text: str
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
pydantic_type: str = "CodeSummary"

_metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"}