Feature/cog 971 preparing swe bench run #424

Merged · 8 commits · Jan 10, 2025
2 changes: 1 addition & 1 deletion .github/workflows/profiling.yaml
@@ -94,7 +94,7 @@ jobs:
# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py
# # Run Scalene
# poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py
#

# # Compare profiling results
# - name: Compare profiling results
# run: |
12 changes: 12 additions & 0 deletions cognee/api/v1/config/config.py
@@ -131,6 +131,18 @@ def set_relational_db_config(config_dict: dict):
message=f"'{key}' is not a valid attribute of the config."
)

@staticmethod
def set_graph_db_config(config_dict: dict) -> None:
"""
Updates the graph db config with values from config_dict.
"""
graph_db_config = get_graph_config()
for key, value in config_dict.items():
if hasattr(graph_db_config, key):
object.__setattr__(graph_db_config, key, value)
else:
raise AttributeError(f"'{key}' is not a valid attribute of the config.")

@staticmethod
def set_vector_db_config(config_dict: dict):
"""
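The new setter mirrors the existing relational and vector variants. A minimal usage sketch, assuming the class is exposed as cognee.config like the other setters; the keys shown are illustrative and must match attributes that get_graph_config() actually defines, otherwise AttributeError is raised:

import cognee

# Hypothetical keys -- unknown keys raise AttributeError by design.
cognee.config.set_graph_db_config(
    {
        "graph_database_provider": "neo4j",
        "graph_database_url": "bolt://localhost:7687",
    }
)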
3 changes: 3 additions & 0 deletions cognee/api/v1/search/search_v2.py
@@ -15,13 +15,15 @@
from cognee.tasks.graph import query_graph_connections
from cognee.tasks.summarization import query_summaries
from cognee.tasks.completion import query_completion
from cognee.tasks.completion import graph_query_completion


class SearchType(Enum):
SUMMARIES = "SUMMARIES"
INSIGHTS = "INSIGHTS"
CHUNKS = "CHUNKS"
COMPLETION = "COMPLETION"
GRAPH_COMPLETION = "GRAPH_COMPLETION"


async def search(
@@ -65,6 +67,7 @@ async def specific_search(query_type: SearchType, query: str, user) -> list:
SearchType.INSIGHTS: query_graph_connections,
SearchType.CHUNKS: query_chunks,
SearchType.COMPLETION: query_completion,
SearchType.GRAPH_COMPLETION: graph_query_completion,
}

search_task = search_tasks.get(query_type)
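A hedged sketch of invoking the new search type; the query_text keyword follows the call shape used in description_to_codepart_search.py below, and user/dataset plumbing is left at its defaults:

import asyncio

from cognee.api.v1.search import SearchType
from cognee.api.v1.search.search_v2 import search


async def main():
    # GRAPH_COMPLETION retrieves graph triplets and answers strictly from them.
    answers = await search(SearchType.GRAPH_COMPLETION, query_text="How are chunks linked to documents?")
    print(answers[0])


asyncio.run(main())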
1 change: 1 addition & 0 deletions answer_simple_question_restricted.txt
@@ -0,0 +1 @@
Answer the question using the provided context. If the provided context is not connected to the question, just answer "The provided knowledge base does not contain the answer to the question". Be as brief as possible.
2 changes: 2 additions & 0 deletions graph_context_for_question.txt
@@ -0,0 +1,2 @@
The question is: `{{ question }}`
and here is the context provided with a set of relationships from a knowledge graph separated by \n---\n each represented as node1 -- relation -- node2 triplet: `{{ context }}`
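For illustration, the template can be rendered directly; the question and context values below are made up, and the context string mirrors the format produced by retrieved_edges_to_string() in graph_query_completion.py further down:

from cognee.infrastructure.llm.prompts import render_prompt

prompt = render_prompt(
    "graph_context_for_question.txt",
    {
        "question": "Where are document chunks defined?",
        "context": "DocumentChunk -- is_part_of -- Document",
    },
)
print(prompt)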
@@ -1,3 +1,6 @@
-I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
-generate a single patch file that I can apply directly to this repository using git apply.
-Please respond with a single patch file in the following format.
+You are a senior software engineer. I need you to solve this issue by looking at the provided context and
+generate a single patch file that I can apply directly to this repository using git apply.
+Additionally, please make sure that you provide code only with correct syntax and
+you apply the patch on the relevant files (together with their path that you can try to find out from the github issue). Don't change the names of existing
+functions or classes, as they may be referenced from other code.
+Please respond only with a single patch file in the following format without adding any additional context or string.
1 change: 1 addition & 0 deletions cognee/modules/chunking/models/DocumentChunk.py
@@ -12,6 +12,7 @@ class DocumentChunk(DataPoint):
chunk_index: int
cut_type: str
is_part_of: Document
pydantic_type: str = "DocumentChunk"
contains: List[Entity] = None

_metadata: dict = {"index_fields": ["text"], "type": "DocumentChunk"}
1 change: 1 addition & 0 deletions cognee/modules/engine/models/Entity.py
@@ -7,5 +7,6 @@ class Entity(DataPoint):
name: str
is_a: EntityType
description: str
pydantic_type: str = "Entity"

_metadata: dict = {"index_fields": ["name"], "type": "Entity"}
1 change: 1 addition & 0 deletions cognee/modules/engine/models/EntityType.py
@@ -5,5 +5,6 @@ class EntityType(DataPoint):
__tablename__ = "entity_type"
name: str
description: str
pydantic_type: str = "EntityType"

_metadata: dict = {"index_fields": ["name"], "type": "EntityType"}
72 changes: 61 additions & 11 deletions cognee/modules/retrieval/description_to_codepart_search.py
@@ -8,27 +8,35 @@
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.utils import send_telemetry
from cognee.api.v1.search import SearchType
from cognee.api.v1.search.search_v2 import search
from cognee.infrastructure.llm.get_llm_client import get_llm_client


-async def code_description_to_code_part_search(query: str, user: User = None, top_k=2) -> list:
+async def code_description_to_code_part_search(
+    query: str, include_docs=False, user: User = None, top_k=5
+) -> list:
if user is None:
user = await get_default_user()

if user is None:
raise PermissionError("No user found in the system. Please create a user.")

-    retrieved_codeparts = await code_description_to_code_part(query, user, top_k)
+    retrieved_codeparts = await code_description_to_code_part(query, user, top_k, include_docs)
return retrieved_codeparts


-async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
+async def code_description_to_code_part(
+    query: str, user: User, top_k: int, include_docs: bool = False
+) -> List[str]:
"""
Maps a code description query to relevant code parts using a CodeGraph pipeline.

Args:
query (str): The search query describing the code parts.
user (User): The user performing the search.
top_k (int): Number of code-graph descriptions to match (the number of corresponding code parts returned will be higher).
include_docs (bool): Whether document-derived insights exist in the graph; if True, a summary of them is prepended to the returned context.

Returns:
Set[str]: A set of unique code parts matching the query.
@@ -55,21 +63,49 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
)

try:
results = await vector_engine.search("code_summary_text", query_text=query, limit=top_k)
if not results:
if include_docs:
search_results = await search(SearchType.INSIGHTS, query_text=query)

concatenated_descriptions = " ".join(
obj["description"]
for tpl in search_results
for obj in tpl
if isinstance(obj, dict) and "description" in obj
)
Comment on lines +66 to +74
🛠️ Refactor suggestion

Add error handling for search results

The search results processing lacks error handling and validation. Consider adding checks for empty results and proper error handling.

         if include_docs:
             search_results = await search(SearchType.INSIGHTS, query_text=query)
+            if not search_results:
+                logging.warning("No document insights found for query: '%s'", query)
+                context_from_documents = ""
+                return context_from_documents

             concatenated_descriptions = " ".join(
                 obj["description"]
                 for tpl in search_results
                 for obj in tpl
                 if isinstance(obj, dict) and "description" in obj
             )



llm_client = get_llm_client()
context_from_documents = await llm_client.acreate_structured_output(
text_input=f"The retrieved context from documents"
f" is {concatenated_descriptions}.",
system_prompt="You are a Senior Software Engineer, summarize the context from documents"
f" in a way that it is gonna be provided next to codeparts as context"
f" while trying to solve this github issue connected to the project: {query}]",
Comment on lines +80 to +82

⚠️ Potential issue

Fix the stray bracket and improve the system prompt

There is a stray closing bracket ] at the end of the system_prompt string. Because it sits inside the string literal, it leaks into the prompt text rather than causing a Python syntax error. Additionally, consider rephrasing the prompt for clarity and grammatical correctness.

Apply this diff to remove the stray bracket:

-                    f" while trying to solve this github issue connected to the project: {query}]",
+                    f" while trying to solve this GitHub issue connected to the project: {query}",

Revised system_prompt for clarity:

system_prompt = (
    "You are a Senior Software Engineer. "
    "Summarize the context from the documents so it can be provided "
    "as context alongside code parts while trying to solve "
    f"the following GitHub issue connected to the project: {query}"
)

response_model=str,
)

code_summaries = await vector_engine.search(
"code_summary_text", query_text=query, limit=top_k
)
if not code_summaries:
logging.warning("No results found for query: '%s' by user: %s", query, user.id)
return []

memory_fragment = CogneeGraph()
await memory_fragment.project_graph_from_db(
graph_engine,
-            node_properties_to_project=["id", "type", "text", "source_code"],
+            node_properties_to_project=[
+                "id",
+                "type",
+                "text",
+                "source_code",
+                "pydantic_type",
+            ],
edge_properties_to_project=["relationship_name"],
)

code_pieces_to_return = set()

-        for node in results:
+        for node in code_summaries:
node_id = str(node.id)
node_to_search_from = memory_fragment.get_node(node_id)

@@ -78,9 +114,16 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
continue

        for code_file in node_to_search_from.get_skeleton_neighbours():
-            for code_file_edge in code_file.get_skeleton_edges():
-                if code_file_edge.get_attribute("relationship_name") == "contains":
-                    code_pieces_to_return.add(code_file_edge.get_destination_node())
+            if code_file.get_attribute("pydantic_type") == "SourceCodeChunk":
+                for code_file_edge in code_file.get_skeleton_edges():
+                    if code_file_edge.get_attribute("relationship_name") == "code_chunk_of":
+                        code_pieces_to_return.add(code_file_edge.get_destination_node())
+            elif code_file.get_attribute("pydantic_type") == "CodePart":
+                code_pieces_to_return.add(code_file)
+            elif code_file.get_attribute("pydantic_type") == "CodeFile":
+                for code_file_edge in code_file.get_skeleton_edges():
+                    if code_file_edge.get_attribute("relationship_name") == "contains":
+                        code_pieces_to_return.add(code_file_edge.get_destination_node())

logging.info(
"Search completed for user: %s, query: '%s'. Found %d code pieces.",
@@ -89,7 +132,14 @@ async def code_description_to_code_part(query: str, user: User, top_k: int) -> List[str]:
len(code_pieces_to_return),
)

-        return list(code_pieces_to_return)
+        context = ""
+        for code_piece in code_pieces_to_return:
+            context = context + code_piece.get_attribute("source_code")
+
+        if include_docs:
+            context = context_from_documents + context
+
+        return context
Comment on lines +135 to +142

🛠️ Refactor suggestion

Improve code concatenation for better memory efficiency and readability

The current string concatenation approach could be inefficient for large code bases and lacks separation between different code pieces.

-        context = ""
-        for code_piece in code_pieces_to_return:
-            context = context + code_piece.get_attribute("source_code")
+        code_pieces_content = [
+            code_piece.get_attribute("source_code")
+            for code_piece in code_pieces_to_return
+        ]
+        
+        # Add separator between code pieces for better readability
+        context = "\n\n---\n\n".join(code_pieces_content)

         if include_docs:
-            context = context_from_documents + context
+            context = f"{context_from_documents}\n\n---\n\n{context}"


except Exception as exec_error:
logging.error(
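A usage sketch for the reworked retriever; note that despite the -> list annotation it now returns a single concatenated context string, and include_docs=True prepends the LLM-summarized document insights. The query text is illustrative:

import asyncio

from cognee.modules.retrieval.description_to_codepart_search import (
    code_description_to_code_part_search,
)


async def main():
    context = await code_description_to_code_part_search(
        "function that chunks source code for embedding",
        include_docs=True,
        top_k=5,
    )
    print(context)


asyncio.run(main())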
4 changes: 4 additions & 0 deletions cognee/shared/CodeGraphEntities.py
@@ -5,12 +5,14 @@
class Repository(DataPoint):
__tablename__ = "Repository"
path: str
pydantic_type: str = "Repository"
_metadata: dict = {"index_fields": [], "type": "Repository"}


class CodeFile(DataPoint):
__tablename__ = "codefile"
extracted_id: str # actually file path
pydantic_type: str = "CodeFile"
source_code: Optional[str] = None
part_of: Optional[Repository] = None
depends_on: Optional[List["CodeFile"]] = None
@@ -22,6 +24,7 @@ class CodeFile(DataPoint):
class CodePart(DataPoint):
__tablename__ = "codepart"
# part_of: Optional[CodeFile] = None
pydantic_type: str = "CodePart"
source_code: Optional[str] = None
_metadata: dict = {"index_fields": [], "type": "CodePart"}

@@ -30,6 +33,7 @@ class SourceCodeChunk(DataPoint):
__tablename__ = "sourcecodechunk"
code_chunk_of: Optional[CodePart] = None
source_code: Optional[str] = None
pydantic_type: str = "SourceCodeChunk"
previous_chunk: Optional["SourceCodeChunk"] = None

_metadata: dict = {"index_fields": ["source_code"], "type": "SourceCodeChunk"}
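The discriminator exists because graph projection flattens these models into generic nodes, where isinstance checks no longer work. A minimal sketch, assuming DataPoint accepts keyword construction like a regular pydantic model:

from cognee.shared.CodeGraphEntities import CodePart, SourceCodeChunk

part = CodePart(source_code="def foo(): ...")
chunk = SourceCodeChunk(code_chunk_of=part, source_code="def foo(): ...")

# After projection, only flat attributes survive, so the retriever
# branches on pydantic_type instead of the Python class.
print(chunk.pydantic_type)  # SourceCodeChunk
print(chunk.code_chunk_of.pydantic_type)  # CodePart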
4 changes: 4 additions & 0 deletions cognee/shared/data_models.py
@@ -231,6 +231,7 @@ class SummarizedContent(BaseModel):

summary: str
description: str
pydantic_type: str = "SummarizedContent"


class SummarizedFunction(BaseModel):
@@ -239,13 +240,15 @@ class SummarizedFunction(BaseModel):
inputs: Optional[List[str]] = None
outputs: Optional[List[str]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedFunction"


class SummarizedClass(BaseModel):
name: str
description: str
methods: Optional[List[SummarizedFunction]] = None
decorators: Optional[List[str]] = None
pydantic_type: str = "SummarizedClass"


class SummarizedCode(BaseModel):
@@ -256,6 +259,7 @@ class SummarizedCode(BaseModel):
classes: List[SummarizedClass] = []
functions: List[SummarizedFunction] = []
workflow_description: Optional[str] = None
pydantic_type: str = "SummarizedCode"


class GraphDBType(Enum):
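Because these BaseModel schemas are passed as response_model to the LLM client, every generated summary now carries its pydantic_type tag. A hedged sketch of the call shape, mirroring the acreate_structured_output usage elsewhere in this PR (the prompt text is illustrative):

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.shared.data_models import SummarizedCode


async def summarize(source_code: str) -> SummarizedCode:
    llm_client = get_llm_client()
    return await llm_client.acreate_structured_output(
        text_input=source_code,
        system_prompt="Summarize this source code file.",
        response_model=SummarizedCode,
    )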
1 change: 1 addition & 0 deletions cognee/tasks/completion/__init__.py
@@ -1 +1,2 @@
from .query_completion import query_completion
from .graph_query_completion import graph_query_completion
46 changes: 46 additions & 0 deletions cognee/tasks/completion/graph_query_completion.py
@@ -0,0 +1,46 @@
from cognee.infrastructure.databases.vector import get_vector_engine
from cognee.tasks.completion.exceptions import NoRelevantDataFound
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search


def retrieved_edges_to_string(retrieved_edges: list) -> str:
    edge_strings = []
    for edge in retrieved_edges:
        node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name")
        node2_string = edge.node2.attributes.get("text") or edge.node2.attributes.get("name")
        edge_string = edge.attributes["relationship_type"]
        edge_str = f"{node1_string} -- {edge_string} -- {node2_string}"
        edge_strings.append(edge_str)
    return "\n---\n".join(edge_strings)


async def graph_query_completion(query: str) -> list:
    """
    Parameters:
    - query (str): The query string to compute.

    Returns:
    - list: Answer to the query.
    """
    found_triplets = await brute_force_triplet_search(query, top_k=5)

    if len(found_triplets) == 0:
        raise NoRelevantDataFound

    args = {
        "question": query,
        "context": retrieved_edges_to_string(found_triplets),
    }
    user_prompt = render_prompt("graph_context_for_question.txt", args)
    system_prompt = read_query_prompt("answer_simple_question_restricted.txt")

    llm_client = get_llm_client()
    computed_answer = await llm_client.acreate_structured_output(
        text_input=user_prompt,
        system_prompt=system_prompt,
        response_model=str,
    )

    return [computed_answer]
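To make the context format concrete, here is retrieved_edges_to_string applied to a mock edge shaped like the graph objects this module consumes (SimpleNamespace stands in for the real node and edge classes):

from types import SimpleNamespace

from cognee.tasks.completion.graph_query_completion import retrieved_edges_to_string

edge = SimpleNamespace(
    node1=SimpleNamespace(attributes={"name": "DocumentChunk"}),
    node2=SimpleNamespace(attributes={"name": "Document"}),
    attributes={"relationship_type": "is_part_of"},
)

print(retrieved_edges_to_string([edge]))
# DocumentChunk -- is_part_of -- Document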
4 changes: 2 additions & 2 deletions cognee/tasks/repo_processor/expand_dependency_graph.py
@@ -5,10 +5,10 @@
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, CodePart
from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)



def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None:
3 changes: 1 addition & 2 deletions cognee/tasks/repo_processor/extract_code_parts.py
@@ -1,9 +1,8 @@
from typing import Dict, List
import parso

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)


def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
3 changes: 1 addition & 2 deletions cognee/tasks/repo_processor/get_local_dependencies.py
@@ -9,10 +9,9 @@
import jedi
import parso
from parso.tree import BaseNode

import logging

-logger = logging.getLogger("task:repo_processor")
+logger = logging.getLogger(__name__)


@contextmanager
2 changes: 1 addition & 1 deletion cognee/tasks/repo_processor/get_source_code_chunks.py
@@ -9,7 +9,7 @@
from cognee.infrastructure.engine import DataPoint
from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk

-logger = logging.getLogger("task:get_source_code_chunks")
+logger = logging.getLogger(__name__)


def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int:
4 changes: 3 additions & 1 deletion cognee/tasks/repo_processor/top_down_repo_parse.py
@@ -4,7 +4,9 @@
import parso
from tqdm import tqdm

-from . import logger
+import logging
+
+logger = logging.getLogger(__name__)

_NODE_TYPE_MAP = {
"funcdef": "func_def",
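Replacing the shared "task:..." names with __name__ yields standard hierarchical logger names such as cognee.tasks.repo_processor.extract_code_parts, so one configuration line now covers the whole subtree:

import logging

logging.basicConfig(level=logging.WARNING)
# Adjust every repo_processor module logger at once.
logging.getLogger("cognee.tasks.repo_processor").setLevel(logging.DEBUG)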
1 change: 1 addition & 0 deletions cognee/tasks/summarization/models.py
@@ -17,5 +17,6 @@ class CodeSummary(DataPoint):
__tablename__ = "code_summary"
text: str
summarizes: Union[CodeFile, CodePart, SourceCodeChunk]
pydantic_type: str = "CodeSummary"

_metadata: dict = {"index_fields": ["text"], "type": "CodeSummary"}