From d39140f28b1a54b2632150c080518de97ae371fc Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:10:29 +0100 Subject: [PATCH 1/8] feat: implements the first version of graph based completion in search --- cognee/api/v1/search/search_v2.py | 3 ++ .../answer_simple_question_restricted.txt | 1 + .../prompts/graph_context_for_question.txt | 2 + cognee/tasks/completion/__init__.py | 3 +- .../completion/graph_query_completion.py | 45 +++++++++++++++++++ examples/python/dynamic_steps_example.py | 12 ++--- 6 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt create mode 100644 cognee/infrastructure/llm/prompts/graph_context_for_question.txt create mode 100644 cognee/tasks/completion/graph_query_completion.py diff --git a/cognee/api/v1/search/search_v2.py b/cognee/api/v1/search/search_v2.py index 222ec6791..48eb9c97f 100644 --- a/cognee/api/v1/search/search_v2.py +++ b/cognee/api/v1/search/search_v2.py @@ -15,12 +15,14 @@ from cognee.tasks.graph import query_graph_connections from cognee.tasks.summarization import query_summaries from cognee.tasks.completion import query_completion +from cognee.tasks.completion import graph_query_completion class SearchType(Enum): SUMMARIES = "SUMMARIES" INSIGHTS = "INSIGHTS" CHUNKS = "CHUNKS" COMPLETION = "COMPLETION" + GRAPH_COMPLETION = "GRAPH_COMPLETION" async def search(query_type: SearchType, query_text: str, user: User = None, datasets: Union[list[str], str, None] = None) -> list: @@ -58,6 +60,7 @@ async def specific_search(query_type: SearchType, query: str, user) -> list: SearchType.INSIGHTS: query_graph_connections, SearchType.CHUNKS: query_chunks, SearchType.COMPLETION: query_completion, + SearchType.GRAPH_COMPLETION: graph_query_completion } search_task = search_tasks.get(query_type) diff --git a/cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt 
b/cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt new file mode 100644 index 000000000..fe37a2668 --- /dev/null +++ b/cognee/infrastructure/llm/prompts/answer_simple_question_restricted.txt @@ -0,0 +1 @@ +Answer the question using the provided context. If the provided context is not connected to the question, just answer "The provided knowledge base does not contain the answer to the question". Be as brief as possible. \ No newline at end of file diff --git a/cognee/infrastructure/llm/prompts/graph_context_for_question.txt b/cognee/infrastructure/llm/prompts/graph_context_for_question.txt new file mode 100644 index 000000000..dae7138ac --- /dev/null +++ b/cognee/infrastructure/llm/prompts/graph_context_for_question.txt @@ -0,0 +1,2 @@ +The question is: `{{ question }}` +and here is the context provided with a set of relationships from a knowledge graph separated by \n---\n each represented as node1 -- relation -- node2 triplet: `{{ context }}` \ No newline at end of file diff --git a/cognee/tasks/completion/__init__.py b/cognee/tasks/completion/__init__.py index 1bf0fa6bb..393733341 100644 --- a/cognee/tasks/completion/__init__.py +++ b/cognee/tasks/completion/__init__.py @@ -1 +1,2 @@ -from .query_completion import query_completion \ No newline at end of file +from .query_completion import query_completion +from .graph_query_completion import graph_query_completion \ No newline at end of file diff --git a/cognee/tasks/completion/graph_query_completion.py b/cognee/tasks/completion/graph_query_completion.py new file mode 100644 index 000000000..ffa11e9dd --- /dev/null +++ b/cognee/tasks/completion/graph_query_completion.py @@ -0,0 +1,45 @@ +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.tasks.completion.exceptions import NoRelevantDataFound +from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt +from 
cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search + + +def retrieved_edges_to_string(retrieved_edges): + edge_strings = [] + for edge in retrieved_edges: + node1_string = edge.node1.attributes['text'] or edge.node1.attributes.get('name') + node2_string = edge.node2.attributes['text'] or edge.node2.attributes.get('name') + edge_string = edge.attributes['relationship_type'] + edge_str = f"{node1_string} -- {edge_string} -- {node2_string}" + edge_strings.append(edge_str) + return "\n---\n".join(edge_strings) + +async def graph_query_completion(query: str) -> list: + """ + Parameters: + - query (str): The query string to compute. + + Returns: + - list: Answer to the query. + """ + found_triplets = await brute_force_triplet_search(query, top_k=5) + + if len(found_triplets) == 0: + raise NoRelevantDataFound + + args = { + "question": query, + "context": retrieved_edges_to_string(found_triplets), + } + user_prompt = render_prompt("graph_context_for_question.txt", args) + system_prompt = read_query_prompt("answer_simple_question_restricted.txt") + + llm_client = get_llm_client() + computed_answer = await llm_client.acreate_structured_output( + text_input=user_prompt, + system_prompt=system_prompt, + response_model=str, + ) + + return [computed_answer] diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 7c0af8f0c..1eeff8259 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -1,8 +1,8 @@ import cognee import asyncio import logging -from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search -from cognee.modules.retrieval.brute_force_triplet_search import format_triplets + +from cognee.api.v1.search import SearchType from cognee.shared.utils import setup_logging job_1 = """ @@ -184,11 +184,13 @@ async def main(enable_steps): # Step 4: Query insights if enable_steps.get("retriever"): - results = await 
brute_force_triplet_search('Who has the most experience with graphic design?') - print(format_triplets(results)) + search_results = await cognee.search( + SearchType.GRAPH_COMPLETION, query_text='Who has experience in design tools?' + ) + print(search_results) if __name__ == '__main__': - setup_logging(logging.ERROR) + setup_logging(logging.INFO) rebuild_kg = True retrieve = True From 341f30fcdc1d8fe40e3db6a8a2c09d151c07d73c Mon Sep 17 00:00:00 2001 From: hajdul88 <52442977+hajdul88@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:00:49 +0100 Subject: [PATCH 2/8] fix: Fixes ruff formatting --- cognee/api/v1/search/search_v2.py | 2 +- cognee/tasks/completion/graph_query_completion.py | 7 ++++--- examples/python/dynamic_steps_example.py | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cognee/api/v1/search/search_v2.py b/cognee/api/v1/search/search_v2.py index 862b6ede0..4166fd3f3 100644 --- a/cognee/api/v1/search/search_v2.py +++ b/cognee/api/v1/search/search_v2.py @@ -67,7 +67,7 @@ async def specific_search(query_type: SearchType, query: str, user) -> list: SearchType.INSIGHTS: query_graph_connections, SearchType.CHUNKS: query_chunks, SearchType.COMPLETION: query_completion, - SearchType.GRAPH_COMPLETION: graph_query_completion + SearchType.GRAPH_COMPLETION: graph_query_completion, } search_task = search_tasks.get(query_type) diff --git a/cognee/tasks/completion/graph_query_completion.py b/cognee/tasks/completion/graph_query_completion.py index ffa11e9dd..d579aea4d 100644 --- a/cognee/tasks/completion/graph_query_completion.py +++ b/cognee/tasks/completion/graph_query_completion.py @@ -8,13 +8,14 @@ def retrieved_edges_to_string(retrieved_edges): edge_strings = [] for edge in retrieved_edges: - node1_string = edge.node1.attributes['text'] or edge.node1.attributes.get('name') - node2_string = edge.node2.attributes['text'] or edge.node2.attributes.get('name') - edge_string = edge.attributes['relationship_type'] + node1_string = 
edge.node1.attributes["text"] or edge.node1.attributes.get("name") + node2_string = edge.node2.attributes["text"] or edge.node2.attributes.get("name") + edge_string = edge.attributes["relationship_type"] edge_str = f"{node1_string} -- {edge_string} -- {node2_string}" edge_strings.append(edge_str) return "\n---\n".join(edge_strings) + async def graph_query_completion(query: str) -> list: """ Parameters: diff --git a/examples/python/dynamic_steps_example.py b/examples/python/dynamic_steps_example.py index 04db3d7f2..11596a5e2 100644 --- a/examples/python/dynamic_steps_example.py +++ b/examples/python/dynamic_steps_example.py @@ -186,11 +186,12 @@ async def main(enable_steps): # Step 4: Query insights if enable_steps.get("retriever"): search_results = await cognee.search( - SearchType.GRAPH_COMPLETION, query_text='Who has experience in design tools?' + SearchType.GRAPH_COMPLETION, query_text="Who has experience in design tools?" ) print(search_results) -if __name__ == '__main__': + +if __name__ == "__main__": setup_logging(logging.INFO) rebuild_kg = True From 1989296b019d7b79f0be44756fb4404483194501 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 12:17:42 +0100 Subject: [PATCH 3/8] fix: Resolve profiler issue with partial and recursive logger imports Resolve issue for profiler with partial and recursive logger imports --- cognee/tasks/repo_processor/__init__.py | 2 +- cognee/tasks/repo_processor/expand_dependency_graph.py | 4 +++- cognee/tasks/repo_processor/extract_code_parts.py | 4 +++- cognee/tasks/repo_processor/get_local_dependencies.py | 4 +++- cognee/tasks/repo_processor/get_source_code_chunks.py | 2 +- cognee/tasks/repo_processor/top_down_repo_parse.py | 4 +++- 6 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 6dc032547..b20351685 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,4 @@ 
from .get_repo_file_dependencies import get_repo_file_dependencies import logging -logger = logging.getLogger("task:repo_processor") +logger = logging.getLogger(__name__) diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index de26fe8d4..cc957742b 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -5,7 +5,9 @@ from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger(__name__) def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py index 76cfef538..f25146232 100644 --- a/cognee/tasks/repo_processor/extract_code_parts.py +++ b/cognee/tasks/repo_processor/extract_code_parts.py @@ -1,7 +1,9 @@ from typing import Dict, List import parso -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger(__name__) def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]: diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index b443829c9..b0ac2829f 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -10,7 +10,9 @@ import parso from parso.tree import BaseNode -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger(__name__) @contextmanager diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 5e14e11ac..980a86539 100644 --- 
a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -8,7 +8,7 @@ from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk -logger = logging.getLogger("task:get_source_code_chunks") +logger = logging.getLogger(__name__) def _count_tokens(tokenizer: tiktoken.Encoding, source_code: str) -> int: diff --git a/cognee/tasks/repo_processor/top_down_repo_parse.py b/cognee/tasks/repo_processor/top_down_repo_parse.py index aed971920..87b7b8c95 100644 --- a/cognee/tasks/repo_processor/top_down_repo_parse.py +++ b/cognee/tasks/repo_processor/top_down_repo_parse.py @@ -4,7 +4,9 @@ import parso from tqdm import tqdm -from . import logger +import logging + +logger = logging.getLogger(__name__) _NODE_TYPE_MAP = { "funcdef": "func_def", From b733590724d6a4ec2df1cac38e04d3a440cec508 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 12:26:14 +0100 Subject: [PATCH 4/8] fix: Remove logger from __init__.py file --- cognee/tasks/repo_processor/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index b20351685..8f0df23d8 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -2,6 +2,3 @@ from .expand_dependency_graph import expand_dependency_graph from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies -import logging - -logger = logging.getLogger(__name__) From ccf758ed7b8a8be68949503900509d9a422c26fc Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 12:32:30 +0100 Subject: [PATCH 5/8] test: Test profiling on HEAD branch --- .github/workflows/profiling.yaml | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/profiling.yaml 
b/.github/workflows/profiling.yaml index 0ecbc960c..de4bb179d 100644 --- a/.github/workflows/profiling.yaml +++ b/.github/workflows/profiling.yaml @@ -68,33 +68,33 @@ jobs: echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV # Run profiler on the base branch - - name: Run profiler on base branch +# - name: Run profiler on base branch +# env: +# BASE_SHA: ${{ env.BASE_SHA }} +# run: | +# echo "Profiling the base branch for code_graph_pipeline.py" +# echo "Checking out base SHA: $BASE_SHA" +# git checkout $BASE_SHA +# echo "This is the working directory: $PWD" +# # Ensure the script is executable +# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py +# # Run Scalene +# poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py + + # Run profiler on head branch + - name: Run profiler on head branch env: - BASE_SHA: ${{ env.BASE_SHA }} + HEAD_SHA: ${{ env.HEAD_SHA }} run: | - echo "Profiling the base branch for code_graph_pipeline.py" - echo "Checking out base SHA: $BASE_SHA" - git checkout $BASE_SHA + echo "Profiling the head branch for code_graph_pipeline.py" + echo "Checking out head SHA: $HEAD_SHA" + git checkout $HEAD_SHA echo "This is the working directory: $PWD" # Ensure the script is executable chmod +x cognee/api/v1/cognify/code_graph_pipeline.py # Run Scalene - poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py + poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py - # Run profiler on head branch -# - name: Run profiler on head branch -# env: -# HEAD_SHA: ${{ env.HEAD_SHA }} -# run: | -# echo "Profiling the head branch for code_graph_pipeline.py" -# echo "Checking out head SHA: $HEAD_SHA" -# git checkout $HEAD_SHA -# echo "This is the working directory: $PWD" -# # Ensure the script is executable -# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py -# # Run Scalene -# poetry run 
pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py -# # # Compare profiling results # - name: Compare profiling results # run: | From 2ae66c2c2e5f0bb08a1a1e98706d6dc850a54d11 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 12:38:42 +0100 Subject: [PATCH 6/8] test: Return profiler to base branch --- .github/workflows/profiling.yaml | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/profiling.yaml b/.github/workflows/profiling.yaml index de4bb179d..2408a8f70 100644 --- a/.github/workflows/profiling.yaml +++ b/.github/workflows/profiling.yaml @@ -68,32 +68,32 @@ jobs: echo "HEAD_SHA=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV # Run profiler on the base branch -# - name: Run profiler on base branch -# env: -# BASE_SHA: ${{ env.BASE_SHA }} -# run: | -# echo "Profiling the base branch for code_graph_pipeline.py" -# echo "Checking out base SHA: $BASE_SHA" -# git checkout $BASE_SHA -# echo "This is the working directory: $PWD" -# # Ensure the script is executable -# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py -# # Run Scalene -# poetry run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py - - # Run profiler on head branch - - name: Run profiler on head branch + - name: Run profiler on base branch env: - HEAD_SHA: ${{ env.HEAD_SHA }} + BASE_SHA: ${{ env.BASE_SHA }} run: | - echo "Profiling the head branch for code_graph_pipeline.py" - echo "Checking out head SHA: $HEAD_SHA" - git checkout $HEAD_SHA + echo "Profiling the base branch for code_graph_pipeline.py" + echo "Checking out base SHA: $BASE_SHA" + git checkout $BASE_SHA echo "This is the working directory: $PWD" # Ensure the script is executable chmod +x cognee/api/v1/cognify/code_graph_pipeline.py # Run Scalene - poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py + poetry 
run pyinstrument --renderer json -o base_results.json cognee/api/v1/cognify/code_graph_pipeline.py + + # Run profiler on head branch +# - name: Run profiler on head branch +# env: +# HEAD_SHA: ${{ env.HEAD_SHA }} +# run: | +# echo "Profiling the head branch for code_graph_pipeline.py" +# echo "Checking out head SHA: $HEAD_SHA" +# git checkout $HEAD_SHA +# echo "This is the working directory: $PWD" +# # Ensure the script is executable +# chmod +x cognee/api/v1/cognify/code_graph_pipeline.py +# # Run Scalene +# poetry run pyinstrument --renderer json -o head_results.json cognee/api/v1/cognify/code_graph_pipeline.py # # Compare profiling results # - name: Compare profiling results From 6b6cc0f1d45863328a7b2b150f4234910f622df0 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 16:06:26 +0100 Subject: [PATCH 7/8] fix: Add fix for accessing dictionary elements that don't exist Using get for the text key instead of direct access to handle situation if the text key doesn't exist --- cognee/tasks/completion/graph_query_completion.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cognee/tasks/completion/graph_query_completion.py b/cognee/tasks/completion/graph_query_completion.py index d579aea4d..b130d4f7b 100644 --- a/cognee/tasks/completion/graph_query_completion.py +++ b/cognee/tasks/completion/graph_query_completion.py @@ -5,11 +5,11 @@ from cognee.modules.retrieval.brute_force_triplet_search import brute_force_triplet_search -def retrieved_edges_to_string(retrieved_edges): +def retrieved_edges_to_string(retrieved_edges: list) -> str: edge_strings = [] for edge in retrieved_edges: - node1_string = edge.node1.attributes["text"] or edge.node1.attributes.get("name") - node2_string = edge.node2.attributes["text"] or edge.node2.attributes.get("name") + node1_string = edge.node1.attributes.get("text") or edge.node1.attributes.get("name") + node2_string = edge.node2.attributes.get("text") or edge.node2.attributes.get("name") edge_string = 
edge.attributes["relationship_type"] edge_str = f"{node1_string} -- {edge_string} -- {node2_string}" edge_strings.append(edge_str) From 6b57bfc4cb50c550a3ff650ee0d5068212399655 Mon Sep 17 00:00:00 2001 From: Igor Ilic Date: Thu, 9 Jan 2025 16:41:18 +0100 Subject: [PATCH 8/8] feat: Add ability to change graph database configuration through cognee --- cognee/api/v1/config/config.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cognee/api/v1/config/config.py b/cognee/api/v1/config/config.py index da58cf581..2f7b406a8 100644 --- a/cognee/api/v1/config/config.py +++ b/cognee/api/v1/config/config.py @@ -131,6 +131,18 @@ def set_relational_db_config(config_dict: dict): message=f"'{key}' is not a valid attribute of the config." ) + @staticmethod + def set_graph_db_config(config_dict: dict) -> None: + """ + Updates the graph db config with values from config_dict. + """ + graph_db_config = get_graph_config() + for key, value in config_dict.items(): + if hasattr(graph_db_config, key): + object.__setattr__(graph_db_config, key, value) + else: + raise AttributeError(message=f"'{key}' is not a valid attribute of the config.") + @staticmethod def set_vector_db_config(config_dict: dict): """