Skip to content

Commit

Permalink
Structured code summarization (#375)
Browse files Browse the repository at this point in the history
* feat: turn summarize_code into generator

* feat: extract run_code_graph_pipeline, update the pipeline

* feat: minimal code graph example

* refactor: update argument

* refactor: move run_code_graph_pipeline to cognify/code_graph_pipeline

* refactor: indentation and whitespace nits

* refactor: add deprecated use comments and warnings

* Structured code summarization

* add missing prompt file

* Remove summarization_model argument from summarize_code and fix typehinting

* minor refactors

---------

Co-authored-by: lxobr <[email protected]>
Co-authored-by: Vasilije <[email protected]>
Co-authored-by: Igor Ilic <[email protected]>
Co-authored-by: Boris <[email protected]>
  • Loading branch information
5 people authored Dec 17, 2024
1 parent da5e3ab commit 9afd0ec
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 26 deletions.
31 changes: 18 additions & 13 deletions cognee/api/v1/cognify/code_graph_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,27 @@
from pathlib import Path
from typing import Union

from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.shared.data_models import SummarizedContent
from cognee.shared.utils import send_telemetry
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.models import Data, Dataset
from cognee.modules.pipelines import run_tasks
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.models import PipelineRunStatus
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
from cognee.modules.pipelines.operations.get_pipeline_status import \
get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import \
log_pipeline_status
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.shared.utils import send_telemetry
from cognee.tasks.documents import (check_permissions_on_documents,
classify_documents,
extract_chunks_from_documents)
from cognee.tasks.graph import extract_graph_from_code
from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code

Expand Down Expand Up @@ -134,7 +139,7 @@ async def run_code_graph_pipeline(repo_path):
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
Task(expand_dependency_graph, task_config={"batch_size": 50}),
Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
Task(summarize_code, task_config={"batch_size": 50}),
Task(add_data_points, task_config={"batch_size": 50}),
]

Expand Down
10 changes: 10 additions & 0 deletions cognee/infrastructure/llm/prompts/summarize_code.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
You are an expert Python programmer and technical writer. Your task is to summarize the given Python code snippet or file.
The code may contain multiple imports, classes, functions, constants and logic. Provide a clear, structured explanation of its components
and their relationships.

Instructions:
Provide an overview: Start with a high-level summary of what the code does as a whole.
Break it down: Summarize each class and function individually, explaining their purpose and how they interact.
Describe the workflow: Outline how the classes and functions work together. Mention any control flow (e.g., main functions, entry points, loops).
Key features: Highlight important elements like arguments, return values, or unique logic.
Maintain clarity: Write in plain English for someone familiar with Python but unfamiliar with this code.
10 changes: 9 additions & 1 deletion cognee/modules/data/extraction/extract_summary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from typing import Type

from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt

from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.shared.data_models import SummarizedCode


async def extract_summary(content: str, response_model: Type[BaseModel]):
llm_client = get_llm_client()
Expand All @@ -11,3 +15,7 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)

return llm_output

async def extract_code_summary(content: str):
    """Summarize source code using the ``SummarizedCode`` response model.

    Thin convenience wrapper: forwards ``content`` unchanged to
    ``extract_summary`` and fixes the response model to ``SummarizedCode``.

    Args:
        content: Text handed to ``extract_summary`` as-is.

    Returns:
        Whatever ``extract_summary`` returns for that response model.
    """
    # NOTE(review): the system prompt is selected inside extract_summary;
    # confirm it is the code-summarization prompt and not a generic one.
    code_summary = await extract_summary(content, response_model=SummarizedCode)
    return code_summary
27 changes: 26 additions & 1 deletion cognee/shared/data_models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Data models for the cognitive architecture."""

from enum import Enum, auto
from typing import Optional, List, Union, Dict, Any
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel, Field


class Node(BaseModel):
"""Node in a knowledge graph."""
id: str
Expand Down Expand Up @@ -194,6 +196,29 @@ class SummarizedContent(BaseModel):
summary: str
description: str

class SummarizedFunction(BaseModel):
    # Structured summary of one function. Used as the element type of
    # SummarizedClass.methods and SummarizedCode.functions.
    #
    # NOTE(review): documented with comments instead of a docstring because
    # pydantic copies a model's docstring into its JSON schema description;
    # confirm whether that schema reaches the LLM before adding one.
    name: str
    description: str
    # The three optional lists default to None when omitted.
    # Presumably parameter and return-value descriptions; the exact string
    # format is not constrained here -- TODO confirm.
    inputs: Optional[List[str]] = None
    outputs: Optional[List[str]] = None
    decorators: Optional[List[str]] = None

class SummarizedClass(BaseModel):
    # Structured summary of one class. Used as the element type of
    # SummarizedCode.classes.
    #
    # NOTE(review): documented with comments instead of a docstring because
    # pydantic copies a model's docstring into its JSON schema description;
    # confirm whether that schema reaches the LLM before adding one.
    name: str
    description: str
    # Each method is summarized with the same model used for functions.
    # Both optional fields default to None when omitted.
    methods: Optional[List[SummarizedFunction]] = None
    decorators: Optional[List[str]] = None

class SummarizedCode(BaseModel):
    # Top-level structured summary of a piece of code. extract_code_summary
    # passes this class as the response_model to extract_summary.
    #
    # NOTE(review): documented with comments instead of a docstring because
    # pydantic copies a model's docstring into its JSON schema description;
    # confirm whether that schema reaches the LLM before adding one.

    # Required fields: validation fails if any of these three is missing.
    # NOTE(review): file_name is required, but extract_code_summary only
    # forwards the content string -- verify the file name is actually
    # available to whatever fills this model.
    file_name: str
    high_level_summary: str
    key_features: List[str]
    # Optional collections; each defaults to an empty list. pydantic copies
    # field defaults per instance, so the literal [] is not shared state.
    imports: List[str] = []
    constants: List[str] = []
    classes: List[SummarizedClass] = []
    functions: List[SummarizedFunction] = []
    # Defaults to None when omitted.
    workflow_description: Optional[str] = None


class GraphDBType(Enum):
NETWORKX = auto()
Expand Down
19 changes: 8 additions & 11 deletions cognee/tasks/summarization/summarize_code.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,36 @@
import asyncio
from typing import AsyncGenerator, Union
from uuid import uuid5
from typing import Type

from pydantic import BaseModel

from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.modules.data.extraction.extract_summary import extract_code_summary
from .models import CodeSummary


async def summarize_code(
code_graph_nodes: list[DataPoint],
summarization_model: Type[BaseModel],
) -> list[DataPoint]:
) -> AsyncGenerator[Union[DataPoint, CodeSummary], None]:
if len(code_graph_nodes) == 0:
return

code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]
code_data_points = [file for file in code_graph_nodes if hasattr(file, "source_code")]

file_summaries = await asyncio.gather(
*[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
*[extract_code_summary(file.source_code) for file in code_data_points]
)

file_summaries_map = {
code_file_data_point.extracted_id: file_summary.summary
for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
code_data_point.extracted_id: str(file_summary)
for code_data_point, file_summary in zip(code_data_points, file_summaries)
}

for node in code_graph_nodes:
if not isinstance(node, DataPoint):
continue
yield node

if not isinstance(node, CodeFile):
if not hasattr(node, "source_code"):
continue

yield CodeSummary(
Expand Down

0 comments on commit 9afd0ec

Please sign in to comment.