Structured code summarization (#375)

* feat: turn summarize_code into generator

* feat: extract run_code_graph_pipeline, update the pipeline

* feat: minimal code graph example

* refactor: update argument

* refactor: move run_code_graph_pipeline to cognify/code_graph_pipeline

* refactor: indentation and whitespace nits

* refactor: add deprecated use comments and warnings

* Structured code summarization

* add missing prompt file

* Remove summarization_model argument from summarize_code and fix type hinting

* minor refactors

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
Commit 9afd0ece63 (parent da5e3ab24d)
Authored by alekszievr on 2024-12-17 13:05:47 +01:00; committed by GitHub
5 changed files with 71 additions and 26 deletions


@@ -7,22 +7,27 @@ import logging
 from pathlib import Path
 from typing import Union
-from cognee.shared.SourceCodeGraph import SourceCodeGraph
-from cognee.shared.data_models import SummarizedContent
-from cognee.shared.utils import send_telemetry
-from cognee.modules.data.models import Dataset, Data
-from cognee.modules.data.methods.get_dataset_data import get_dataset_data
 from cognee.modules.data.methods import get_datasets, get_datasets_by_name
-from cognee.modules.pipelines.tasks.Task import Task
+from cognee.modules.data.methods.get_dataset_data import get_dataset_data
+from cognee.modules.data.models import Data, Dataset
 from cognee.modules.pipelines import run_tasks
-from cognee.modules.users.models import User
-from cognee.modules.users.methods import get_default_user
 from cognee.modules.pipelines.models import PipelineRunStatus
-from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
-from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
-from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
+from cognee.modules.pipelines.operations.get_pipeline_status import \
+    get_pipeline_status
+from cognee.modules.pipelines.operations.log_pipeline_status import \
+    log_pipeline_status
+from cognee.modules.pipelines.tasks.Task import Task
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.users.models import User
+from cognee.shared.SourceCodeGraph import SourceCodeGraph
+from cognee.shared.utils import send_telemetry
+from cognee.tasks.documents import (check_permissions_on_documents,
+                                    classify_documents,
+                                    extract_chunks_from_documents)
 from cognee.tasks.graph import extract_graph_from_code
-from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
+from cognee.tasks.repo_processor import (enrich_dependency_graph,
+                                         expand_dependency_graph,
+                                         get_repo_file_dependencies)
 from cognee.tasks.storage import add_data_points
 from cognee.tasks.summarization import summarize_code
@@ -134,7 +139,7 @@ async def run_code_graph_pipeline(repo_path):
         Task(get_repo_file_dependencies),
         Task(enrich_dependency_graph, task_config={"batch_size": 50}),
         Task(expand_dependency_graph, task_config={"batch_size": 50}),
-        Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
+        Task(summarize_code, task_config={"batch_size": 50}),
         Task(add_data_points, task_config={"batch_size": 50}),
     ]
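For context, a minimal sketch of driving the pipeline assembled here, in the spirit of the "minimal code graph example" mentioned in the commit message. The import path and the assumption that the pipeline can be iterated with async for are not confirmed by this hunk:

    import asyncio

    # Assumed module path, based on the commit note about moving
    # run_code_graph_pipeline into cognify/code_graph_pipeline.
    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

    async def main():
        # Build the dependency graph for a local repository and summarize its code.
        # Whether the pipeline yields run statuses or returns them is an assumption here.
        async for run_status in run_code_graph_pipeline("/path/to/repo"):
            print(run_status)

    asyncio.run(main())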


@@ -0,0 +1,10 @@
+You are an expert Python programmer and technical writer. Your task is to summarize the given Python code snippet or file.
+The code may contain multiple imports, classes, functions, constants and logic. Provide a clear, structured explanation of its components
+and their relationships.
+Instructions:
+Provide an overview: Start with a high-level summary of what the code does as a whole.
+Break it down: Summarize each class and function individually, explaining their purpose and how they interact.
+Describe the workflow: Outline how the classes and functions work together. Mention any control flow (e.g., main functions, entry points, loops).
+Key features: Highlight important elements like arguments, return values, or unique logic.
+Maintain clarity: Write in plain English for someone familiar with Python but unfamiliar with this code.
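This prompt is consumed through the existing prompt and LLM helpers. A rough sketch of that wiring, assuming a placeholder prompt filename (the real path is not shown in this view) and the same structured-output call used in the extract_summary hunk below:

    from cognee.infrastructure.llm.get_llm_client import get_llm_client
    from cognee.infrastructure.llm.prompts import read_query_prompt
    from cognee.shared.data_models import SummarizedCode

    async def summarize_source(source_code: str) -> SummarizedCode:
        # "summarize_code.txt" is a placeholder name for the prompt file added above.
        system_prompt = read_query_prompt("summarize_code.txt")
        llm_client = get_llm_client()
        # Ask the LLM for a SummarizedCode object instead of free-form text.
        return await llm_client.acreate_structured_output(source_code, system_prompt, SummarizedCode)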


@@ -1,7 +1,11 @@
 from typing import Type
 from pydantic import BaseModel
-from cognee.infrastructure.llm.prompts import read_query_prompt
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.prompts import read_query_prompt
+from cognee.shared.data_models import SummarizedCode
 async def extract_summary(content: str, response_model: Type[BaseModel]):
     llm_client = get_llm_client()
@@ -11,3 +15,7 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
     llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)
     return llm_output
+async def extract_code_summary(content: str):
+    return await extract_summary(content, response_model=SummarizedCode)
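extract_code_summary simply pins the response model to SummarizedCode. A small usage sketch, assuming the structured-output client returns an instance of that model and using an illustrative input file:

    import asyncio

    from cognee.modules.data.extraction.extract_summary import extract_code_summary

    async def main():
        with open("example.py") as f:  # illustrative input file
            source = f.read()
        summary = await extract_code_summary(source)
        # summary is expected to be a SummarizedCode instance (see the data model changes below).
        print(summary.high_level_summary)
        print(summary.key_features)

    asyncio.run(main())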


@@ -1,9 +1,11 @@
 """Data models for the cognitive architecture."""
 from enum import Enum, auto
-from typing import Optional, List, Union, Dict, Any
+from typing import Any, Dict, List, Optional, Union
 from pydantic import BaseModel, Field
 class Node(BaseModel):
     """Node in a knowledge graph."""
     id: str
@@ -194,6 +196,29 @@ class SummarizedContent(BaseModel):
     summary: str
     description: str
+class SummarizedFunction(BaseModel):
+    name: str
+    description: str
+    inputs: Optional[List[str]] = None
+    outputs: Optional[List[str]] = None
+    decorators: Optional[List[str]] = None
+class SummarizedClass(BaseModel):
+    name: str
+    description: str
+    methods: Optional[List[SummarizedFunction]] = None
+    decorators: Optional[List[str]] = None
+class SummarizedCode(BaseModel):
+    file_name: str
+    high_level_summary: str
+    key_features: List[str]
+    imports: List[str] = []
+    constants: List[str] = []
+    classes: List[SummarizedClass] = []
+    functions: List[SummarizedFunction] = []
+    workflow_description: Optional[str] = None
 class GraphDBType(Enum):
     NETWORKX = auto()
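To make the new schema concrete, a hand-written example of the structured summary these models describe (all values are invented for illustration):

    from cognee.shared.data_models import (SummarizedClass, SummarizedCode,
                                           SummarizedFunction)

    example = SummarizedCode(
        file_name="greeter.py",
        high_level_summary="Defines a Greeter class and a helper that formats greetings.",
        key_features=["Greeter class", "format_greeting helper"],
        imports=["textwrap"],
        constants=["DEFAULT_GREETING"],
        classes=[
            SummarizedClass(
                name="Greeter",
                description="Builds greeting strings for a stored name.",
                methods=[
                    SummarizedFunction(
                        name="greet",
                        description="Return the greeting for the stored name.",
                        inputs=["self"],
                        outputs=["str"],
                    )
                ],
            )
        ],
        functions=[
            SummarizedFunction(
                name="format_greeting",
                description="Wrap a greeting message to a fixed width.",
                inputs=["text: str"],
                outputs=["str"],
            )
        ],
        workflow_description="Greeter.greet calls format_greeting to produce the final message.",
    )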


@@ -1,31 +1,28 @@
 import asyncio
+from typing import AsyncGenerator, Union
 from uuid import uuid5
-from typing import Type
-from pydantic import BaseModel
 from cognee.infrastructure.engine import DataPoint
-from cognee.modules.data.extraction.extract_summary import extract_summary
-from cognee.shared.CodeGraphEntities import CodeFile
+from cognee.modules.data.extraction.extract_summary import extract_code_summary
 from .models import CodeSummary
 async def summarize_code(
     code_graph_nodes: list[DataPoint],
-    summarization_model: Type[BaseModel],
-) -> list[DataPoint]:
+) -> AsyncGenerator[Union[DataPoint, CodeSummary], None]:
     if len(code_graph_nodes) == 0:
         return
-    code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]
+    code_data_points = [file for file in code_graph_nodes if hasattr(file, "source_code")]
     file_summaries = await asyncio.gather(
-        *[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
+        *[extract_code_summary(file.source_code) for file in code_data_points]
     )
     file_summaries_map = {
-        code_file_data_point.extracted_id: file_summary.summary
-        for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
+        code_data_point.extracted_id: str(file_summary)
+        for code_data_point, file_summary in zip(code_data_points, file_summaries)
     }
     for node in code_graph_nodes:
@@ -33,7 +30,7 @@ async def summarize_code(
             continue
         yield node
-        if not isinstance(node, CodeFile):
+        if not hasattr(node, "source_code"):
             continue
         yield CodeSummary(
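Because summarize_code is now an async generator rather than a coroutine returning a list, callers outside the Task pipeline consume it with async for. A minimal sketch, assuming a pre-built list of code graph data points:

    from cognee.tasks.summarization import summarize_code

    async def collect_summaries(code_graph_nodes):
        # summarize_code re-yields every incoming node and, for nodes that carry
        # source_code, additionally yields a CodeSummary data point.
        collected = []
        async for data_point in summarize_code(code_graph_nodes):
            collected.append(data_point)
        return collected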