Structured code summarization (#375)
* feat: turn summarize_code into generator
* feat: extract run_code_graph_pipeline, update the pipeline
* feat: minimal code graph example
* refactor: update argument
* refactor: move run_code_graph_pipeline to cognify/code_graph_pipeline
* refactor: indentation and whitespace nits
* refactor: add deprecated use comments and warnings
* Structured code summarization
* add missing prompt file
* Remove summarization_model argument from summarize_code and fix type hinting
* minor refactors

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
This commit is contained in:
parent
da5e3ab24d
commit
9afd0ece63
5 changed files with 71 additions and 26 deletions
|
|
@ -7,22 +7,27 @@ import logging
|
|||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
from cognee.shared.SourceCodeGraph import SourceCodeGraph
|
||||
from cognee.shared.data_models import SummarizedContent
|
||||
from cognee.shared.utils import send_telemetry
|
||||
from cognee.modules.data.models import Dataset, Data
|
||||
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
|
||||
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
|
||||
from cognee.modules.pipelines.tasks.Task import Task
|
||||
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
|
||||
from cognee.modules.data.models import Data, Dataset
|
||||
from cognee.modules.pipelines import run_tasks
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.modules.users.methods import get_default_user
|
||||
from cognee.modules.pipelines.models import PipelineRunStatus
|
||||
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
|
||||
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
|
||||
from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
|
||||
from cognee.modules.pipelines.operations.get_pipeline_status import \
|
||||
get_pipeline_status
|
||||
from cognee.modules.pipelines.operations.log_pipeline_status import \
|
||||
log_pipeline_status
|
||||
from cognee.modules.pipelines.tasks.Task import Task
|
||||
from cognee.modules.users.methods import get_default_user
|
||||
from cognee.modules.users.models import User
|
||||
from cognee.shared.SourceCodeGraph import SourceCodeGraph
|
||||
from cognee.shared.utils import send_telemetry
|
||||
from cognee.tasks.documents import (check_permissions_on_documents,
|
||||
classify_documents,
|
||||
extract_chunks_from_documents)
|
||||
from cognee.tasks.graph import extract_graph_from_code
|
||||
from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
|
||||
from cognee.tasks.repo_processor import (enrich_dependency_graph,
|
||||
expand_dependency_graph,
|
||||
get_repo_file_dependencies)
|
||||
from cognee.tasks.storage import add_data_points
|
||||
from cognee.tasks.summarization import summarize_code
|
||||
|
||||
|
|
@ -134,7 +139,7 @@ async def run_code_graph_pipeline(repo_path):
|
|||
Task(get_repo_file_dependencies),
|
||||
Task(enrich_dependency_graph, task_config={"batch_size": 50}),
|
||||
Task(expand_dependency_graph, task_config={"batch_size": 50}),
|
||||
Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
|
||||
Task(summarize_code, task_config={"batch_size": 50}),
|
||||
Task(add_data_points, task_config={"batch_size": 50}),
|
||||
]
|
||||
|
||||
|
|
|
|||
10
cognee/infrastructure/llm/prompts/summarize_code.txt
Normal file
10
cognee/infrastructure/llm/prompts/summarize_code.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
You are an expert Python programmer and technical writer. Your task is to summarize the given Python code snippet or file.
|
||||
The code may contain multiple imports, classes, functions, constants and logic. Provide a clear, structured explanation of its components
|
||||
and their relationships.
|
||||
|
||||
Instructions:
|
||||
Provide an overview: Start with a high-level summary of what the code does as a whole.
|
||||
Break it down: Summarize each class and function individually, explaining their purpose and how they interact.
|
||||
Describe the workflow: Outline how the classes and functions work together. Mention any control flow (e.g., main functions, entry points, loops).
|
||||
Key features: Highlight important elements like arguments, return values, or unique logic.
|
||||
Maintain clarity: Write in plain English for someone familiar with Python but unfamiliar with this code.
|
||||
|
|
@ -1,7 +1,11 @@
|
|||
from typing import Type
|
||||
|
||||
from pydantic import BaseModel
|
||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||
|
||||
from cognee.infrastructure.llm.get_llm_client import get_llm_client
|
||||
from cognee.infrastructure.llm.prompts import read_query_prompt
|
||||
from cognee.shared.data_models import SummarizedCode
|
||||
|
||||
|
||||
async def extract_summary(content: str, response_model: Type[BaseModel]):
|
||||
llm_client = get_llm_client()
|
||||
|
|
@ -11,3 +15,7 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
|
|||
llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)
|
||||
|
||||
return llm_output
|
||||
|
||||
async def extract_code_summary(content: str):
    """Summarize a piece of source code into a ``SummarizedCode`` structure.

    Thin convenience wrapper: forwards ``content`` to ``extract_summary`` with
    ``SummarizedCode`` fixed as the response model, so callers do not have to
    pass a model themselves.

    Args:
        content: The text to summarize, passed through unchanged.

    Returns:
        Whatever ``extract_summary`` returns for the ``SummarizedCode``
        response model.
    """
    code_summary = await extract_summary(content, response_model=SummarizedCode)
    return code_summary
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
"""Data models for the cognitive architecture."""
|
||||
|
||||
from enum import Enum, auto
|
||||
from typing import Optional, List, Union, Dict, Any
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class Node(BaseModel):
|
||||
"""Node in a knowledge graph."""
|
||||
id: str
|
||||
|
|
@ -194,6 +196,29 @@ class SummarizedContent(BaseModel):
|
|||
summary: str
|
||||
description: str
|
||||
|
||||
class SummarizedFunction(BaseModel):
    """Structured summary of a single function (or a method of a class).

    ``name`` and ``description`` are required; the remaining fields are
    optional lists of strings that default to ``None`` when not provided.
    """

    # Required fields.
    name: str
    description: str

    # Optional details; each is a list of strings or None.
    inputs: Optional[List[str]] = None
    outputs: Optional[List[str]] = None
    decorators: Optional[List[str]] = None
|
||||
|
||||
class SummarizedClass(BaseModel):
    """Structured summary of a single class.

    ``name`` and ``description`` are required. ``methods`` optionally holds one
    ``SummarizedFunction`` per summarized method, and ``decorators`` optionally
    holds a list of strings; both default to ``None``.
    """

    # Required fields.
    name: str
    description: str

    # Optional details.
    methods: Optional[List[SummarizedFunction]] = None
    decorators: Optional[List[str]] = None
|
||||
|
||||
class SummarizedCode(BaseModel):
    """Structured summary of a piece of code, used as an LLM response model.

    Required fields are ``file_name``, ``high_level_summary`` and
    ``key_features``. The list fields ``imports``, ``constants``, ``classes``
    and ``functions`` default to an empty list, and ``workflow_description``
    defaults to ``None``.
    """

    # Required fields.
    file_name: str
    high_level_summary: str
    key_features: List[str]

    # Optional collections, empty by default.
    # NOTE(review): the list-literal defaults rely on pydantic giving each
    # instance its own copy of the default -- confirm for the pinned version.
    imports: List[str] = []
    constants: List[str] = []
    classes: List[SummarizedClass] = []
    functions: List[SummarizedFunction] = []

    # Optional free-text description; None when not provided.
    workflow_description: Optional[str] = None
|
||||
|
||||
|
||||
class GraphDBType(Enum):
|
||||
NETWORKX = auto()
|
||||
|
|
|
|||
|
|
@ -1,31 +1,28 @@
|
|||
import asyncio
|
||||
from typing import AsyncGenerator, Union
|
||||
from uuid import uuid5
|
||||
from typing import Type
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
from cognee.modules.data.extraction.extract_summary import extract_summary
|
||||
from cognee.shared.CodeGraphEntities import CodeFile
|
||||
from cognee.modules.data.extraction.extract_summary import extract_code_summary
|
||||
from .models import CodeSummary
|
||||
|
||||
|
||||
async def summarize_code(
|
||||
code_graph_nodes: list[DataPoint],
|
||||
summarization_model: Type[BaseModel],
|
||||
) -> list[DataPoint]:
|
||||
) -> AsyncGenerator[Union[DataPoint, CodeSummary], None]:
|
||||
if len(code_graph_nodes) == 0:
|
||||
return
|
||||
|
||||
code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]
|
||||
code_data_points = [file for file in code_graph_nodes if hasattr(file, "source_code")]
|
||||
|
||||
file_summaries = await asyncio.gather(
|
||||
*[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
|
||||
*[extract_code_summary(file.source_code) for file in code_data_points]
|
||||
)
|
||||
|
||||
file_summaries_map = {
|
||||
code_file_data_point.extracted_id: file_summary.summary
|
||||
for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
|
||||
code_data_point.extracted_id: str(file_summary)
|
||||
for code_data_point, file_summary in zip(code_data_points, file_summaries)
|
||||
}
|
||||
|
||||
for node in code_graph_nodes:
|
||||
|
|
@ -33,7 +30,7 @@ async def summarize_code(
|
|||
continue
|
||||
yield node
|
||||
|
||||
if not isinstance(node, CodeFile):
|
||||
if not hasattr(node, "source_code"):
|
||||
continue
|
||||
|
||||
yield CodeSummary(
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue