COG-870 Remove duplicate edges from the code graph (#293)

* feat: turn summarize_code into generator

* feat: extract run_code_graph_pipeline, update the pipeline

* feat: minimal code graph example

* refactor: update argument

* refactor: move run_code_graph_pipeline to cognify/code_graph_pipeline

* refactor: indentation and whitespace nits

* refactor: add deprecated use comments and warnings

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
Authored by lxobr on 2024-12-17 12:02:25 +01:00; committed by GitHub.
parent 9e7ab6492a
commit da5e3ab24d
4 changed files with 80 additions and 58 deletions

File: cognee/api/v1/cognify/code_graph_pipeline.py

@@ -1,8 +1,14 @@
+# NOTICE: This module contains deprecated functions.
+# Use only the run_code_graph_pipeline function; all other functions are deprecated.
+# Related issue: COG-906
 import asyncio
 import logging
+from pathlib import Path
 from typing import Union
 from cognee.shared.SourceCodeGraph import SourceCodeGraph
+from cognee.shared.data_models import SummarizedContent
 from cognee.shared.utils import send_telemetry
 from cognee.modules.data.models import Dataset, Data
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@@ -16,7 +22,9 @@ from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
 from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
 from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
 from cognee.tasks.graph import extract_graph_from_code
+from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
 from cognee.tasks.storage import add_data_points
+from cognee.tasks.summarization import summarize_code

 logger = logging.getLogger("code_graph_pipeline")
@@ -51,6 +59,7 @@ async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User
 async def run_pipeline(dataset: Dataset, user: User):
+    '''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.'''
     data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
     document_ids_str = [str(document.id) for document in data_documents]
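The commit message also mentions deprecation warnings, but the function bodies that would emit them are elided from this hunk. A minimal sketch of how such a runtime warning is typically raised with the standard library (the warn_deprecated helper is hypothetical, not taken from this diff):

import warnings

def warn_deprecated(name: str, replacement: str) -> None:
    # DeprecationWarning is filtered out by default; stacklevel=2 attributes
    # the warning to the caller of the deprecated function.
    warnings.warn(
        f"{name} is deprecated; use {replacement} instead.",
        DeprecationWarning,
        stacklevel=2,
    )

warn_deprecated("run_pipeline", "run_code_graph_pipeline")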
@@ -103,3 +112,30 @@ async def run_pipeline(dataset: Dataset, user: User):
 def generate_dataset_name(dataset_name: str) -> str:
     return dataset_name.replace(".", "_").replace(" ", "_")
+
+async def run_code_graph_pipeline(repo_path):
+    import os
+    import pathlib
+    import cognee
+    from cognee.infrastructure.databases.relational import create_db_and_tables
+
+    file_path = Path(__file__).parent
+    data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
+    cognee.config.data_root_directory(data_directory_path)
+    cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
+    cognee.config.system_root_directory(cognee_directory_path)
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    await create_db_and_tables()
+
+    tasks = [
+        Task(get_repo_file_dependencies),
+        Task(enrich_dependency_graph, task_config={"batch_size": 50}),
+        Task(expand_dependency_graph, task_config={"batch_size": 50}),
+        Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
+        Task(add_data_points, task_config={"batch_size": 50}),
+    ]
+
+    return run_tasks(tasks, repo_path, "cognify_code_pipeline")
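Note that run_code_graph_pipeline returns the async generator produced by run_tasks instead of exhausting it, so callers stream results as tasks complete. A minimal consumer sketch (collect_results is a hypothetical helper; the import path matches the one used elsewhere in this commit):

import asyncio

from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

async def collect_results(repo_path: str) -> list:
    # Awaiting the coroutine only builds the generator; iterating it is
    # what actually drives the pipeline tasks.
    pipeline = await run_code_graph_pipeline(repo_path)
    return [status async for status in pipeline]

if __name__ == "__main__":
    print(asyncio.run(collect_results("/path/to/some/repo")))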

File: the summarize_code task in cognee/tasks/summarization

@@ -1,39 +1,43 @@
 import asyncio
-from typing import Type
 from uuid import uuid5
+from typing import Type
 from pydantic import BaseModel
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.data.extraction.extract_summary import extract_summary
 from cognee.shared.CodeGraphEntities import CodeFile
-from cognee.tasks.storage import add_data_points
 from .models import CodeSummary

 async def summarize_code(
-    code_files: list[DataPoint],
+    code_graph_nodes: list[DataPoint],
     summarization_model: Type[BaseModel],
 ) -> list[DataPoint]:
-    if len(code_files) == 0:
-        return code_files
+    if len(code_graph_nodes) == 0:
+        return

-    code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)]
+    code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]

     file_summaries = await asyncio.gather(
         *[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
     )

-    summaries = [
-        CodeSummary(
-            id = uuid5(file.id, "CodeSummary"),
-            made_from = file,
-            text = file_summaries[file_index].summary,
-        )
-        for (file_index, file) in enumerate(code_files_data_points)
-    ]
-
-    await add_data_points(summaries)
-
-    return code_files
+    file_summaries_map = {
+        code_file_data_point.extracted_id: file_summary.summary
+        for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
+    }
+
+    for node in code_graph_nodes:
+        if not isinstance(node, DataPoint):
+            continue
+        yield node
+
+        if not isinstance(node, CodeFile):
+            continue
+        yield CodeSummary(
+            id=uuid5(node.id, "CodeSummary"),
+            made_from=node,
+            text=file_summaries_map[node.extracted_id],
+        )
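The refactor turns summarize_code into a pass-through generator: every incoming node is forwarded exactly once, and CodeFile nodes additionally yield a derived CodeSummary, which is what removes the duplicate edges this commit targets. A self-contained sketch of the same pattern, with hypothetical Node/FileNode/Summary stand-ins for the cognee types:

import asyncio
from dataclasses import dataclass

@dataclass
class Node:
    id: str

@dataclass
class FileNode(Node):
    source_code: str

@dataclass
class Summary(Node):
    text: str

async def fake_summary(code: str) -> str:
    # Stand-in for extract_summary: pretend to call an LLM.
    return f"summary of {len(code)} characters"

async def summarize(nodes: list[Node]):
    # Summarize all file nodes concurrently up front...
    files = [n for n in nodes if isinstance(n, FileNode)]
    texts = await asyncio.gather(*(fake_summary(f.source_code) for f in files))
    summary_map = {f.id: t for f, t in zip(files, texts)}

    # ...then forward every node exactly once, appending one summary
    # per file node, so nothing upstream is re-emitted as a duplicate.
    for node in nodes:
        yield node
        if isinstance(node, FileNode):
            yield Summary(id=f"{node.id}:summary", text=summary_map[node.id])

async def main():
    nodes = [FileNode(id="a.py", source_code="print('hi')"), Node(id="misc")]
    async for out in summarize(nodes):
        print(out)

asyncio.run(main())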

File: SWE-bench evaluation script under evals

@@ -7,19 +7,13 @@ from pathlib import Path
 from swebench.harness.utils import load_swebench_dataset
 from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
+from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
 from cognee.api.v1.search import SearchType
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt
-from cognee.modules.pipelines import Task, run_tasks
 from cognee.modules.retrieval.brute_force_triplet_search import \
     brute_force_triplet_search
-# from cognee.shared.data_models import SummarizedContent
 from cognee.shared.utils import render_graph
-from cognee.tasks.repo_processor import (enrich_dependency_graph,
-                                         expand_dependency_graph,
-                                         get_repo_file_dependencies)
-from cognee.tasks.storage import add_data_points
-# from cognee.tasks.summarization import summarize_code
 from evals.eval_utils import download_github_repo, retrieved_edges_to_string
@@ -42,48 +36,22 @@ def check_install_package(package_name):
 async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
-    import os
-    import pathlib
-    import cognee
-    from cognee.infrastructure.databases.relational import create_db_and_tables
-
-    file_path = Path(__file__).parent
-    data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
-    cognee.config.data_root_directory(data_directory_path)
-    cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
-    cognee.config.system_root_directory(cognee_directory_path)
-
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata = True)
-    await create_db_and_tables()
-
-    # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
-    repo_path = '/Users/borisarzentar/Projects/graphrag'
-
-    tasks = [
-        Task(get_repo_file_dependencies),
-        Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
-        Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
-        Task(add_data_points, task_config = { "batch_size": 50 }),
-        # Task(summarize_code, summarization_model = SummarizedContent),
-    ]
-
-    pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
+    repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
+    pipeline = await run_code_graph_pipeline(repo_path)

     async for result in pipeline:
         print(result)

     print('Here we have the repo under the repo_path')

-    await render_graph(None, include_labels = True, include_nodes = True)
+    await render_graph(None, include_labels=True, include_nodes=True)

     problem_statement = instance['problem_statement']
     instructions = read_query_prompt("patch_gen_kg_instructions.txt")

-    retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3, collections = ["data_point_source_code", "data_point_text"])
+    retrieved_edges = await brute_force_triplet_search(problem_statement, top_k=3,
+                                                       collections=["data_point_source_code", "data_point_text"])

     retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)

     prompt = "\n".join([
@@ -171,7 +139,6 @@ async def main():
     with open(predictions_path, "w") as file:
         json.dump(preds, file)

     subprocess.run(
         [
             "python",

File: new minimal code graph example (new file)

@@ -0,0 +1,15 @@
+import argparse
+import asyncio
+
+from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
+
+async def main(repo_path):
+    async for result in await run_code_graph_pipeline(repo_path):
+        print(result)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
+    args = parser.parse_args()
+    asyncio.run(main(args.repo_path))
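Assuming the new file is saved as, say, code_graph_example.py (this view does not show its path), it would be invoked as python code_graph_example.py --repo-path /path/to/repo, printing each pipeline result as it streams in.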