COG-870 Remove duplicate edges from the code graph (#293)
* feat: turn summarize_code into generator
* feat: extract run_code_graph_pipeline, update the pipeline
* feat: minimal code graph example
* refactor: update argument
* refactor: move run_code_graph_pipeline to cognify/code_graph_pipeline
* refactor: indentation and whitespace nits
* refactor: add deprecated use comments and warnings

---------

Co-authored-by: Vasilije <8619304+Vasilije1990@users.noreply.github.com>
Co-authored-by: Igor Ilic <30923996+dexters1@users.noreply.github.com>
Co-authored-by: Boris <boris@topoteretes.com>
parent 9e7ab6492a · commit da5e3ab24d
4 changed files with 80 additions and 58 deletions
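Context for the diff below: the duplicate edges appear to have come from data points being stored twice. The old summarize_code persisted its summaries itself (await add_data_points(summaries)) and then returned every incoming node, so the pipeline's own add_data_points task stored the same points again. The reworked task is an async generator that yields each node once, plus one CodeSummary per CodeFile, and leaves all persistence to the single downstream add_data_points task.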
cognee/api/v1/cognify/code_graph_pipeline.py

@@ -1,8 +1,14 @@
+# NOTICE: This module contains deprecated functions.
+# Use only the run_code_graph_pipeline function; all other functions are deprecated.
+# Related issue: COG-906
+
 import asyncio
 import logging
+from pathlib import Path
 from typing import Union
 
 from cognee.shared.SourceCodeGraph import SourceCodeGraph
+from cognee.shared.data_models import SummarizedContent
 from cognee.shared.utils import send_telemetry
 from cognee.modules.data.models import Dataset, Data
 from cognee.modules.data.methods.get_dataset_data import get_dataset_data
@@ -16,7 +22,9 @@ from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline
 from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
 from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
 from cognee.tasks.graph import extract_graph_from_code
+from cognee.tasks.repo_processor import get_repo_file_dependencies, enrich_dependency_graph, expand_dependency_graph
 from cognee.tasks.storage import add_data_points
+from cognee.tasks.summarization import summarize_code
 
 logger = logging.getLogger("code_graph_pipeline")
 
@@ -51,6 +59,7 @@ async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User
 
 
 async def run_pipeline(dataset: Dataset, user: User):
+    '''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.'''
     data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
 
     document_ids_str = [str(document.id) for document in data_documents]
@@ -103,3 +112,30 @@ async def run_pipeline(dataset: Dataset, user: User):
 def generate_dataset_name(dataset_name: str) -> str:
     return dataset_name.replace(".", "_").replace(" ", "_")
+
+
+async def run_code_graph_pipeline(repo_path):
+    import os
+    import pathlib
+    import cognee
+    from cognee.infrastructure.databases.relational import create_db_and_tables
+
+    file_path = Path(__file__).parent
+    data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
+    cognee.config.data_root_directory(data_directory_path)
+    cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
+    cognee.config.system_root_directory(cognee_directory_path)
+
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+    await create_db_and_tables()
+
+    tasks = [
+        Task(get_repo_file_dependencies),
+        Task(enrich_dependency_graph, task_config={"batch_size": 50}),
+        Task(expand_dependency_graph, task_config={"batch_size": 50}),
+        Task(summarize_code, summarization_model=SummarizedContent, task_config={"batch_size": 50}),
+        Task(add_data_points, task_config={"batch_size": 50}),
+    ]
+
+    return run_tasks(tasks, repo_path, "cognify_code_pipeline")
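For reference, a minimal sketch of how the new entry point is consumed; run_code_graph_pipeline returns the run_tasks async generator rather than iterating it, so the caller awaits the coroutine and then iterates the result (the repo path below is a placeholder):

    import asyncio
    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

    async def main():
        # Placeholder path; point this at a local clone of the codebase to analyze.
        async for result in await run_code_graph_pipeline("/path/to/repo"):
            print(result)

    asyncio.run(main())

This matches the usage in the new examples/python/code_graph_example.py added at the end of this commit.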
cognee/tasks/summarization/summarize_code.py

@@ -1,39 +1,43 @@
 import asyncio
-from typing import Type
 from uuid import uuid5
+from typing import Type
 
 from pydantic import BaseModel
 
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.data.extraction.extract_summary import extract_summary
 from cognee.shared.CodeGraphEntities import CodeFile
-from cognee.tasks.storage import add_data_points
 
 from .models import CodeSummary
 
 
 async def summarize_code(
-    code_files: list[DataPoint],
+    code_graph_nodes: list[DataPoint],
     summarization_model: Type[BaseModel],
 ) -> list[DataPoint]:
-    if len(code_files) == 0:
-        return code_files
+    if len(code_graph_nodes) == 0:
+        return
 
-    code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)]
+    code_files_data_points = [file for file in code_graph_nodes if isinstance(file, CodeFile)]
 
     file_summaries = await asyncio.gather(
         *[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
     )
 
-    summaries = [
-        CodeSummary(
-            id = uuid5(file.id, "CodeSummary"),
-            made_from = file,
-            text = file_summaries[file_index].summary,
-        )
-        for (file_index, file) in enumerate(code_files_data_points)
-    ]
-
-    await add_data_points(summaries)
-
-    return code_files
+    file_summaries_map = {
+        code_file_data_point.extracted_id: file_summary.summary
+        for code_file_data_point, file_summary in zip(code_files_data_points, file_summaries)
+    }
+
+    for node in code_graph_nodes:
+        if not isinstance(node, DataPoint):
+            continue
+
+        yield node
+
+        if not isinstance(node, CodeFile):
+            continue
+
+        yield CodeSummary(
+            id=uuid5(node.id, "CodeSummary"),
+            made_from=node,
+            text=file_summaries_map[node.extracted_id],
+        )
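The generator change above means callers no longer receive a returned list; they drain a stream. A hedged sketch of the consuming side (the helper below is hypothetical, not part of this diff or of run_tasks), showing how batching with task_config={"batch_size": 50} can sit on top of such a generator:

    # Hypothetical batching helper, for illustration only: groups the data
    # points an async generator task (like the new summarize_code) yields,
    # so a downstream task such as add_data_points receives lists of 50.
    async def batched(async_generator, batch_size=50):
        batch = []
        async for data_point in async_generator:  # original nodes plus CodeSummary points
            batch.append(data_point)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch  # flush the final partial batch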
evals/… (SWE-bench evaluation script)

@@ -7,19 +7,13 @@ from pathlib import Path
 from swebench.harness.utils import load_swebench_dataset
 from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
 
+from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
 from cognee.api.v1.search import SearchType
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt
-from cognee.modules.pipelines import Task, run_tasks
 from cognee.modules.retrieval.brute_force_triplet_search import \
     brute_force_triplet_search
-# from cognee.shared.data_models import SummarizedContent
 from cognee.shared.utils import render_graph
-from cognee.tasks.repo_processor import (enrich_dependency_graph,
-                                         expand_dependency_graph,
-                                         get_repo_file_dependencies)
-from cognee.tasks.storage import add_data_points
-# from cognee.tasks.summarization import summarize_code
 from evals.eval_utils import download_github_repo, retrieved_edges_to_string
@@ -42,48 +36,22 @@ def check_install_package(package_name):
 async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
-    import os
-    import pathlib
-    import cognee
-    from cognee.infrastructure.databases.relational import create_db_and_tables
-
-    file_path = Path(__file__).parent
-    data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
-    cognee.config.data_root_directory(data_directory_path)
-    cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
-    cognee.config.system_root_directory(cognee_directory_path)
-
-    await cognee.prune.prune_data()
-    await cognee.prune.prune_system(metadata = True)
-
-    await create_db_and_tables()
-
-    # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
-
-    repo_path = '/Users/borisarzentar/Projects/graphrag'
-
-    tasks = [
-        Task(get_repo_file_dependencies),
-        Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
-        Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
-        Task(add_data_points, task_config = { "batch_size": 50 }),
-        # Task(summarize_code, summarization_model = SummarizedContent),
-    ]
-
-    pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
+    repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
+    pipeline = await run_code_graph_pipeline(repo_path)
 
     async for result in pipeline:
         print(result)
 
     print('Here we have the repo under the repo_path')
 
-    await render_graph(None, include_labels = True, include_nodes = True)
+    await render_graph(None, include_labels=True, include_nodes=True)
 
     problem_statement = instance['problem_statement']
     instructions = read_query_prompt("patch_gen_kg_instructions.txt")
 
-    retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3, collections = ["data_point_source_code", "data_point_text"])
+    retrieved_edges = await brute_force_triplet_search(problem_statement, top_k=3,
+                                                       collections=["data_point_source_code", "data_point_text"])
 
     retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
 
     prompt = "\n".join([
@@ -171,7 +139,6 @@ async def main():
     with open(predictions_path, "w") as file:
         json.dump(preds, file)
 
     subprocess.run(
         [
             "python",
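Net effect on the eval script: all local environment setup (root directory config, pruning, create_db_and_tables) and the hand-built task list, including the hardcoded /Users/borisarzentar/Projects/graphrag repo path, are replaced by the downloaded SWE-bench repo plus a single run_code_graph_pipeline call.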
examples/python/code_graph_example.py (new file, 15 lines)

@@ -0,0 +1,15 @@
+import argparse
+import asyncio
+from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
+
+
+async def main(repo_path):
+    async for result in await run_code_graph_pipeline(repo_path):
+        print(result)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
+    args = parser.parse_args()
+    asyncio.run(main(args.repo_path))
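The new example can be run directly; /path/to/repo is a placeholder for the repository to analyze:

    python examples/python/code_graph_example.py --repo-path /path/to/repo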