From 0fb47ba23d9b428d5bfb6047b8676751d0672cd4 Mon Sep 17 00:00:00 2001 From: 0xideas Date: Sun, 24 Nov 2024 20:50:32 +0100 Subject: [PATCH] feat: COG-548-create-code-graph-to-kg-task (#7) Co-authored-by: Boris Arzentar --- .github/workflows/test_python_3_10.yml | 4 +- .github/workflows/test_python_3_11.yml | 4 +- .github/workflows/test_python_3_9.yml | 4 +- cognee/shared/CodeGraphEntities.py | 5 +- .../graph/convert_graph_from_code_graph.py | 83 ++++--------------- cognee/tasks/repo_processor/__init__.py | 4 + .../get_repo_dependency_graph.py | 4 +- cognee/tasks/summarization/summarize_code.py | 8 +- examples/python/code_graph_pipeline.py | 39 +++++---- 9 files changed, 52 insertions(+), 103 deletions(-) diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index a7ea005ef..83d794c03 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -56,8 +56,8 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run summarize_code test - run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + - name: Run convert_graph_from_code_graph test + run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py env: ENV: 'dev' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index 18b04cd94..9483cb687 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -56,8 +56,8 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run summarize_code test - run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + - name: Run convert_graph_from_code_graph test + run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py env: ENV: 'dev' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml index 54194bd19..c4f6d271a 100644 --- a/.github/workflows/test_python_3_9.yml +++ b/.github/workflows/test_python_3_9.yml @@ -56,8 +56,8 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ - - name: Run summarize_code test - run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + - name: Run convert_graph_from_code_graph test + run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py env: ENV: 'dev' LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py index f0061406e..9052cf89e 100644 --- a/cognee/shared/CodeGraphEntities.py +++ b/cognee/shared/CodeGraphEntities.py @@ -1,16 +1,13 @@ -from typing import Any, List, Literal, Optional, Union - from cognee.infrastructure.engine import DataPoint - class Repository(DataPoint): path: str - class CodeFile(DataPoint): extracted_id: str # actually file path type: str source_code: str + part_of: Repository _metadata: dict = { "index_fields": ["source_code"] diff --git a/cognee/tasks/graph/convert_graph_from_code_graph.py b/cognee/tasks/graph/convert_graph_from_code_graph.py index 91eaf660a..bc8544994 100644 --- a/cognee/tasks/graph/convert_graph_from_code_graph.py +++ b/cognee/tasks/graph/convert_graph_from_code_graph.py @@ -1,86 +1,35 @@ -from uuid import UUID, uuid4 import os import networkx as nx -from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.modules.graph.utils import ( - expand_with_nodes_and_edges, - retrieve_existing_edges, -) from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository -from cognee.shared.data_models import Edge, KnowledgeGraph, Node from cognee.tasks.storage import add_data_points async def convert_graph_from_code_graph( graph: nx.DiGraph, repo_path: str ) -> tuple[str, list[CodeFile], list[CodeRelationship]]: + code_objects = code_objects_from_di_graph(graph, repo_path) - repo, nodes, edges = code_objects_from_di_graph(graph, repo_path) + add_data_points(code_objects) - graph_engine = await get_graph_engine() - - code_knowledge_graph = build_code_knowledge_graph(nodes, edges) - repo_and_knowledge_graph = [(repo, code_knowledge_graph)] - - existing_edges_map = await retrieve_existing_edges( - repo_and_knowledge_graph, graph_engine - ) - - graph_nodes, graph_edges = expand_with_nodes_and_edges( - repo_and_knowledge_graph, existing_edges_map - ) - - if len(graph_nodes) > 0: - await add_data_points(graph_nodes) - - if len(graph_edges) > 0: - await graph_engine.add_edges(graph_edges) - - return nodes + return code_objects -def convert_node(node: CodeFile) -> Node: - return Node( - id=str(node.id), - name=node.extracted_id, - type=node.type, - description=f"{node.source_code = }", - properties={}, - ) - - -def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge: - return Edge( - source_node_id=str(extracted_ids_to_ids[edge.source_id]), - target_node_id=str(extracted_ids_to_ids[edge.target_id]), - relationship_name=f"{edge.type}_{edge.relation}", - ) - - -def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]): - extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes} - graph_nodes = [convert_node(node) for node in nodes] - graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges] - return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges) - - -def create_code_file(path, type): +def create_code_file(path, type, repo): abspath = os.path.abspath(path) - print(f"{path = } - {abspath = }") + with open(abspath, "r") as f: source_code = f.read() - code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code) - return (code_file, abspath) - -def create_code_relationship( - source_path: str, target_path: str, type: str, relation: str -): - return CodeRelationship( - source_id=source_path, target_id=target_path, type=type, relation=relation + code_file = CodeFile( + extracted_id = abspath, + type = type, + source_code = source_code, + part_of = repo, ) + return code_file + def code_objects_from_di_graph( graph: nx.DiGraph, repo_path: str @@ -88,18 +37,18 @@ def code_objects_from_di_graph( repo = Repository(path=repo_path) code_files = [ - create_code_file(os.path.join(repo_path, path), "python_file")[0] + create_code_file(os.path.join(repo_path, path), "python_file", repo) for path in graph.nodes ] code_relationships = [ - create_code_relationship( + CodeRelationship( os.path.join(repo_path, source), os.path.join(repo_path, target), "python_file", - graph.get_edge_data(source, target, v)["relation"], + graph.get_edge_data(source, target)["relation"], ) - for source, target, v in graph.edges + for source, target in graph.edges ] return (repo, code_files, code_relationships) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 94dab6b3f..a1aeabcdc 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -1,3 +1,7 @@ import logging logger = logging.getLogger("task:repo_processor") + +from .enrich_dependency_graph import enrich_dependency_graph +from .expand_dependency_graph import expand_dependency_graph +from .get_repo_dependency_graph import get_repo_dependency_graph diff --git a/cognee/tasks/repo_processor/get_repo_dependency_graph.py b/cognee/tasks/repo_processor/get_repo_dependency_graph.py index b36607ab8..7f96bd49c 100644 --- a/cognee/tasks/repo_processor/get_repo_dependency_graph.py +++ b/cognee/tasks/repo_processor/get_repo_dependency_graph.py @@ -1,8 +1,6 @@ import os import aiofiles - import networkx as nx -from typing import Dict, List from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies @@ -35,6 +33,7 @@ async def get_py_files_dict(repo_path): return py_files_dict + def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple: if relative_paths: file_path = os.path.relpath(file_path, repo_path) @@ -58,4 +57,5 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph: dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] dependency_graph.add_edges_from(dependency_edges) + return dependency_graph diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py index 597bc4026..31b86d325 100644 --- a/cognee/tasks/summarization/summarize_code.py +++ b/cognee/tasks/summarization/summarize_code.py @@ -1,5 +1,5 @@ import asyncio -from typing import Type, Union +from typing import Type from uuid import uuid5 from pydantic import BaseModel @@ -23,9 +23,9 @@ async def summarize_code( summaries = [ CodeSummary( - id=uuid5(file.id, "CodeSummary"), - made_from=file, - text=file_summaries[file_index].summary, + id = uuid5(file.id, "CodeSummary"), + made_from = file, + text = file_summaries[file_index].summary, ) for (file_index, file) in enumerate(code_files) ] diff --git a/examples/python/code_graph_pipeline.py b/examples/python/code_graph_pipeline.py index 221ff8385..fe4be371b 100644 --- a/examples/python/code_graph_pipeline.py +++ b/examples/python/code_graph_pipeline.py @@ -1,37 +1,30 @@ +import argparse import asyncio import os -from uuid import UUID, uuid4 - -import numpy as np from cognee.modules.pipelines import Task, run_tasks -from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository +from cognee.shared.CodeGraphEntities import CodeRelationship, Repository from cognee.shared.data_models import SummarizedContent from cognee.tasks.code.get_local_dependencies_checker import ( get_local_script_dependencies, ) from cognee.tasks.graph.convert_graph_from_code_graph import ( + create_code_file, convert_graph_from_code_graph, ) - -from cognee.tasks.summarization import summarize_code -from cognee.tests.tasks.graph.code_graph_test_data_generation import ( - code_graph_test_data_generation, +from cognee.tasks.repo_processor import ( + enrich_dependency_graph, + expand_dependency_graph, + get_repo_dependency_graph, ) +from cognee.tasks.summarization import summarize_code + async def print_results(pipeline): async for result in pipeline: print(result) -def create_code_file(path, type): - abspath = os.path.abspath(path) - with open(abspath, "r") as f: - source_code = f.read() - code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code) - return (code_file, abspath) - - async def get_local_script_dependencies_wrapper(script_path, repo_path): dependencies = await get_local_script_dependencies(script_path, repo_path) return (script_path, dependencies) @@ -72,11 +65,17 @@ async def scan_repo(path, condition): if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process a file path") + parser.add_argument("path", help="Path to the file") + + args = parser.parse_args() + abspath = os.path.abspath(args.path or ".") tasks = [ - Task(scan_repo), + Task(get_repo_dependency_graph), + Task(enrich_dependency_graph), + Task(expand_dependency_graph), Task(convert_graph_from_code_graph), - Task(summarize_code, summarization_model=SummarizedContent), + Task(summarize_code, summarization_model = SummarizedContent), ] - data = ("cognee", lambda file: file.endswith(".py")) - pipeline = run_tasks(tasks, data, "cognify_pipeline") + pipeline = run_tasks(tasks, abspath, "cognify_code_pipeline") asyncio.run(print_results(pipeline))