feat: COG-548-create-code-graph-to-kg-task (#7)

Co-authored-by: Boris Arzentar <borisarzentar@gmail.com>
This commit is contained in:
0xideas 2024-11-24 20:50:32 +01:00 committed by GitHub
parent 8466764cbe
commit 0fb47ba23d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 52 additions and 103 deletions

View file

@ -56,8 +56,8 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test - name: Run convert_graph_from_code_graph test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}

View file

@ -56,8 +56,8 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test - name: Run convert_graph_from_code_graph test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}

View file

@ -56,8 +56,8 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test - name: Run convert_graph_from_code_graph test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
env: env:
ENV: 'dev' ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}

View file

@ -1,16 +1,13 @@
from typing import Any, List, Literal, Optional, Union
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
class Repository(DataPoint): class Repository(DataPoint):
path: str path: str
class CodeFile(DataPoint): class CodeFile(DataPoint):
extracted_id: str # actually file path extracted_id: str # actually file path
type: str type: str
source_code: str source_code: str
part_of: Repository
_metadata: dict = { _metadata: dict = {
"index_fields": ["source_code"] "index_fields": ["source_code"]

View file

@ -1,86 +1,35 @@
from uuid import UUID, uuid4
import os import os
import networkx as nx import networkx as nx
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.utils import (
expand_with_nodes_and_edges,
retrieve_existing_edges,
)
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
from cognee.shared.data_models import Edge, KnowledgeGraph, Node
from cognee.tasks.storage import add_data_points from cognee.tasks.storage import add_data_points
async def convert_graph_from_code_graph( async def convert_graph_from_code_graph(
graph: nx.DiGraph, repo_path: str graph: nx.DiGraph, repo_path: str
) -> tuple[str, list[CodeFile], list[CodeRelationship]]: ) -> tuple[str, list[CodeFile], list[CodeRelationship]]:
code_objects = code_objects_from_di_graph(graph, repo_path)
repo, nodes, edges = code_objects_from_di_graph(graph, repo_path) add_data_points(code_objects)
graph_engine = await get_graph_engine() return code_objects
code_knowledge_graph = build_code_knowledge_graph(nodes, edges)
repo_and_knowledge_graph = [(repo, code_knowledge_graph)]
existing_edges_map = await retrieve_existing_edges(
repo_and_knowledge_graph, graph_engine
)
graph_nodes, graph_edges = expand_with_nodes_and_edges(
repo_and_knowledge_graph, existing_edges_map
)
if len(graph_nodes) > 0:
await add_data_points(graph_nodes)
if len(graph_edges) > 0:
await graph_engine.add_edges(graph_edges)
return nodes
def convert_node(node: CodeFile) -> Node: def create_code_file(path, type, repo):
return Node(
id=str(node.id),
name=node.extracted_id,
type=node.type,
description=f"{node.source_code = }",
properties={},
)
def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge:
return Edge(
source_node_id=str(extracted_ids_to_ids[edge.source_id]),
target_node_id=str(extracted_ids_to_ids[edge.target_id]),
relationship_name=f"{edge.type}_{edge.relation}",
)
def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]):
extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes}
graph_nodes = [convert_node(node) for node in nodes]
graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges]
return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges)
def create_code_file(path, type):
abspath = os.path.abspath(path) abspath = os.path.abspath(path)
print(f"{path = } - {abspath = }")
with open(abspath, "r") as f: with open(abspath, "r") as f:
source_code = f.read() source_code = f.read()
code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
return (code_file, abspath)
code_file = CodeFile(
def create_code_relationship( extracted_id = abspath,
source_path: str, target_path: str, type: str, relation: str type = type,
): source_code = source_code,
return CodeRelationship( part_of = repo,
source_id=source_path, target_id=target_path, type=type, relation=relation
) )
return code_file
def code_objects_from_di_graph( def code_objects_from_di_graph(
graph: nx.DiGraph, repo_path: str graph: nx.DiGraph, repo_path: str
@ -88,18 +37,18 @@ def code_objects_from_di_graph(
repo = Repository(path=repo_path) repo = Repository(path=repo_path)
code_files = [ code_files = [
create_code_file(os.path.join(repo_path, path), "python_file")[0] create_code_file(os.path.join(repo_path, path), "python_file", repo)
for path in graph.nodes for path in graph.nodes
] ]
code_relationships = [ code_relationships = [
create_code_relationship( CodeRelationship(
os.path.join(repo_path, source), os.path.join(repo_path, source),
os.path.join(repo_path, target), os.path.join(repo_path, target),
"python_file", "python_file",
graph.get_edge_data(source, target, v)["relation"], graph.get_edge_data(source, target)["relation"],
) )
for source, target, v in graph.edges for source, target in graph.edges
] ]
return (repo, code_files, code_relationships) return (repo, code_files, code_relationships)

View file

@ -1,3 +1,7 @@
import logging import logging
logger = logging.getLogger("task:repo_processor") logger = logging.getLogger("task:repo_processor")
from .enrich_dependency_graph import enrich_dependency_graph
from .expand_dependency_graph import expand_dependency_graph
from .get_repo_dependency_graph import get_repo_dependency_graph

View file

@ -1,8 +1,6 @@
import os import os
import aiofiles import aiofiles
import networkx as nx import networkx as nx
from typing import Dict, List
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
@ -35,6 +33,7 @@ async def get_py_files_dict(repo_path):
return py_files_dict return py_files_dict
def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple: def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple:
if relative_paths: if relative_paths:
file_path = os.path.relpath(file_path, repo_path) file_path = os.path.relpath(file_path, repo_path)
@ -58,4 +57,5 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph:
dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)
dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
dependency_graph.add_edges_from(dependency_edges) dependency_graph.add_edges_from(dependency_edges)
return dependency_graph return dependency_graph

View file

@ -1,5 +1,5 @@
import asyncio import asyncio
from typing import Type, Union from typing import Type
from uuid import uuid5 from uuid import uuid5
from pydantic import BaseModel from pydantic import BaseModel

View file

@ -1,37 +1,30 @@
import argparse
import asyncio import asyncio
import os import os
from uuid import UUID, uuid4
import numpy as np
from cognee.modules.pipelines import Task, run_tasks from cognee.modules.pipelines import Task, run_tasks
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository from cognee.shared.CodeGraphEntities import CodeRelationship, Repository
from cognee.shared.data_models import SummarizedContent from cognee.shared.data_models import SummarizedContent
from cognee.tasks.code.get_local_dependencies_checker import ( from cognee.tasks.code.get_local_dependencies_checker import (
get_local_script_dependencies, get_local_script_dependencies,
) )
from cognee.tasks.graph.convert_graph_from_code_graph import ( from cognee.tasks.graph.convert_graph_from_code_graph import (
create_code_file,
convert_graph_from_code_graph, convert_graph_from_code_graph,
) )
from cognee.tasks.repo_processor import (
from cognee.tasks.summarization import summarize_code enrich_dependency_graph,
from cognee.tests.tasks.graph.code_graph_test_data_generation import ( expand_dependency_graph,
code_graph_test_data_generation, get_repo_dependency_graph,
) )
from cognee.tasks.summarization import summarize_code
async def print_results(pipeline): async def print_results(pipeline):
async for result in pipeline: async for result in pipeline:
print(result) print(result)
def create_code_file(path, type):
abspath = os.path.abspath(path)
with open(abspath, "r") as f:
source_code = f.read()
code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
return (code_file, abspath)
async def get_local_script_dependencies_wrapper(script_path, repo_path): async def get_local_script_dependencies_wrapper(script_path, repo_path):
dependencies = await get_local_script_dependencies(script_path, repo_path) dependencies = await get_local_script_dependencies(script_path, repo_path)
return (script_path, dependencies) return (script_path, dependencies)
@ -72,11 +65,17 @@ async def scan_repo(path, condition):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process a file path")
parser.add_argument("path", help="Path to the file")
args = parser.parse_args()
abspath = os.path.abspath(args.path or ".")
tasks = [ tasks = [
Task(scan_repo), Task(get_repo_dependency_graph),
Task(enrich_dependency_graph),
Task(expand_dependency_graph),
Task(convert_graph_from_code_graph), Task(convert_graph_from_code_graph),
Task(summarize_code, summarization_model = SummarizedContent), Task(summarize_code, summarization_model = SummarizedContent),
] ]
data = ("cognee", lambda file: file.endswith(".py")) pipeline = run_tasks(tasks, abspath, "cognify_code_pipeline")
pipeline = run_tasks(tasks, data, "cognify_pipeline")
asyncio.run(print_results(pipeline)) asyncio.run(print_results(pipeline))