feat: COG-548-create-code-graph-to-kg-task (#7)
Co-authored-by: Boris Arzentar <borisarzentar@gmail.com>
This commit is contained in:
parent
8466764cbe
commit
0fb47ba23d
9 changed files with 52 additions and 103 deletions
4
.github/workflows/test_python_3_10.yml
vendored
4
.github/workflows/test_python_3_10.yml
vendored
|
|
@ -56,8 +56,8 @@ jobs:
|
|||
- name: Run integration tests
|
||||
run: poetry run pytest cognee/tests/integration/
|
||||
|
||||
- name: Run summarize_code test
|
||||
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||
- name: Run convert_graph_from_code_graph test
|
||||
run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
|
||||
env:
|
||||
ENV: 'dev'
|
||||
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
|
|
|
|||
4
.github/workflows/test_python_3_11.yml
vendored
4
.github/workflows/test_python_3_11.yml
vendored
|
|
@ -56,8 +56,8 @@ jobs:
|
|||
- name: Run integration tests
|
||||
run: poetry run pytest cognee/tests/integration/
|
||||
|
||||
- name: Run summarize_code test
|
||||
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||
- name: Run convert_graph_from_code_graph test
|
||||
run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
|
||||
env:
|
||||
ENV: 'dev'
|
||||
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
|
|
|
|||
4
.github/workflows/test_python_3_9.yml
vendored
4
.github/workflows/test_python_3_9.yml
vendored
|
|
@ -56,8 +56,8 @@ jobs:
|
|||
- name: Run integration tests
|
||||
run: poetry run pytest cognee/tests/integration/
|
||||
|
||||
- name: Run summarize_code test
|
||||
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||
- name: Run convert_graph_from_code_graph test
|
||||
run: poetry run pytest cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py
|
||||
env:
|
||||
ENV: 'dev'
|
||||
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
|
|
|
|||
|
|
@ -1,16 +1,13 @@
|
|||
from typing import Any, List, Literal, Optional, Union
|
||||
|
||||
from cognee.infrastructure.engine import DataPoint
|
||||
|
||||
|
||||
class Repository(DataPoint):
|
||||
path: str
|
||||
|
||||
|
||||
class CodeFile(DataPoint):
|
||||
extracted_id: str # actually file path
|
||||
type: str
|
||||
source_code: str
|
||||
part_of: Repository
|
||||
|
||||
_metadata: dict = {
|
||||
"index_fields": ["source_code"]
|
||||
|
|
|
|||
|
|
@ -1,86 +1,35 @@
|
|||
from uuid import UUID, uuid4
|
||||
import os
|
||||
import networkx as nx
|
||||
|
||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||
from cognee.modules.graph.utils import (
|
||||
expand_with_nodes_and_edges,
|
||||
retrieve_existing_edges,
|
||||
)
|
||||
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
|
||||
from cognee.shared.data_models import Edge, KnowledgeGraph, Node
|
||||
from cognee.tasks.storage import add_data_points
|
||||
|
||||
|
||||
async def convert_graph_from_code_graph(
|
||||
graph: nx.DiGraph, repo_path: str
|
||||
) -> tuple[str, list[CodeFile], list[CodeRelationship]]:
|
||||
code_objects = code_objects_from_di_graph(graph, repo_path)
|
||||
|
||||
repo, nodes, edges = code_objects_from_di_graph(graph, repo_path)
|
||||
add_data_points(code_objects)
|
||||
|
||||
graph_engine = await get_graph_engine()
|
||||
|
||||
code_knowledge_graph = build_code_knowledge_graph(nodes, edges)
|
||||
repo_and_knowledge_graph = [(repo, code_knowledge_graph)]
|
||||
|
||||
existing_edges_map = await retrieve_existing_edges(
|
||||
repo_and_knowledge_graph, graph_engine
|
||||
)
|
||||
|
||||
graph_nodes, graph_edges = expand_with_nodes_and_edges(
|
||||
repo_and_knowledge_graph, existing_edges_map
|
||||
)
|
||||
|
||||
if len(graph_nodes) > 0:
|
||||
await add_data_points(graph_nodes)
|
||||
|
||||
if len(graph_edges) > 0:
|
||||
await graph_engine.add_edges(graph_edges)
|
||||
|
||||
return nodes
|
||||
return code_objects
|
||||
|
||||
|
||||
def convert_node(node: CodeFile) -> Node:
|
||||
return Node(
|
||||
id=str(node.id),
|
||||
name=node.extracted_id,
|
||||
type=node.type,
|
||||
description=f"{node.source_code = }",
|
||||
properties={},
|
||||
)
|
||||
|
||||
|
||||
def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge:
|
||||
return Edge(
|
||||
source_node_id=str(extracted_ids_to_ids[edge.source_id]),
|
||||
target_node_id=str(extracted_ids_to_ids[edge.target_id]),
|
||||
relationship_name=f"{edge.type}_{edge.relation}",
|
||||
)
|
||||
|
||||
|
||||
def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]):
|
||||
extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes}
|
||||
graph_nodes = [convert_node(node) for node in nodes]
|
||||
graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges]
|
||||
return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges)
|
||||
|
||||
|
||||
def create_code_file(path, type):
|
||||
def create_code_file(path, type, repo):
|
||||
abspath = os.path.abspath(path)
|
||||
print(f"{path = } - {abspath = }")
|
||||
|
||||
with open(abspath, "r") as f:
|
||||
source_code = f.read()
|
||||
code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
|
||||
return (code_file, abspath)
|
||||
|
||||
|
||||
def create_code_relationship(
|
||||
source_path: str, target_path: str, type: str, relation: str
|
||||
):
|
||||
return CodeRelationship(
|
||||
source_id=source_path, target_id=target_path, type=type, relation=relation
|
||||
code_file = CodeFile(
|
||||
extracted_id = abspath,
|
||||
type = type,
|
||||
source_code = source_code,
|
||||
part_of = repo,
|
||||
)
|
||||
|
||||
return code_file
|
||||
|
||||
|
||||
def code_objects_from_di_graph(
|
||||
graph: nx.DiGraph, repo_path: str
|
||||
|
|
@ -88,18 +37,18 @@ def code_objects_from_di_graph(
|
|||
repo = Repository(path=repo_path)
|
||||
|
||||
code_files = [
|
||||
create_code_file(os.path.join(repo_path, path), "python_file")[0]
|
||||
create_code_file(os.path.join(repo_path, path), "python_file", repo)
|
||||
for path in graph.nodes
|
||||
]
|
||||
|
||||
code_relationships = [
|
||||
create_code_relationship(
|
||||
CodeRelationship(
|
||||
os.path.join(repo_path, source),
|
||||
os.path.join(repo_path, target),
|
||||
"python_file",
|
||||
graph.get_edge_data(source, target, v)["relation"],
|
||||
graph.get_edge_data(source, target)["relation"],
|
||||
)
|
||||
for source, target, v in graph.edges
|
||||
for source, target in graph.edges
|
||||
]
|
||||
|
||||
return (repo, code_files, code_relationships)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
import logging
|
||||
|
||||
logger = logging.getLogger("task:repo_processor")
|
||||
|
||||
from .enrich_dependency_graph import enrich_dependency_graph
|
||||
from .expand_dependency_graph import expand_dependency_graph
|
||||
from .get_repo_dependency_graph import get_repo_dependency_graph
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
import os
|
||||
import aiofiles
|
||||
|
||||
import networkx as nx
|
||||
from typing import Dict, List
|
||||
|
||||
from cognee.tasks.repo_processor.get_local_dependencies import get_local_script_dependencies
|
||||
|
||||
|
|
@ -35,6 +33,7 @@ async def get_py_files_dict(repo_path):
|
|||
|
||||
return py_files_dict
|
||||
|
||||
|
||||
def get_edge(file_path: str, dependency: str, repo_path: str, relative_paths: bool = True) -> tuple:
|
||||
if relative_paths:
|
||||
file_path = os.path.relpath(file_path, repo_path)
|
||||
|
|
@ -58,4 +57,5 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph:
|
|||
dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)
|
||||
dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
|
||||
dependency_graph.add_edges_from(dependency_edges)
|
||||
|
||||
return dependency_graph
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import asyncio
|
||||
from typing import Type, Union
|
||||
from typing import Type
|
||||
from uuid import uuid5
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
|
@ -23,9 +23,9 @@ async def summarize_code(
|
|||
|
||||
summaries = [
|
||||
CodeSummary(
|
||||
id=uuid5(file.id, "CodeSummary"),
|
||||
made_from=file,
|
||||
text=file_summaries[file_index].summary,
|
||||
id = uuid5(file.id, "CodeSummary"),
|
||||
made_from = file,
|
||||
text = file_summaries[file_index].summary,
|
||||
)
|
||||
for (file_index, file) in enumerate(code_files)
|
||||
]
|
||||
|
|
|
|||
|
|
@ -1,37 +1,30 @@
|
|||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
import numpy as np
|
||||
|
||||
from cognee.modules.pipelines import Task, run_tasks
|
||||
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
|
||||
from cognee.shared.CodeGraphEntities import CodeRelationship, Repository
|
||||
from cognee.shared.data_models import SummarizedContent
|
||||
from cognee.tasks.code.get_local_dependencies_checker import (
|
||||
get_local_script_dependencies,
|
||||
)
|
||||
from cognee.tasks.graph.convert_graph_from_code_graph import (
|
||||
create_code_file,
|
||||
convert_graph_from_code_graph,
|
||||
)
|
||||
|
||||
from cognee.tasks.summarization import summarize_code
|
||||
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
|
||||
code_graph_test_data_generation,
|
||||
from cognee.tasks.repo_processor import (
|
||||
enrich_dependency_graph,
|
||||
expand_dependency_graph,
|
||||
get_repo_dependency_graph,
|
||||
)
|
||||
from cognee.tasks.summarization import summarize_code
|
||||
|
||||
|
||||
async def print_results(pipeline):
|
||||
async for result in pipeline:
|
||||
print(result)
|
||||
|
||||
|
||||
def create_code_file(path, type):
|
||||
abspath = os.path.abspath(path)
|
||||
with open(abspath, "r") as f:
|
||||
source_code = f.read()
|
||||
code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
|
||||
return (code_file, abspath)
|
||||
|
||||
|
||||
async def get_local_script_dependencies_wrapper(script_path, repo_path):
|
||||
dependencies = await get_local_script_dependencies(script_path, repo_path)
|
||||
return (script_path, dependencies)
|
||||
|
|
@ -72,11 +65,17 @@ async def scan_repo(path, condition):
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Process a file path")
|
||||
parser.add_argument("path", help="Path to the file")
|
||||
|
||||
args = parser.parse_args()
|
||||
abspath = os.path.abspath(args.path or ".")
|
||||
tasks = [
|
||||
Task(scan_repo),
|
||||
Task(get_repo_dependency_graph),
|
||||
Task(enrich_dependency_graph),
|
||||
Task(expand_dependency_graph),
|
||||
Task(convert_graph_from_code_graph),
|
||||
Task(summarize_code, summarization_model=SummarizedContent),
|
||||
Task(summarize_code, summarization_model = SummarizedContent),
|
||||
]
|
||||
data = ("cognee", lambda file: file.endswith(".py"))
|
||||
pipeline = run_tasks(tasks, data, "cognify_pipeline")
|
||||
pipeline = run_tasks(tasks, abspath, "cognify_code_pipeline")
|
||||
asyncio.run(print_results(pipeline))
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue