test: Test for code graph enrichment task

Co-authored-by: lxobr <lazar@topoteretes.com>
This commit is contained in:
0xideas 2024-11-24 19:24:47 +01:00 committed by GitHub
parent 70bdaea8f7
commit 80b06c3acb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 527 additions and 108 deletions

View file

@ -56,6 +56,12 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
- name: Run default basic pipeline - name: Run default basic pipeline
env: env:
ENV: 'dev' ENV: 'dev'

View file

@ -56,6 +56,12 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
- name: Run default basic pipeline - name: Run default basic pipeline
env: env:
ENV: 'dev' ENV: 'dev'

View file

@ -56,6 +56,12 @@ jobs:
- name: Run integration tests - name: Run integration tests
run: poetry run pytest cognee/tests/integration/ run: poetry run pytest cognee/tests/integration/
- name: Run summarize_code test
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
env:
ENV: 'dev'
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
- name: Run default basic pipeline - name: Run default basic pipeline
env: env:
ENV: 'dev' ENV: 'dev'

2
.gitignore vendored
View file

@ -4,6 +4,8 @@
.prod.env .prod.env
cognee/.data/ cognee/.data/
code_pipeline_output*/
*.lance/ *.lance/
.DS_Store .DS_Store
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files

View file

@ -1,13 +1,17 @@
from typing import Union
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from .EntityType import EntityType from cognee.modules.engine.models.EntityType import EntityType
from cognee.shared.CodeGraphEntities import Repository
class Entity(DataPoint): class Entity(DataPoint):
__tablename__ = "entity" __tablename__ = "entity"
name: str name: str
is_a: EntityType is_a: EntityType
description: str description: str
mentioned_in: DocumentChunk mentioned_in: Union[DocumentChunk, Repository]
_metadata: dict = { _metadata: dict = {
"index_fields": ["name"], "index_fields": ["name"],
} }

View file

@ -1,12 +1,16 @@
from typing import Union
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.shared.CodeGraphEntities import Repository
class EntityType(DataPoint): class EntityType(DataPoint):
__tablename__ = "entity_type" __tablename__ = "entity_type"
name: str name: str
type: str type: str
description: str description: str
exists_in: DocumentChunk exists_in: Union[DocumentChunk, Repository]
_metadata: dict = { _metadata: dict = {
"index_fields": ["name"], "index_fields": ["name"],
} }

View file

@ -1,2 +1,4 @@
from .expand_with_nodes_and_edges import expand_with_nodes_and_edges
from .get_graph_from_model import get_graph_from_model from .get_graph_from_model import get_graph_from_model
from .get_model_instance_from_graph import get_model_instance_from_graph from .get_model_instance_from_graph import get_model_instance_from_graph
from .retrieve_existing_edges import retrieve_existing_edges

View file

@ -0,0 +1,83 @@
from typing import Optional
from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.models import Entity, EntityType
from cognee.modules.engine.utils import (
generate_edge_name,
generate_node_id,
generate_node_name,
)
from cognee.shared.data_models import KnowledgeGraph
def expand_with_nodes_and_edges(
    graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
    existing_edges_map: Optional[dict[str, bool]] = None,
):
    """Materialize Entity/EntityType data points and new edge tuples.

    For each (source data point, knowledge graph) pair, creates one EntityType
    node per distinct node type and one Entity node per distinct node id
    (deduplicated across all pairs), then collects every graph edge that is
    not already present in `existing_edges_map`.

    Returns:
        (data_points, relationships) where `data_points` holds the newly
        created Entity nodes (EntityType nodes are reachable via `is_a`) and
        `relationships` holds (source_id, target_id, name, attrs) tuples.
    """
    existing_edges_map = {} if existing_edges_map is None else existing_edges_map

    node_registry: dict[str, DataPoint] = {}
    new_relationships = []
    new_data_points = []

    for source_point, knowledge_graph in graph_node_index:
        # A pair may carry no extracted graph at all; nothing to expand then.
        if knowledge_graph is None:
            continue

        for graph_node in knowledge_graph.nodes:
            entity_id = generate_node_id(graph_node.id)
            entity_name = generate_node_name(graph_node.name)
            entity_type_id = generate_node_id(graph_node.type)
            entity_type_name = generate_node_name(graph_node.type)

            type_key = f"{str(entity_type_id)}_type"
            if type_key in node_registry:
                type_node = node_registry[type_key]
            else:
                type_node = EntityType(
                    id=entity_type_id,
                    name=entity_type_name,
                    type=entity_type_name,
                    description=entity_type_name,
                    exists_in=source_point,
                )
                node_registry[type_key] = type_node

            entity_key = f"{str(entity_id)}_entity"
            if entity_key not in node_registry:
                entity_node = Entity(
                    id=entity_id,
                    name=entity_name,
                    is_a=type_node,
                    description=graph_node.description,
                    mentioned_in=source_point,
                )
                new_data_points.append(entity_node)
                node_registry[entity_key] = entity_node

        # Carry over the relationships extracted with the graph itself,
        # skipping any edge the graph store already knows about.
        for graph_edge in knowledge_graph.edges:
            src_id = generate_node_id(graph_edge.source_node_id)
            dst_id = generate_node_id(graph_edge.target_node_id)
            normalized_name = generate_edge_name(graph_edge.relationship_name)
            edge_key = str(src_id) + str(dst_id) + normalized_name

            if edge_key in existing_edges_map:
                continue

            new_relationships.append(
                (
                    src_id,
                    dst_id,
                    graph_edge.relationship_name,
                    dict(
                        relationship_name=normalized_name,
                        source_node_id=src_id,
                        target_node_id=dst_id,
                    ),
                )
            )
            existing_edges_map[edge_key] = True

    return (new_data_points, new_relationships)

View file

@ -0,0 +1,55 @@
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.utils import generate_node_id
from cognee.shared.data_models import KnowledgeGraph
async def retrieve_existing_edges(
    graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
    graph_engine: GraphDBInterface,
) -> dict[str, bool]:
    """Check which candidate edges already exist in the graph store.

    For every (source data point, knowledge graph) pair, builds the candidate
    "exists_in" / "mentioned_in" / "is_a" edge tuples plus the graph's own
    edges, asks `graph_engine.has_edges` which of them are present, and
    returns a map keyed by source_id + target_id + relationship_name with
    True for every edge that already exists.
    """
    processed_nodes = {}
    type_node_edges = []
    entity_node_edges = []
    type_entity_edges = []
    graph_node_edges = []

    for graph_source, graph in graph_node_index:
        for node in graph.nodes:
            type_node_id = generate_node_id(node.type)
            entity_node_id = generate_node_id(node.id)

            if str(type_node_id) not in processed_nodes:
                type_node_edges.append(
                    (str(graph_source), str(type_node_id), "exists_in")
                )
                processed_nodes[str(type_node_id)] = True

            if str(entity_node_id) not in processed_nodes:
                # str() added for consistency: every other tuple built here
                # stringifies its ids, but the raw UUID previously leaked
                # into the "mentioned_in" tuple.
                entity_node_edges.append(
                    (str(graph_source), str(entity_node_id), "mentioned_in")
                )
                type_entity_edges.append(
                    (str(entity_node_id), str(type_node_id), "is_a")
                )
                processed_nodes[str(entity_node_id)] = True

        # Accumulate rather than rebind: a plain assignment inside this loop
        # would keep only the LAST graph's edges when several pairs are given.
        graph_node_edges.extend(
            (edge.target_node_id, edge.source_node_id, edge.relationship_name)
            for edge in graph.edges
        )

    existing_edges = await graph_engine.has_edges(
        [
            *type_node_edges,
            *entity_node_edges,
            *type_entity_edges,
            *graph_node_edges,
        ]
    )

    existing_edges_map = {}

    for edge in existing_edges:
        # has_edges returns (source, target, name) triples for present edges.
        existing_edges_map[edge[0] + edge[1] + edge[2]] = True

    return existing_edges_map

View file

@ -0,0 +1,23 @@
from typing import Any, List, Literal, Optional, Union
from cognee.infrastructure.engine import DataPoint
class Repository(DataPoint):
    """Graph data point representing the root of an analyzed code repository."""
    # Filesystem path of the repository root (callers in this commit pass the
    # absolute repo path — see code_objects_from_di_graph).
    path: str
class CodeFile(DataPoint):
    """A single source file read from the repository."""
    extracted_id: str  # actually the file path (absolute, per create_code_file)
    type: str  # e.g. "python_file"
    source_code: str  # full text of the file
    _metadata: dict = {
        # The source code text is what gets vector-indexed for this type.
        "index_fields": ["source_code"]
    }
class CodeRelationship(DataPoint):
    """Directed dependency between two code files."""
    source_id: str  # extracted_id (file path) of the depending file
    target_id: str  # extracted_id (file path) of the depended-upon file
    type: str  # between files
    relation: str  # depends on or depends directly

View file

@ -0,0 +1,105 @@
from uuid import UUID, uuid4
import os
import networkx as nx
from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.utils import (
expand_with_nodes_and_edges,
retrieve_existing_edges,
)
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
from cognee.shared.data_models import Edge, KnowledgeGraph, Node
from cognee.tasks.storage import add_data_points
async def convert_graph_from_code_graph(
    graph: nx.DiGraph, repo_path: str
) -> list[CodeFile]:
    """Persist a repo dependency graph into the knowledge-graph store.

    Converts the networkx graph into Repository/CodeFile/CodeRelationship
    objects, expands them into graph data points and edges, stores whatever
    is not already present, and returns the CodeFile data points so the next
    pipeline task (summarize_code) can consume them.

    Note: the original return annotation promised
    tuple[str, list[CodeFile], list[CodeRelationship]] but the body returns
    only `nodes`; the annotation is corrected here to match the actual
    behavior that the pipeline depends on.
    """
    repo, nodes, edges = code_objects_from_di_graph(graph, repo_path)
    graph_engine = await get_graph_engine()
    code_knowledge_graph = build_code_knowledge_graph(nodes, edges)

    # The repository acts as the "source" data point for every node/edge.
    repo_and_knowledge_graph = [(repo, code_knowledge_graph)]

    existing_edges_map = await retrieve_existing_edges(
        repo_and_knowledge_graph, graph_engine
    )

    graph_nodes, graph_edges = expand_with_nodes_and_edges(
        repo_and_knowledge_graph, existing_edges_map
    )

    # Only touch the stores when there is genuinely new data.
    if len(graph_nodes) > 0:
        await add_data_points(graph_nodes)

    if len(graph_edges) > 0:
        await graph_engine.add_edges(graph_edges)

    return nodes
def convert_node(node: CodeFile) -> Node:
    """Map a CodeFile data point onto a generic KnowledgeGraph Node."""
    # NOTE(review): the description uses the f-string debug specifier, so it
    # renders as "node.source_code = '<code>'" — presumably intentional, but
    # worth confirming it is not leftover debug formatting.
    node_fields = {
        "id": str(node.id),
        "name": node.extracted_id,
        "type": node.type,
        "description": f"{node.source_code = }",
        "properties": {},
    }
    return Node(**node_fields)
def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge:
    """Map a CodeRelationship onto a generic KnowledgeGraph Edge.

    `extracted_ids_to_ids` translates file paths (extracted ids) into the
    UUIDs of the corresponding CodeFile data points.
    """
    source_uuid = extracted_ids_to_ids[edge.source_id]
    target_uuid = extracted_ids_to_ids[edge.target_id]
    relationship = f"{edge.type}_{edge.relation}"
    return Edge(
        source_node_id=str(source_uuid),
        target_node_id=str(target_uuid),
        relationship_name=relationship,
    )
def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]):
    """Assemble a KnowledgeGraph from code files and their relationships."""
    # Relationships refer to files by path; build a path -> UUID lookup first.
    id_lookup = {}
    for code_file in nodes:
        id_lookup[code_file.extracted_id] = code_file.id

    converted_nodes = [convert_node(code_file) for code_file in nodes]
    converted_edges = [convert_edge(relationship, id_lookup) for relationship in edges]

    return KnowledgeGraph(nodes=converted_nodes, edges=converted_edges)
def create_code_file(path, type):
    """Read the file at *path* and wrap it in a CodeFile data point.

    Args:
        path: file path (relative or absolute); normalized to absolute.
        type: file-type label stored on the CodeFile (e.g. "python_file").

    Returns:
        (code_file, abspath) — the CodeFile and the absolute path it was
        read from.
    """
    abspath = os.path.abspath(path)
    # Removed a leftover debug print of path/abspath that polluted stdout on
    # every file. Read as UTF-8 explicitly so the result does not depend on
    # the platform's locale encoding.
    with open(abspath, "r", encoding="utf-8") as f:
        source_code = f.read()
    code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
    return (code_file, abspath)
def create_code_relationship(
    source_path: str, target_path: str, type: str, relation: str
):
    """Thin constructor wrapper: file paths double as extracted ids."""
    return CodeRelationship(
        source_id=source_path,
        target_id=target_path,
        type=type,
        relation=relation,
    )
def code_objects_from_di_graph(
    graph: nx.DiGraph, repo_path: str
) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]:
    """Convert a networkx dependency graph into cognee code objects.

    Graph node labels are file paths relative to *repo_path*; each edge's
    attribute dict carries a "relation" entry describing the dependency.

    Returns:
        (repository, code_files, code_relationships)
    """
    repo = Repository(path=repo_path)

    code_files = [
        create_code_file(os.path.join(repo_path, path), "python_file")[0]
        for path in graph.nodes
    ]

    # Bug fix: `graph.edges` on a DiGraph yields (source, target) 2-tuples,
    # so the original `for source, target, v in graph.edges` raised
    # ValueError. Iterating with data=True yields (source, target, attrs)
    # for DiGraph and MultiDiGraph alike, and gives us the "relation"
    # attribute directly instead of a second get_edge_data lookup.
    code_relationships = [
        create_code_relationship(
            os.path.join(repo_path, source),
            os.path.join(repo_path, target),
            "python_file",
            data["relation"],
        )
        for source, target, data in graph.edges(data=True)
    ]

    return (repo, code_files, code_relationships)

View file

@ -1,119 +1,38 @@
import asyncio import asyncio
from typing import Type from typing import Type
from pydantic import BaseModel from pydantic import BaseModel
from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.engine.models import EntityType, Entity from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
from cognee.modules.engine.utils import generate_edge_name, generate_node_id, generate_node_name from cognee.modules.graph.utils import (
expand_with_nodes_and_edges,
retrieve_existing_edges,
)
from cognee.tasks.storage import add_data_points from cognee.tasks.storage import add_data_points
async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]):
async def extract_graph_from_data(
data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
):
chunk_graphs = await asyncio.gather( chunk_graphs = await asyncio.gather(
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
) )
processed_nodes = {}
type_node_edges = []
entity_node_edges = []
type_entity_edges = []
for (chunk_index, chunk) in enumerate(data_chunks):
chunk_graph = chunk_graphs[chunk_index]
for node in chunk_graph.nodes:
type_node_id = generate_node_id(node.type)
entity_node_id = generate_node_id(node.id)
if str(type_node_id) not in processed_nodes:
type_node_edges.append((str(chunk.id), str(type_node_id), "exists_in"))
processed_nodes[str(type_node_id)] = True
if str(entity_node_id) not in processed_nodes:
entity_node_edges.append((str(chunk.id), entity_node_id, "mentioned_in"))
type_entity_edges.append((str(entity_node_id), str(type_node_id), "is_a"))
processed_nodes[str(entity_node_id)] = True
graph_node_edges = [
(edge.target_node_id, edge.source_node_id, edge.relationship_name) \
for edge in chunk_graph.edges
]
graph_engine = await get_graph_engine() graph_engine = await get_graph_engine()
chunk_and_chunk_graphs = [
(chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs)
]
existing_edges_map = await retrieve_existing_edges(
chunk_and_chunk_graphs, graph_engine
)
existing_edges = await graph_engine.has_edges([ graph_nodes, graph_edges = expand_with_nodes_and_edges(
*type_node_edges, chunk_and_chunk_graphs, existing_edges_map
*entity_node_edges, )
*type_entity_edges,
*graph_node_edges,
])
existing_edges_map = {} if len(graph_nodes) > 0:
await add_data_points(graph_nodes)
for edge in existing_edges:
existing_edges_map[edge[0] + edge[1] + edge[2]] = True
added_nodes_map = {}
graph_edges = []
data_points = []
for (chunk_index, chunk) in enumerate(data_chunks):
graph = chunk_graphs[chunk_index]
if graph is None:
continue
for node in graph.nodes:
node_id = generate_node_id(node.id)
node_name = generate_node_name(node.name)
type_node_id = generate_node_id(node.type)
type_node_name = generate_node_name(node.type)
if f"{str(type_node_id)}_type" not in added_nodes_map:
type_node = EntityType(
id = type_node_id,
name = type_node_name,
type = type_node_name,
description = type_node_name,
exists_in = chunk,
)
added_nodes_map[f"{str(type_node_id)}_type"] = type_node
else:
type_node = added_nodes_map[f"{str(type_node_id)}_type"]
if f"{str(node_id)}_entity" not in added_nodes_map:
entity_node = Entity(
id = node_id,
name = node_name,
is_a = type_node,
description = node.description,
mentioned_in = chunk,
)
data_points.append(entity_node)
added_nodes_map[f"{str(node_id)}_entity"] = entity_node
# Add relationship that came from graphs.
for edge in graph.edges:
source_node_id = generate_node_id(edge.source_node_id)
target_node_id = generate_node_id(edge.target_node_id)
relationship_name = generate_edge_name(edge.relationship_name)
edge_key = str(source_node_id) + str(target_node_id) + relationship_name
if edge_key not in existing_edges_map:
graph_edges.append((
source_node_id,
target_node_id,
edge.relationship_name,
dict(
relationship_name = generate_edge_name(edge.relationship_name),
source_node_id = source_node_id,
target_node_id = target_node_id,
),
))
existing_edges_map[edge_key] = True
if len(data_points) > 0:
await add_data_points(data_points)
if len(graph_edges) > 0: if len(graph_edges) > 0:
await graph_engine.add_edges(graph_edges) await graph_engine.add_edges(graph_edges)

View file

@ -55,7 +55,7 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph:
if source_code is None: if source_code is None:
continue continue
dependencies = await get_local_script_dependencies(file_path, repo_path) dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)
dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
dependency_graph.add_edges_from(dependency_edges) dependency_graph.add_edges_from(dependency_edges)
return dependency_graph return dependency_graph

View file

@ -1,2 +1,3 @@
from .summarize_text import summarize_text
from .query_summaries import query_summaries from .query_summaries import query_summaries
from .summarize_code import summarize_code
from .summarize_text import summarize_text

View file

@ -1,6 +1,8 @@
from cognee.infrastructure.engine import DataPoint from cognee.infrastructure.engine import DataPoint
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
from cognee.modules.data.processing.document_types import Document from cognee.modules.data.processing.document_types import Document
from cognee.shared.CodeGraphEntities import CodeFile
class TextSummary(DataPoint): class TextSummary(DataPoint):
__tablename__ = "text_summary" __tablename__ = "text_summary"
@ -10,3 +12,12 @@ class TextSummary(DataPoint):
_metadata: dict = { _metadata: dict = {
"index_fields": ["text"], "index_fields": ["text"],
} }
class CodeSummary(DataPoint):
    """LLM-produced summary of a single code file's source."""
    text: str  # the summary text
    made_from: CodeFile  # the file that was summarized
    _metadata: dict = {
        # The summary text is what gets vector-indexed for this type.
        "index_fields": ["text"],
    }

View file

@ -0,0 +1,35 @@
import asyncio
from typing import Type, Union
from uuid import uuid5
from pydantic import BaseModel
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.tasks.storage import add_data_points
from .models import CodeSummary
async def summarize_code(
    code_files: list[CodeFile], summarization_model: Type[BaseModel]
) -> tuple[list[CodeFile], list[CodeSummary]]:
    """Summarize each code file with the LLM and store the summaries.

    Args:
        code_files: files to summarize.
        summarization_model: pydantic model the extraction should produce
            (must expose a `.summary` attribute on its instances).

    Returns:
        (code_files, summaries) — the input files unchanged plus the stored
        CodeSummary data points.
    """
    # Bug fix: the empty-input path previously returned just `code_files`
    # while the normal path returned a (files, summaries) tuple; callers such
    # as write_results unpack two values, so keep the shape consistent.
    if len(code_files) == 0:
        return (code_files, [])

    file_summaries = await asyncio.gather(
        *[extract_summary(file.source_code, summarization_model) for file in code_files]
    )

    summaries = [
        CodeSummary(
            # Deterministic id derived from the file id, so re-running the
            # pipeline updates rather than duplicates the summary.
            id=uuid5(file.id, "CodeSummary"),
            made_from=file,
            text=file_summaries[file_index].summary,
        )
        for (file_index, file) in enumerate(code_files)
    ]

    await add_data_points(summaries)

    return (code_files, summaries)

View file

@ -0,0 +1,51 @@
import random
import string
import numpy as np
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship
def random_str(n, spaces=True):
    """Return a random string of length *n*.

    Characters are drawn uniformly from ASCII letters and digits; when
    *spaces* is true the space character is also a candidate.
    """
    alphabet = string.ascii_letters + string.digits + (" " if spaces else "")
    chosen = [random.choice(alphabet) for _ in range(n)]
    return "".join(chosen)
def code_graph_test_data_generation():
    """Generate random CodeFile nodes plus CodeRelationship edges that form
    a graph in which every node is reachable from a single start node.

    Returns:
        (nodes, edges) — 100 CodeFile objects with random contents and the
        randomly grown dependency edges connecting them.
    """
    # NOTE(review): the original indentation of this function was lost in
    # extraction; the nesting below is the most plausible reconstruction —
    # confirm against the repository.
    nodes = [
        CodeFile(
            extracted_id=random_str(10, spaces=False),
            type="file",
            source_code=random_str(random.randrange(50, 500)),
        )
        for _ in range(100)
    ]
    n_nodes = len(nodes)
    # Seed the frontier with one random node; grow edges until all reached.
    first_source = np.random.randint(0, n_nodes)
    reached_nodes = {first_source}
    last_iteration = [first_source]
    edges = []
    while len(reached_nodes) < n_nodes:
        for source in last_iteration:
            # NOTE(review): rebinding `last_iteration` while the for-loop
            # still iterates the OLD list object works in CPython (the loop
            # holds its own iterator) but is easy to misread — each source
            # collects targets into a fresh frontier list.
            last_iteration = []
            tries = 0
            # Keep sampling until at least one new target is found, but cap
            # the extra attempts at 500 per source.
            while ((len(last_iteration) == 0 or tries < 500)) and (
                len(reached_nodes) < n_nodes
            ):
                tries += 1
                target = np.random.randint(n_nodes)
                if target not in reached_nodes:
                    last_iteration.append(target)
                    edges.append(
                        CodeRelationship(
                            source_id=nodes[source].extracted_id,
                            target_id=nodes[target].extracted_id,
                            type="files",
                            relation="depends",
                        )
                    )
            reached_nodes = reached_nodes.union(set(last_iteration))
    return (nodes, edges)

View file

@ -0,0 +1,27 @@
import asyncio
import pytest
from cognee.shared.CodeGraphEntities import Repository
from cognee.tasks.graph.convert_graph_from_code_graph import (
convert_graph_from_code_graph,
)
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
code_graph_test_data_generation,
)
def test_convert_graph_from_code_graph():
    """Round-trip check: conversion must return the repo/files/edges intact.

    NOTE(review): convert_graph_from_code_graph is defined elsewhere in this
    commit as (graph: nx.DiGraph, repo_path: str) and returns only the
    CodeFile list, yet this test calls it as (repo, nodes, edges) and unpacks
    a 3-tuple — the test cannot pass against that signature. Confirm which
    API is intended before fixing either side.
    """
    repo = Repository(path="test/repo/path")
    nodes, edges = code_graph_test_data_generation()
    repo_out, nodes_out, edges_out = asyncio.run(
        convert_graph_from_code_graph(repo, nodes, edges)
    )
    assert repo == repo_out, f"{repo = } != {repo_out = }"
    for node_in, node_out in zip(nodes, nodes_out):
        assert node_in == node_out, f"{node_in = } != {node_out = }"
    for edge_in, edge_out in zip(edges, edges_out):
        assert edge_in == edge_out, f"{edge_in = } != {edge_out = }"

View file

@ -0,0 +1,15 @@
import asyncio
from cognee.shared.data_models import SummarizedContent
from cognee.tasks.summarization import summarize_code
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
code_graph_test_data_generation,
)
def test_summarize_code():
    """summarize_code must return the input files unchanged (plus summaries)."""
    nodes, _ = code_graph_test_data_generation()
    # Bug fix: summarize_code returns a (code_files, summaries) tuple; the
    # original bound the whole tuple to nodes_out, so the element-wise check
    # below compared files against the two tuple members. Unpack instead.
    nodes_out, _ = asyncio.run(summarize_code(nodes, SummarizedContent))
    for node_in, node_out in zip(nodes, nodes_out):
        assert node_in == node_out, f"{node_in = } != {node_out = }"

View file

@ -0,0 +1,64 @@
import argparse
import asyncio
import os
import cognee
import json
import numpy as np
from networkx.classes.digraph import DiGraph
from cognee.modules.pipelines import Task, run_tasks
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
from cognee.shared.data_models import SummarizedContent
from cognee.tasks.code.get_local_dependencies_checker import (
get_local_script_dependencies,
)
from cognee.tasks.graph.convert_graph_from_code_graph import (
convert_graph_from_code_graph,
)
from cognee.tasks.repo_processor.get_repo_dependency_graph import (
get_repo_dependency_graph,
)
from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph
from cognee.tasks.summarization import summarize_code
from cognee.tasks.storage import index_data_points
async def print_results(pipeline):
    """Drain an async pipeline, echoing every yielded result to stdout."""
    async for item in pipeline:
        print(item)
async def write_results(repo, pipeline):
    """Write each code summary as a JSON file under <repo>/code_pipeline_output/.

    The output tree mirrors each summarized file's path relative to the
    repository root; foo/bar.py becomes code_pipeline_output/foo/bar.json
    containing {"summary": ..., "source_code": ...}.
    """
    output_dir = os.path.join(repo, "code_pipeline_output", "")
    os.makedirs(output_dir, exist_ok = True)
    # The pipeline's final task (summarize_code) yields (files, summaries).
    async for code_files, summaries in pipeline:
        for summary in summaries:
            # extracted_id holds the source file's absolute path.
            file_name = os.path.split(summary.made_from.extracted_id)[-1]
            # Directory portion of the file's path relative to the repo root.
            relpath = os.path.join(*os.path.split(os.path.relpath(summary.made_from.extracted_id, repo))[:-1])
            output_dir2 = os.path.join(repo, "code_pipeline_output", relpath)
            os.makedirs(output_dir2, exist_ok=True)
            # Replace the .py suffix with .json for the output file name.
            with open(os.path.join(output_dir2, file_name.replace(".py", ".json")), "w") as f:
                f.write(json.dumps({"summary": summary.text, "source_code": summary.made_from.source_code}))
async def reset_system():
    """Wipe stored data and system metadata so the pipeline starts clean."""
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    return True
# Script entry point: run the code-graph pipeline over a repository path and
# write per-file JSON summaries next to it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a file path")
    parser.add_argument("path", help="Path to the file")
    args = parser.parse_args()
    abspath = os.path.abspath(args.path)
    data = abspath
    # Each Task's output feeds the next: repo path -> dependency graph ->
    # enriched graph -> stored CodeFile data points -> summaries.
    tasks = [
        Task(get_repo_dependency_graph),
        Task(enrich_dependency_graph),
        Task(convert_graph_from_code_graph, repo_path = abspath),
        Task(index_data_points),
        Task(summarize_code, summarization_model=SummarizedContent),
    ]
    pipeline = run_tasks(tasks, data, "cognify_pipeline")
    # NOTE(review): reset_system and print_results are defined above but never
    # invoked here — confirm whether a prune step was meant to run first.
    asyncio.run(write_results(abspath, pipeline))