test: Test for code graph enrichment task
Co-authored-by: lxobr <lazar@topoteretes.com>
This commit is contained in:
parent
70bdaea8f7
commit
80b06c3acb
20 changed files with 527 additions and 108 deletions
6
.github/workflows/test_python_3_10.yml
vendored
6
.github/workflows/test_python_3_10.yml
vendored
|
|
@ -56,6 +56,12 @@ jobs:
|
||||||
- name: Run integration tests
|
- name: Run integration tests
|
||||||
run: poetry run pytest cognee/tests/integration/
|
run: poetry run pytest cognee/tests/integration/
|
||||||
|
|
||||||
|
- name: Run summarize_code test
|
||||||
|
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||||
|
env:
|
||||||
|
ENV: 'dev'
|
||||||
|
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||||
|
|
||||||
- name: Run default basic pipeline
|
- name: Run default basic pipeline
|
||||||
env:
|
env:
|
||||||
ENV: 'dev'
|
ENV: 'dev'
|
||||||
|
|
|
||||||
6
.github/workflows/test_python_3_11.yml
vendored
6
.github/workflows/test_python_3_11.yml
vendored
|
|
@ -56,6 +56,12 @@ jobs:
|
||||||
- name: Run integration tests
|
- name: Run integration tests
|
||||||
run: poetry run pytest cognee/tests/integration/
|
run: poetry run pytest cognee/tests/integration/
|
||||||
|
|
||||||
|
- name: Run summarize_code test
|
||||||
|
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||||
|
env:
|
||||||
|
ENV: 'dev'
|
||||||
|
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||||
|
|
||||||
- name: Run default basic pipeline
|
- name: Run default basic pipeline
|
||||||
env:
|
env:
|
||||||
ENV: 'dev'
|
ENV: 'dev'
|
||||||
|
|
|
||||||
6
.github/workflows/test_python_3_9.yml
vendored
6
.github/workflows/test_python_3_9.yml
vendored
|
|
@ -56,6 +56,12 @@ jobs:
|
||||||
- name: Run integration tests
|
- name: Run integration tests
|
||||||
run: poetry run pytest cognee/tests/integration/
|
run: poetry run pytest cognee/tests/integration/
|
||||||
|
|
||||||
|
- name: Run summarize_code test
|
||||||
|
run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
|
||||||
|
env:
|
||||||
|
ENV: 'dev'
|
||||||
|
LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||||
|
|
||||||
- name: Run default basic pipeline
|
- name: Run default basic pipeline
|
||||||
env:
|
env:
|
||||||
ENV: 'dev'
|
ENV: 'dev'
|
||||||
|
|
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -4,6 +4,8 @@
|
||||||
.prod.env
|
.prod.env
|
||||||
cognee/.data/
|
cognee/.data/
|
||||||
|
|
||||||
|
code_pipeline_output*/
|
||||||
|
|
||||||
*.lance/
|
*.lance/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,17 @@
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||||
from .EntityType import EntityType
|
from cognee.modules.engine.models.EntityType import EntityType
|
||||||
|
from cognee.shared.CodeGraphEntities import Repository
|
||||||
|
|
||||||
|
|
||||||
class Entity(DataPoint):
|
class Entity(DataPoint):
|
||||||
__tablename__ = "entity"
|
__tablename__ = "entity"
|
||||||
name: str
|
name: str
|
||||||
is_a: EntityType
|
is_a: EntityType
|
||||||
description: str
|
description: str
|
||||||
mentioned_in: DocumentChunk
|
mentioned_in: Union[DocumentChunk, Repository]
|
||||||
_metadata: dict = {
|
_metadata: dict = {
|
||||||
"index_fields": ["name"],
|
"index_fields": ["name"],
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,16 @@
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||||
|
from cognee.shared.CodeGraphEntities import Repository
|
||||||
|
|
||||||
|
|
||||||
class EntityType(DataPoint):
|
class EntityType(DataPoint):
|
||||||
__tablename__ = "entity_type"
|
__tablename__ = "entity_type"
|
||||||
name: str
|
name: str
|
||||||
type: str
|
type: str
|
||||||
description: str
|
description: str
|
||||||
exists_in: DocumentChunk
|
exists_in: Union[DocumentChunk, Repository]
|
||||||
_metadata: dict = {
|
_metadata: dict = {
|
||||||
"index_fields": ["name"],
|
"index_fields": ["name"],
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,4 @@
|
||||||
|
from .expand_with_nodes_and_edges import expand_with_nodes_and_edges
|
||||||
from .get_graph_from_model import get_graph_from_model
|
from .get_graph_from_model import get_graph_from_model
|
||||||
from .get_model_instance_from_graph import get_model_instance_from_graph
|
from .get_model_instance_from_graph import get_model_instance_from_graph
|
||||||
|
from .retrieve_existing_edges import retrieve_existing_edges
|
||||||
|
|
|
||||||
83
cognee/modules/graph/utils/expand_with_nodes_and_edges.py
Normal file
83
cognee/modules/graph/utils/expand_with_nodes_and_edges.py
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.modules.engine.models import Entity, EntityType
|
||||||
|
from cognee.modules.engine.utils import (
|
||||||
|
generate_edge_name,
|
||||||
|
generate_node_id,
|
||||||
|
generate_node_name,
|
||||||
|
)
|
||||||
|
from cognee.shared.data_models import KnowledgeGraph
|
||||||
|
|
||||||
|
|
||||||
|
def expand_with_nodes_and_edges(
|
||||||
|
graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
|
||||||
|
existing_edges_map: Optional[dict[str, bool]] = None,
|
||||||
|
):
|
||||||
|
if existing_edges_map is None:
|
||||||
|
existing_edges_map = {}
|
||||||
|
|
||||||
|
added_nodes_map = {}
|
||||||
|
relationships = []
|
||||||
|
data_points = []
|
||||||
|
|
||||||
|
for graph_source, graph in graph_node_index:
|
||||||
|
if graph is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for node in graph.nodes:
|
||||||
|
node_id = generate_node_id(node.id)
|
||||||
|
node_name = generate_node_name(node.name)
|
||||||
|
|
||||||
|
type_node_id = generate_node_id(node.type)
|
||||||
|
type_node_name = generate_node_name(node.type)
|
||||||
|
|
||||||
|
if f"{str(type_node_id)}_type" not in added_nodes_map:
|
||||||
|
type_node = EntityType(
|
||||||
|
id=type_node_id,
|
||||||
|
name=type_node_name,
|
||||||
|
type=type_node_name,
|
||||||
|
description=type_node_name,
|
||||||
|
exists_in=graph_source,
|
||||||
|
)
|
||||||
|
added_nodes_map[f"{str(type_node_id)}_type"] = type_node
|
||||||
|
else:
|
||||||
|
type_node = added_nodes_map[f"{str(type_node_id)}_type"]
|
||||||
|
|
||||||
|
if f"{str(node_id)}_entity" not in added_nodes_map:
|
||||||
|
entity_node = Entity(
|
||||||
|
id=node_id,
|
||||||
|
name=node_name,
|
||||||
|
is_a=type_node,
|
||||||
|
description=node.description,
|
||||||
|
mentioned_in=graph_source,
|
||||||
|
)
|
||||||
|
data_points.append(entity_node)
|
||||||
|
added_nodes_map[f"{str(node_id)}_entity"] = entity_node
|
||||||
|
|
||||||
|
# Add relationship that came from graphs.
|
||||||
|
for edge in graph.edges:
|
||||||
|
source_node_id = generate_node_id(edge.source_node_id)
|
||||||
|
target_node_id = generate_node_id(edge.target_node_id)
|
||||||
|
relationship_name = generate_edge_name(edge.relationship_name)
|
||||||
|
|
||||||
|
edge_key = str(source_node_id) + str(target_node_id) + relationship_name
|
||||||
|
|
||||||
|
if edge_key not in existing_edges_map:
|
||||||
|
relationships.append(
|
||||||
|
(
|
||||||
|
source_node_id,
|
||||||
|
target_node_id,
|
||||||
|
edge.relationship_name,
|
||||||
|
dict(
|
||||||
|
relationship_name=generate_edge_name(
|
||||||
|
edge.relationship_name
|
||||||
|
),
|
||||||
|
source_node_id=source_node_id,
|
||||||
|
target_node_id=target_node_id,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
existing_edges_map[edge_key] = True
|
||||||
|
|
||||||
|
return (data_points, relationships)
|
||||||
55
cognee/modules/graph/utils/retrieve_existing_edges.py
Normal file
55
cognee/modules/graph/utils/retrieve_existing_edges.py
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
from cognee.modules.engine.utils import generate_node_id
|
||||||
|
from cognee.shared.data_models import KnowledgeGraph
|
||||||
|
|
||||||
|
|
||||||
|
async def retrieve_existing_edges(
|
||||||
|
graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
|
||||||
|
graph_engine: GraphDBInterface,
|
||||||
|
) -> dict[str, bool]:
|
||||||
|
processed_nodes = {}
|
||||||
|
type_node_edges = []
|
||||||
|
entity_node_edges = []
|
||||||
|
type_entity_edges = []
|
||||||
|
|
||||||
|
for graph_source, graph in graph_node_index:
|
||||||
|
for node in graph.nodes:
|
||||||
|
type_node_id = generate_node_id(node.type)
|
||||||
|
entity_node_id = generate_node_id(node.id)
|
||||||
|
|
||||||
|
if str(type_node_id) not in processed_nodes:
|
||||||
|
type_node_edges.append(
|
||||||
|
(str(graph_source), str(type_node_id), "exists_in")
|
||||||
|
)
|
||||||
|
processed_nodes[str(type_node_id)] = True
|
||||||
|
|
||||||
|
if str(entity_node_id) not in processed_nodes:
|
||||||
|
entity_node_edges.append(
|
||||||
|
(str(graph_source), entity_node_id, "mentioned_in")
|
||||||
|
)
|
||||||
|
type_entity_edges.append(
|
||||||
|
(str(entity_node_id), str(type_node_id), "is_a")
|
||||||
|
)
|
||||||
|
processed_nodes[str(entity_node_id)] = True
|
||||||
|
|
||||||
|
graph_node_edges = [
|
||||||
|
(edge.target_node_id, edge.source_node_id, edge.relationship_name)
|
||||||
|
for edge in graph.edges
|
||||||
|
]
|
||||||
|
|
||||||
|
existing_edges = await graph_engine.has_edges(
|
||||||
|
[
|
||||||
|
*type_node_edges,
|
||||||
|
*entity_node_edges,
|
||||||
|
*type_entity_edges,
|
||||||
|
*graph_node_edges,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
existing_edges_map = {}
|
||||||
|
|
||||||
|
for edge in existing_edges:
|
||||||
|
existing_edges_map[edge[0] + edge[1] + edge[2]] = True
|
||||||
|
|
||||||
|
return existing_edges_map
|
||||||
23
cognee/shared/CodeGraphEntities.py
Normal file
23
cognee/shared/CodeGraphEntities.py
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
from typing import Any, List, Literal, Optional, Union
|
||||||
|
|
||||||
|
from cognee.infrastructure.engine import DataPoint
|
||||||
|
|
||||||
|
|
||||||
|
class Repository(DataPoint):
|
||||||
|
path: str
|
||||||
|
|
||||||
|
|
||||||
|
class CodeFile(DataPoint):
|
||||||
|
extracted_id: str # actually file path
|
||||||
|
type: str
|
||||||
|
source_code: str
|
||||||
|
|
||||||
|
_metadata: dict = {
|
||||||
|
"index_fields": ["source_code"]
|
||||||
|
}
|
||||||
|
|
||||||
|
class CodeRelationship(DataPoint):
|
||||||
|
source_id: str
|
||||||
|
target_id: str
|
||||||
|
type: str # between files
|
||||||
|
relation: str # depends on or depends directly
|
||||||
105
cognee/tasks/graph/convert_graph_from_code_graph.py
Normal file
105
cognee/tasks/graph/convert_graph_from_code_graph.py
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
from uuid import UUID, uuid4
|
||||||
|
import os
|
||||||
|
import networkx as nx
|
||||||
|
|
||||||
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
|
from cognee.modules.graph.utils import (
|
||||||
|
expand_with_nodes_and_edges,
|
||||||
|
retrieve_existing_edges,
|
||||||
|
)
|
||||||
|
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
|
||||||
|
from cognee.shared.data_models import Edge, KnowledgeGraph, Node
|
||||||
|
from cognee.tasks.storage import add_data_points
|
||||||
|
|
||||||
|
|
||||||
|
async def convert_graph_from_code_graph(
|
||||||
|
graph: nx.DiGraph, repo_path: str
|
||||||
|
) -> tuple[str, list[CodeFile], list[CodeRelationship]]:
|
||||||
|
|
||||||
|
repo, nodes, edges = code_objects_from_di_graph(graph, repo_path)
|
||||||
|
|
||||||
|
graph_engine = await get_graph_engine()
|
||||||
|
|
||||||
|
code_knowledge_graph = build_code_knowledge_graph(nodes, edges)
|
||||||
|
repo_and_knowledge_graph = [(repo, code_knowledge_graph)]
|
||||||
|
|
||||||
|
existing_edges_map = await retrieve_existing_edges(
|
||||||
|
repo_and_knowledge_graph, graph_engine
|
||||||
|
)
|
||||||
|
|
||||||
|
graph_nodes, graph_edges = expand_with_nodes_and_edges(
|
||||||
|
repo_and_knowledge_graph, existing_edges_map
|
||||||
|
)
|
||||||
|
|
||||||
|
if len(graph_nodes) > 0:
|
||||||
|
await add_data_points(graph_nodes)
|
||||||
|
|
||||||
|
if len(graph_edges) > 0:
|
||||||
|
await graph_engine.add_edges(graph_edges)
|
||||||
|
|
||||||
|
return nodes
|
||||||
|
|
||||||
|
|
||||||
|
def convert_node(node: CodeFile) -> Node:
|
||||||
|
return Node(
|
||||||
|
id=str(node.id),
|
||||||
|
name=node.extracted_id,
|
||||||
|
type=node.type,
|
||||||
|
description=f"{node.source_code = }",
|
||||||
|
properties={},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge:
|
||||||
|
return Edge(
|
||||||
|
source_node_id=str(extracted_ids_to_ids[edge.source_id]),
|
||||||
|
target_node_id=str(extracted_ids_to_ids[edge.target_id]),
|
||||||
|
relationship_name=f"{edge.type}_{edge.relation}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]):
|
||||||
|
extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes}
|
||||||
|
graph_nodes = [convert_node(node) for node in nodes]
|
||||||
|
graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges]
|
||||||
|
return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges)
|
||||||
|
|
||||||
|
|
||||||
|
def create_code_file(path, type):
|
||||||
|
abspath = os.path.abspath(path)
|
||||||
|
print(f"{path = } - {abspath = }")
|
||||||
|
with open(abspath, "r") as f:
|
||||||
|
source_code = f.read()
|
||||||
|
code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
|
||||||
|
return (code_file, abspath)
|
||||||
|
|
||||||
|
|
||||||
|
def create_code_relationship(
|
||||||
|
source_path: str, target_path: str, type: str, relation: str
|
||||||
|
):
|
||||||
|
return CodeRelationship(
|
||||||
|
source_id=source_path, target_id=target_path, type=type, relation=relation
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def code_objects_from_di_graph(
|
||||||
|
graph: nx.DiGraph, repo_path: str
|
||||||
|
) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]:
|
||||||
|
repo = Repository(path=repo_path)
|
||||||
|
|
||||||
|
code_files = [
|
||||||
|
create_code_file(os.path.join(repo_path, path), "python_file")[0]
|
||||||
|
for path in graph.nodes
|
||||||
|
]
|
||||||
|
|
||||||
|
code_relationships = [
|
||||||
|
create_code_relationship(
|
||||||
|
os.path.join(repo_path, source),
|
||||||
|
os.path.join(repo_path, target),
|
||||||
|
"python_file",
|
||||||
|
graph.get_edge_data(source, target, v)["relation"],
|
||||||
|
)
|
||||||
|
for source, target, v in graph.edges
|
||||||
|
]
|
||||||
|
|
||||||
|
return (repo, code_files, code_relationships)
|
||||||
|
|
@ -1,119 +1,38 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
from typing import Type
|
from typing import Type
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from cognee.infrastructure.databases.graph import get_graph_engine
|
from cognee.infrastructure.databases.graph import get_graph_engine
|
||||||
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
|
|
||||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||||
from cognee.modules.engine.models import EntityType, Entity
|
from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
|
||||||
from cognee.modules.engine.utils import generate_edge_name, generate_node_id, generate_node_name
|
from cognee.modules.graph.utils import (
|
||||||
|
expand_with_nodes_and_edges,
|
||||||
|
retrieve_existing_edges,
|
||||||
|
)
|
||||||
from cognee.tasks.storage import add_data_points
|
from cognee.tasks.storage import add_data_points
|
||||||
|
|
||||||
async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]):
|
|
||||||
|
async def extract_graph_from_data(
|
||||||
|
data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
|
||||||
|
):
|
||||||
chunk_graphs = await asyncio.gather(
|
chunk_graphs = await asyncio.gather(
|
||||||
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
|
*[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
|
||||||
)
|
)
|
||||||
|
|
||||||
processed_nodes = {}
|
|
||||||
type_node_edges = []
|
|
||||||
entity_node_edges = []
|
|
||||||
type_entity_edges = []
|
|
||||||
|
|
||||||
for (chunk_index, chunk) in enumerate(data_chunks):
|
|
||||||
chunk_graph = chunk_graphs[chunk_index]
|
|
||||||
for node in chunk_graph.nodes:
|
|
||||||
type_node_id = generate_node_id(node.type)
|
|
||||||
entity_node_id = generate_node_id(node.id)
|
|
||||||
|
|
||||||
if str(type_node_id) not in processed_nodes:
|
|
||||||
type_node_edges.append((str(chunk.id), str(type_node_id), "exists_in"))
|
|
||||||
processed_nodes[str(type_node_id)] = True
|
|
||||||
|
|
||||||
if str(entity_node_id) not in processed_nodes:
|
|
||||||
entity_node_edges.append((str(chunk.id), entity_node_id, "mentioned_in"))
|
|
||||||
type_entity_edges.append((str(entity_node_id), str(type_node_id), "is_a"))
|
|
||||||
processed_nodes[str(entity_node_id)] = True
|
|
||||||
|
|
||||||
graph_node_edges = [
|
|
||||||
(edge.target_node_id, edge.source_node_id, edge.relationship_name) \
|
|
||||||
for edge in chunk_graph.edges
|
|
||||||
]
|
|
||||||
|
|
||||||
graph_engine = await get_graph_engine()
|
graph_engine = await get_graph_engine()
|
||||||
|
chunk_and_chunk_graphs = [
|
||||||
|
(chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs)
|
||||||
|
]
|
||||||
|
existing_edges_map = await retrieve_existing_edges(
|
||||||
|
chunk_and_chunk_graphs, graph_engine
|
||||||
|
)
|
||||||
|
|
||||||
existing_edges = await graph_engine.has_edges([
|
graph_nodes, graph_edges = expand_with_nodes_and_edges(
|
||||||
*type_node_edges,
|
chunk_and_chunk_graphs, existing_edges_map
|
||||||
*entity_node_edges,
|
)
|
||||||
*type_entity_edges,
|
|
||||||
*graph_node_edges,
|
|
||||||
])
|
|
||||||
|
|
||||||
existing_edges_map = {}
|
if len(graph_nodes) > 0:
|
||||||
|
await add_data_points(graph_nodes)
|
||||||
for edge in existing_edges:
|
|
||||||
existing_edges_map[edge[0] + edge[1] + edge[2]] = True
|
|
||||||
|
|
||||||
added_nodes_map = {}
|
|
||||||
graph_edges = []
|
|
||||||
data_points = []
|
|
||||||
|
|
||||||
for (chunk_index, chunk) in enumerate(data_chunks):
|
|
||||||
graph = chunk_graphs[chunk_index]
|
|
||||||
if graph is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for node in graph.nodes:
|
|
||||||
node_id = generate_node_id(node.id)
|
|
||||||
node_name = generate_node_name(node.name)
|
|
||||||
|
|
||||||
type_node_id = generate_node_id(node.type)
|
|
||||||
type_node_name = generate_node_name(node.type)
|
|
||||||
|
|
||||||
if f"{str(type_node_id)}_type" not in added_nodes_map:
|
|
||||||
type_node = EntityType(
|
|
||||||
id = type_node_id,
|
|
||||||
name = type_node_name,
|
|
||||||
type = type_node_name,
|
|
||||||
description = type_node_name,
|
|
||||||
exists_in = chunk,
|
|
||||||
)
|
|
||||||
added_nodes_map[f"{str(type_node_id)}_type"] = type_node
|
|
||||||
else:
|
|
||||||
type_node = added_nodes_map[f"{str(type_node_id)}_type"]
|
|
||||||
|
|
||||||
if f"{str(node_id)}_entity" not in added_nodes_map:
|
|
||||||
entity_node = Entity(
|
|
||||||
id = node_id,
|
|
||||||
name = node_name,
|
|
||||||
is_a = type_node,
|
|
||||||
description = node.description,
|
|
||||||
mentioned_in = chunk,
|
|
||||||
)
|
|
||||||
data_points.append(entity_node)
|
|
||||||
added_nodes_map[f"{str(node_id)}_entity"] = entity_node
|
|
||||||
|
|
||||||
# Add relationship that came from graphs.
|
|
||||||
for edge in graph.edges:
|
|
||||||
source_node_id = generate_node_id(edge.source_node_id)
|
|
||||||
target_node_id = generate_node_id(edge.target_node_id)
|
|
||||||
relationship_name = generate_edge_name(edge.relationship_name)
|
|
||||||
|
|
||||||
edge_key = str(source_node_id) + str(target_node_id) + relationship_name
|
|
||||||
|
|
||||||
if edge_key not in existing_edges_map:
|
|
||||||
graph_edges.append((
|
|
||||||
source_node_id,
|
|
||||||
target_node_id,
|
|
||||||
edge.relationship_name,
|
|
||||||
dict(
|
|
||||||
relationship_name = generate_edge_name(edge.relationship_name),
|
|
||||||
source_node_id = source_node_id,
|
|
||||||
target_node_id = target_node_id,
|
|
||||||
),
|
|
||||||
))
|
|
||||||
existing_edges_map[edge_key] = True
|
|
||||||
|
|
||||||
if len(data_points) > 0:
|
|
||||||
await add_data_points(data_points)
|
|
||||||
|
|
||||||
if len(graph_edges) > 0:
|
if len(graph_edges) > 0:
|
||||||
await graph_engine.add_edges(graph_edges)
|
await graph_engine.add_edges(graph_edges)
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,7 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph:
|
||||||
if source_code is None:
|
if source_code is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
dependencies = await get_local_script_dependencies(file_path, repo_path)
|
dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)
|
||||||
dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
|
dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
|
||||||
dependency_graph.add_edges_from(dependency_edges)
|
dependency_graph.add_edges_from(dependency_edges)
|
||||||
return dependency_graph
|
return dependency_graph
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,3 @@
|
||||||
from .summarize_text import summarize_text
|
|
||||||
from .query_summaries import query_summaries
|
from .query_summaries import query_summaries
|
||||||
|
from .summarize_code import summarize_code
|
||||||
|
from .summarize_text import summarize_text
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,8 @@
|
||||||
from cognee.infrastructure.engine import DataPoint
|
from cognee.infrastructure.engine import DataPoint
|
||||||
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
|
||||||
from cognee.modules.data.processing.document_types import Document
|
from cognee.modules.data.processing.document_types import Document
|
||||||
|
from cognee.shared.CodeGraphEntities import CodeFile
|
||||||
|
|
||||||
|
|
||||||
class TextSummary(DataPoint):
|
class TextSummary(DataPoint):
|
||||||
__tablename__ = "text_summary"
|
__tablename__ = "text_summary"
|
||||||
|
|
@ -10,3 +12,12 @@ class TextSummary(DataPoint):
|
||||||
_metadata: dict = {
|
_metadata: dict = {
|
||||||
"index_fields": ["text"],
|
"index_fields": ["text"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class CodeSummary(DataPoint):
|
||||||
|
text: str
|
||||||
|
made_from: CodeFile
|
||||||
|
|
||||||
|
_metadata: dict = {
|
||||||
|
"index_fields": ["text"],
|
||||||
|
}
|
||||||
|
|
|
||||||
35
cognee/tasks/summarization/summarize_code.py
Normal file
35
cognee/tasks/summarization/summarize_code.py
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
import asyncio
|
||||||
|
from typing import Type, Union
|
||||||
|
from uuid import uuid5
|
||||||
|
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from cognee.modules.data.extraction.extract_summary import extract_summary
|
||||||
|
from cognee.shared.CodeGraphEntities import CodeFile
|
||||||
|
from cognee.tasks.storage import add_data_points
|
||||||
|
|
||||||
|
from .models import CodeSummary
|
||||||
|
|
||||||
|
|
||||||
|
async def summarize_code(
|
||||||
|
code_files: list[CodeFile], summarization_model: Type[BaseModel]
|
||||||
|
) -> list[CodeFile]:
|
||||||
|
if len(code_files) == 0:
|
||||||
|
return code_files
|
||||||
|
|
||||||
|
file_summaries = await asyncio.gather(
|
||||||
|
*[extract_summary(file.source_code, summarization_model) for file in code_files]
|
||||||
|
)
|
||||||
|
|
||||||
|
summaries = [
|
||||||
|
CodeSummary(
|
||||||
|
id=uuid5(file.id, "CodeSummary"),
|
||||||
|
made_from=file,
|
||||||
|
text=file_summaries[file_index].summary,
|
||||||
|
)
|
||||||
|
for (file_index, file) in enumerate(code_files)
|
||||||
|
]
|
||||||
|
|
||||||
|
await add_data_points(summaries)
|
||||||
|
|
||||||
|
return code_files, summaries
|
||||||
51
cognee/tests/tasks/graph/code_graph_test_data_generation.py
Normal file
51
cognee/tests/tasks/graph/code_graph_test_data_generation.py
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship
|
||||||
|
|
||||||
|
|
||||||
|
def random_str(n, spaces=True):
|
||||||
|
candidates = string.ascii_letters + string.digits
|
||||||
|
if spaces:
|
||||||
|
candidates += " "
|
||||||
|
return "".join(random.choice(candidates) for _ in range(n))
|
||||||
|
|
||||||
|
|
||||||
|
def code_graph_test_data_generation():
|
||||||
|
nodes = [
|
||||||
|
CodeFile(
|
||||||
|
extracted_id=random_str(10, spaces=False),
|
||||||
|
type="file",
|
||||||
|
source_code=random_str(random.randrange(50, 500)),
|
||||||
|
)
|
||||||
|
for _ in range(100)
|
||||||
|
]
|
||||||
|
n_nodes = len(nodes)
|
||||||
|
first_source = np.random.randint(0, n_nodes)
|
||||||
|
reached_nodes = {first_source}
|
||||||
|
last_iteration = [first_source]
|
||||||
|
edges = []
|
||||||
|
while len(reached_nodes) < n_nodes:
|
||||||
|
for source in last_iteration:
|
||||||
|
last_iteration = []
|
||||||
|
tries = 0
|
||||||
|
while ((len(last_iteration) == 0 or tries < 500)) and (
|
||||||
|
len(reached_nodes) < n_nodes
|
||||||
|
):
|
||||||
|
tries += 1
|
||||||
|
target = np.random.randint(n_nodes)
|
||||||
|
if target not in reached_nodes:
|
||||||
|
last_iteration.append(target)
|
||||||
|
edges.append(
|
||||||
|
CodeRelationship(
|
||||||
|
source_id=nodes[source].extracted_id,
|
||||||
|
target_id=nodes[target].extracted_id,
|
||||||
|
type="files",
|
||||||
|
relation="depends",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
reached_nodes = reached_nodes.union(set(last_iteration))
|
||||||
|
|
||||||
|
return (nodes, edges)
|
||||||
|
|
@ -0,0 +1,27 @@
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from cognee.shared.CodeGraphEntities import Repository
|
||||||
|
from cognee.tasks.graph.convert_graph_from_code_graph import (
|
||||||
|
convert_graph_from_code_graph,
|
||||||
|
)
|
||||||
|
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
|
||||||
|
code_graph_test_data_generation,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_graph_from_code_graph():
|
||||||
|
repo = Repository(path="test/repo/path")
|
||||||
|
nodes, edges = code_graph_test_data_generation()
|
||||||
|
repo_out, nodes_out, edges_out = asyncio.run(
|
||||||
|
convert_graph_from_code_graph(repo, nodes, edges)
|
||||||
|
)
|
||||||
|
|
||||||
|
assert repo == repo_out, f"{repo = } != {repo_out = }"
|
||||||
|
|
||||||
|
for node_in, node_out in zip(nodes, nodes_out):
|
||||||
|
assert node_in == node_out, f"{node_in = } != {node_out = }"
|
||||||
|
|
||||||
|
for edge_in, edge_out in zip(edges, edges_out):
|
||||||
|
assert edge_in == edge_out, f"{edge_in = } != {edge_out = }"
|
||||||
15
cognee/tests/tasks/summarization/summarize_code_test.py
Normal file
15
cognee/tests/tasks/summarization/summarize_code_test.py
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
from cognee.shared.data_models import SummarizedContent
|
||||||
|
from cognee.tasks.summarization import summarize_code
|
||||||
|
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
|
||||||
|
code_graph_test_data_generation,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_summarize_code():
|
||||||
|
nodes, _ = code_graph_test_data_generation()
|
||||||
|
nodes_out = asyncio.run(summarize_code(nodes, SummarizedContent))
|
||||||
|
|
||||||
|
for node_in, node_out in zip(nodes, nodes_out):
|
||||||
|
assert node_in == node_out, f"{node_in = } != {node_out = }"
|
||||||
64
examples/python/code_graph_pipeline.py
Normal file
64
examples/python/code_graph_pipeline.py
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import cognee
|
||||||
|
import json
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from networkx.classes.digraph import DiGraph
|
||||||
|
|
||||||
|
from cognee.modules.pipelines import Task, run_tasks
|
||||||
|
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
|
||||||
|
from cognee.shared.data_models import SummarizedContent
|
||||||
|
from cognee.tasks.code.get_local_dependencies_checker import (
|
||||||
|
get_local_script_dependencies,
|
||||||
|
)
|
||||||
|
from cognee.tasks.graph.convert_graph_from_code_graph import (
|
||||||
|
convert_graph_from_code_graph,
|
||||||
|
)
|
||||||
|
from cognee.tasks.repo_processor.get_repo_dependency_graph import (
|
||||||
|
get_repo_dependency_graph,
|
||||||
|
)
|
||||||
|
from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph
|
||||||
|
from cognee.tasks.summarization import summarize_code
|
||||||
|
from cognee.tasks.storage import index_data_points
|
||||||
|
|
||||||
|
async def print_results(pipeline):
|
||||||
|
async for result in pipeline:
|
||||||
|
print(result)
|
||||||
|
|
||||||
|
async def write_results(repo, pipeline):
|
||||||
|
output_dir = os.path.join(repo, "code_pipeline_output", "")
|
||||||
|
os.makedirs(output_dir, exist_ok = True)
|
||||||
|
async for code_files, summaries in pipeline:
|
||||||
|
for summary in summaries:
|
||||||
|
file_name = os.path.split(summary.made_from.extracted_id)[-1]
|
||||||
|
relpath = os.path.join(*os.path.split(os.path.relpath(summary.made_from.extracted_id, repo))[:-1])
|
||||||
|
output_dir2 = os.path.join(repo, "code_pipeline_output", relpath)
|
||||||
|
os.makedirs(output_dir2, exist_ok=True)
|
||||||
|
with open(os.path.join(output_dir2, file_name.replace(".py", ".json")), "w") as f:
|
||||||
|
f.write(json.dumps({"summary": summary.text, "source_code": summary.made_from.source_code}))
|
||||||
|
|
||||||
|
async def reset_system():
|
||||||
|
await cognee.prune.prune_data()
|
||||||
|
await cognee.prune.prune_system(metadata=True)
|
||||||
|
return(True)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Process a file path")
|
||||||
|
parser.add_argument("path", help="Path to the file")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
abspath = os.path.abspath(args.path)
|
||||||
|
data = abspath
|
||||||
|
tasks = [
|
||||||
|
Task(get_repo_dependency_graph),
|
||||||
|
Task(enrich_dependency_graph),
|
||||||
|
Task(convert_graph_from_code_graph, repo_path = abspath),
|
||||||
|
Task(index_data_points),
|
||||||
|
Task(summarize_code, summarization_model=SummarizedContent),
|
||||||
|
]
|
||||||
|
pipeline = run_tasks(tasks, data, "cognify_pipeline")
|
||||||
|
|
||||||
|
asyncio.run(write_results(abspath, pipeline))
|
||||||
Loading…
Add table
Reference in a new issue