diff --git a/.github/workflows/test_python_3_10.yml b/.github/workflows/test_python_3_10.yml index 7f762d778..a7ea005ef 100644 --- a/.github/workflows/test_python_3_10.yml +++ b/.github/workflows/test_python_3_10.yml @@ -56,6 +56,12 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ + - name: Run summarize_code test + run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + env: + ENV: 'dev' + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + - name: Run default basic pipeline env: ENV: 'dev' diff --git a/.github/workflows/test_python_3_11.yml b/.github/workflows/test_python_3_11.yml index b05d901dc..18b04cd94 100644 --- a/.github/workflows/test_python_3_11.yml +++ b/.github/workflows/test_python_3_11.yml @@ -56,6 +56,12 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ + - name: Run summarize_code test + run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + env: + ENV: 'dev' + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + - name: Run default basic pipeline env: ENV: 'dev' diff --git a/.github/workflows/test_python_3_9.yml b/.github/workflows/test_python_3_9.yml index 47c5ddc41..54194bd19 100644 --- a/.github/workflows/test_python_3_9.yml +++ b/.github/workflows/test_python_3_9.yml @@ -56,6 +56,12 @@ jobs: - name: Run integration tests run: poetry run pytest cognee/tests/integration/ + - name: Run summarize_code test + run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py + env: + ENV: 'dev' + LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }} + - name: Run default basic pipeline env: ENV: 'dev' diff --git a/.gitignore b/.gitignore index d256013d2..47fa54130 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ .prod.env cognee/.data/ +code_pipeline_output*/ + *.lance/ .DS_Store # Byte-compiled / optimized / DLL files diff --git a/cognee/modules/engine/models/Entity.py b/cognee/modules/engine/models/Entity.py index cf946ceb6..c5579a610 100644 --- a/cognee/modules/engine/models/Entity.py +++ b/cognee/modules/engine/models/Entity.py @@ -1,13 +1,17 @@ +from typing import Union + from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models.DocumentChunk import DocumentChunk -from .EntityType import EntityType +from cognee.modules.engine.models.EntityType import EntityType +from cognee.shared.CodeGraphEntities import Repository + class Entity(DataPoint): __tablename__ = "entity" name: str is_a: EntityType description: str - mentioned_in: DocumentChunk + mentioned_in: Union[DocumentChunk, Repository] _metadata: dict = { "index_fields": ["name"], } diff --git a/cognee/modules/engine/models/EntityType.py b/cognee/modules/engine/models/EntityType.py index 56092f261..685958935 100644 --- a/cognee/modules/engine/models/EntityType.py +++ b/cognee/modules/engine/models/EntityType.py @@ -1,12 +1,16 @@ +from typing import Union + from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models.DocumentChunk import DocumentChunk +from cognee.shared.CodeGraphEntities import Repository + class EntityType(DataPoint): __tablename__ = "entity_type" name: str type: str description: str - exists_in: DocumentChunk + exists_in: Union[DocumentChunk, Repository] _metadata: dict = { "index_fields": ["name"], } diff --git a/cognee/modules/graph/utils/__init__.py b/cognee/modules/graph/utils/__init__.py index 18e7ac29c..6fbe2ee99 100644 --- a/cognee/modules/graph/utils/__init__.py +++ b/cognee/modules/graph/utils/__init__.py @@ -1,2 +1,4 @@ +from .expand_with_nodes_and_edges import expand_with_nodes_and_edges from .get_graph_from_model import get_graph_from_model from .get_model_instance_from_graph import get_model_instance_from_graph +from .retrieve_existing_edges import retrieve_existing_edges diff --git a/cognee/modules/graph/utils/expand_with_nodes_and_edges.py b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py new file mode 100644 index 000000000..5b443d12a --- /dev/null +++ b/cognee/modules/graph/utils/expand_with_nodes_and_edges.py @@ -0,0 +1,83 @@ +from typing import Optional + +from cognee.infrastructure.engine import DataPoint +from cognee.modules.engine.models import Entity, EntityType +from cognee.modules.engine.utils import ( + generate_edge_name, + generate_node_id, + generate_node_name, +) +from cognee.shared.data_models import KnowledgeGraph + + +def expand_with_nodes_and_edges( + graph_node_index: list[tuple[DataPoint, KnowledgeGraph]], + existing_edges_map: Optional[dict[str, bool]] = None, +): + if existing_edges_map is None: + existing_edges_map = {} + + added_nodes_map = {} + relationships = [] + data_points = [] + + for graph_source, graph in graph_node_index: + if graph is None: + continue + + for node in graph.nodes: + node_id = generate_node_id(node.id) + node_name = generate_node_name(node.name) + + type_node_id = generate_node_id(node.type) + type_node_name = generate_node_name(node.type) + + if f"{str(type_node_id)}_type" not in added_nodes_map: + type_node = EntityType( + id=type_node_id, + name=type_node_name, + type=type_node_name, + description=type_node_name, + exists_in=graph_source, + ) + added_nodes_map[f"{str(type_node_id)}_type"] = type_node + else: + type_node = added_nodes_map[f"{str(type_node_id)}_type"] + + if f"{str(node_id)}_entity" not in added_nodes_map: + entity_node = Entity( + id=node_id, + name=node_name, + is_a=type_node, + description=node.description, + mentioned_in=graph_source, + ) + data_points.append(entity_node) + added_nodes_map[f"{str(node_id)}_entity"] = entity_node + + # Add relationship that came from graphs. + for edge in graph.edges: + source_node_id = generate_node_id(edge.source_node_id) + target_node_id = generate_node_id(edge.target_node_id) + relationship_name = generate_edge_name(edge.relationship_name) + + edge_key = str(source_node_id) + str(target_node_id) + relationship_name + + if edge_key not in existing_edges_map: + relationships.append( + ( + source_node_id, + target_node_id, + edge.relationship_name, + dict( + relationship_name=generate_edge_name( + edge.relationship_name + ), + source_node_id=source_node_id, + target_node_id=target_node_id, + ), + ) + ) + existing_edges_map[edge_key] = True + + return (data_points, relationships) diff --git a/cognee/modules/graph/utils/retrieve_existing_edges.py b/cognee/modules/graph/utils/retrieve_existing_edges.py new file mode 100644 index 000000000..6f237061a --- /dev/null +++ b/cognee/modules/graph/utils/retrieve_existing_edges.py @@ -0,0 +1,55 @@ +from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface +from cognee.infrastructure.engine import DataPoint +from cognee.modules.engine.utils import generate_node_id +from cognee.shared.data_models import KnowledgeGraph + + +async def retrieve_existing_edges( + graph_node_index: list[tuple[DataPoint, KnowledgeGraph]], + graph_engine: GraphDBInterface, +) -> dict[str, bool]: + processed_nodes = {} + type_node_edges = [] + entity_node_edges = [] + type_entity_edges = [] + + for graph_source, graph in graph_node_index: + for node in graph.nodes: + type_node_id = generate_node_id(node.type) + entity_node_id = generate_node_id(node.id) + + if str(type_node_id) not in processed_nodes: + type_node_edges.append( + (str(graph_source), str(type_node_id), "exists_in") + ) + processed_nodes[str(type_node_id)] = True + + if str(entity_node_id) not in processed_nodes: + entity_node_edges.append( + (str(graph_source), entity_node_id, "mentioned_in") + ) + type_entity_edges.append( + (str(entity_node_id), str(type_node_id), "is_a") + ) + processed_nodes[str(entity_node_id)] = True + + graph_node_edges = [ + (edge.target_node_id, edge.source_node_id, edge.relationship_name) + for edge in graph.edges + ] + + existing_edges = await graph_engine.has_edges( + [ + *type_node_edges, + *entity_node_edges, + *type_entity_edges, + *graph_node_edges, + ] + ) + + existing_edges_map = {} + + for edge in existing_edges: + existing_edges_map[edge[0] + edge[1] + edge[2]] = True + + return existing_edges_map diff --git a/cognee/shared/CodeGraphEntities.py b/cognee/shared/CodeGraphEntities.py new file mode 100644 index 000000000..f0061406e --- /dev/null +++ b/cognee/shared/CodeGraphEntities.py @@ -0,0 +1,23 @@ +from typing import Any, List, Literal, Optional, Union + +from cognee.infrastructure.engine import DataPoint + + +class Repository(DataPoint): + path: str + + +class CodeFile(DataPoint): + extracted_id: str # actually file path + type: str + source_code: str + + _metadata: dict = { + "index_fields": ["source_code"] + } + +class CodeRelationship(DataPoint): + source_id: str + target_id: str + type: str # between files + relation: str # depends on or depends directly diff --git a/cognee/tasks/graph/convert_graph_from_code_graph.py b/cognee/tasks/graph/convert_graph_from_code_graph.py new file mode 100644 index 000000000..91eaf660a --- /dev/null +++ b/cognee/tasks/graph/convert_graph_from_code_graph.py @@ -0,0 +1,105 @@ +from uuid import UUID, uuid4 +import os +import networkx as nx + +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.modules.graph.utils import ( + expand_with_nodes_and_edges, + retrieve_existing_edges, +) +from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository +from cognee.shared.data_models import Edge, KnowledgeGraph, Node +from cognee.tasks.storage import add_data_points + + +async def convert_graph_from_code_graph( + graph: nx.DiGraph, repo_path: str +) -> tuple[str, list[CodeFile], list[CodeRelationship]]: + + repo, nodes, edges = code_objects_from_di_graph(graph, repo_path) + + graph_engine = await get_graph_engine() + + code_knowledge_graph = build_code_knowledge_graph(nodes, edges) + repo_and_knowledge_graph = [(repo, code_knowledge_graph)] + + existing_edges_map = await retrieve_existing_edges( + repo_and_knowledge_graph, graph_engine + ) + + graph_nodes, graph_edges = expand_with_nodes_and_edges( + repo_and_knowledge_graph, existing_edges_map + ) + + if len(graph_nodes) > 0: + await add_data_points(graph_nodes) + + if len(graph_edges) > 0: + await graph_engine.add_edges(graph_edges) + + return nodes + + +def convert_node(node: CodeFile) -> Node: + return Node( + id=str(node.id), + name=node.extracted_id, + type=node.type, + description=f"{node.source_code = }", + properties={}, + ) + + +def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge: + return Edge( + source_node_id=str(extracted_ids_to_ids[edge.source_id]), + target_node_id=str(extracted_ids_to_ids[edge.target_id]), + relationship_name=f"{edge.type}_{edge.relation}", + ) + + +def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]): + extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes} + graph_nodes = [convert_node(node) for node in nodes] + graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges] + return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges) + + +def create_code_file(path, type): + abspath = os.path.abspath(path) + print(f"{path = } - {abspath = }") + with open(abspath, "r") as f: + source_code = f.read() + code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code) + return (code_file, abspath) + + +def create_code_relationship( + source_path: str, target_path: str, type: str, relation: str +): + return CodeRelationship( + source_id=source_path, target_id=target_path, type=type, relation=relation + ) + + +def code_objects_from_di_graph( + graph: nx.DiGraph, repo_path: str +) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]: + repo = Repository(path=repo_path) + + code_files = [ + create_code_file(os.path.join(repo_path, path), "python_file")[0] + for path in graph.nodes + ] + + code_relationships = [ + create_code_relationship( + os.path.join(repo_path, source), + os.path.join(repo_path, target), + "python_file", + graph.get_edge_data(source, target, v)["relation"], + ) + for source, target, v in graph.edges + ] + + return (repo, code_files, code_relationships) diff --git a/cognee/tasks/graph/extract_graph_from_data.py b/cognee/tasks/graph/extract_graph_from_data.py index 9e6edcabd..ad6ae19d2 100644 --- a/cognee/tasks/graph/extract_graph_from_data.py +++ b/cognee/tasks/graph/extract_graph_from_data.py @@ -1,119 +1,38 @@ import asyncio from typing import Type + from pydantic import BaseModel + from cognee.infrastructure.databases.graph import get_graph_engine -from cognee.modules.data.extraction.knowledge_graph import extract_content_graph from cognee.modules.chunking.models.DocumentChunk import DocumentChunk -from cognee.modules.engine.models import EntityType, Entity -from cognee.modules.engine.utils import generate_edge_name, generate_node_id, generate_node_name +from cognee.modules.data.extraction.knowledge_graph import extract_content_graph +from cognee.modules.graph.utils import ( + expand_with_nodes_and_edges, + retrieve_existing_edges, +) from cognee.tasks.storage import add_data_points -async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]): + +async def extract_graph_from_data( + data_chunks: list[DocumentChunk], graph_model: Type[BaseModel] +): chunk_graphs = await asyncio.gather( *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks] ) - - processed_nodes = {} - type_node_edges = [] - entity_node_edges = [] - type_entity_edges = [] - - for (chunk_index, chunk) in enumerate(data_chunks): - chunk_graph = chunk_graphs[chunk_index] - for node in chunk_graph.nodes: - type_node_id = generate_node_id(node.type) - entity_node_id = generate_node_id(node.id) - - if str(type_node_id) not in processed_nodes: - type_node_edges.append((str(chunk.id), str(type_node_id), "exists_in")) - processed_nodes[str(type_node_id)] = True - - if str(entity_node_id) not in processed_nodes: - entity_node_edges.append((str(chunk.id), entity_node_id, "mentioned_in")) - type_entity_edges.append((str(entity_node_id), str(type_node_id), "is_a")) - processed_nodes[str(entity_node_id)] = True - - graph_node_edges = [ - (edge.target_node_id, edge.source_node_id, edge.relationship_name) \ - for edge in chunk_graph.edges - ] - graph_engine = await get_graph_engine() + chunk_and_chunk_graphs = [ + (chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs) + ] + existing_edges_map = await retrieve_existing_edges( + chunk_and_chunk_graphs, graph_engine + ) - existing_edges = await graph_engine.has_edges([ - *type_node_edges, - *entity_node_edges, - *type_entity_edges, - *graph_node_edges, - ]) + graph_nodes, graph_edges = expand_with_nodes_and_edges( + chunk_and_chunk_graphs, existing_edges_map + ) - existing_edges_map = {} - - for edge in existing_edges: - existing_edges_map[edge[0] + edge[1] + edge[2]] = True - - added_nodes_map = {} - graph_edges = [] - data_points = [] - - for (chunk_index, chunk) in enumerate(data_chunks): - graph = chunk_graphs[chunk_index] - if graph is None: - continue - - for node in graph.nodes: - node_id = generate_node_id(node.id) - node_name = generate_node_name(node.name) - - type_node_id = generate_node_id(node.type) - type_node_name = generate_node_name(node.type) - - if f"{str(type_node_id)}_type" not in added_nodes_map: - type_node = EntityType( - id = type_node_id, - name = type_node_name, - type = type_node_name, - description = type_node_name, - exists_in = chunk, - ) - added_nodes_map[f"{str(type_node_id)}_type"] = type_node - else: - type_node = added_nodes_map[f"{str(type_node_id)}_type"] - - if f"{str(node_id)}_entity" not in added_nodes_map: - entity_node = Entity( - id = node_id, - name = node_name, - is_a = type_node, - description = node.description, - mentioned_in = chunk, - ) - data_points.append(entity_node) - added_nodes_map[f"{str(node_id)}_entity"] = entity_node - - # Add relationship that came from graphs. - for edge in graph.edges: - source_node_id = generate_node_id(edge.source_node_id) - target_node_id = generate_node_id(edge.target_node_id) - relationship_name = generate_edge_name(edge.relationship_name) - - edge_key = str(source_node_id) + str(target_node_id) + relationship_name - - if edge_key not in existing_edges_map: - graph_edges.append(( - source_node_id, - target_node_id, - edge.relationship_name, - dict( - relationship_name = generate_edge_name(edge.relationship_name), - source_node_id = source_node_id, - target_node_id = target_node_id, - ), - )) - existing_edges_map[edge_key] = True - - if len(data_points) > 0: - await add_data_points(data_points) + if len(graph_nodes) > 0: + await add_data_points(graph_nodes) if len(graph_edges) > 0: await graph_engine.add_edges(graph_edges) diff --git a/cognee/tasks/repo_processor/get_repo_dependency_graph.py b/cognee/tasks/repo_processor/get_repo_dependency_graph.py index 6a8184cd3..b36607ab8 100644 --- a/cognee/tasks/repo_processor/get_repo_dependency_graph.py +++ b/cognee/tasks/repo_processor/get_repo_dependency_graph.py @@ -55,7 +55,7 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph: if source_code is None: continue - dependencies = await get_local_script_dependencies(file_path, repo_path) + dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path) dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies] dependency_graph.add_edges_from(dependency_edges) return dependency_graph diff --git a/cognee/tasks/summarization/__init__.py b/cognee/tasks/summarization/__init__.py index cd459ac0b..a9a330cb2 100644 --- a/cognee/tasks/summarization/__init__.py +++ b/cognee/tasks/summarization/__init__.py @@ -1,2 +1,3 @@ -from .summarize_text import summarize_text from .query_summaries import query_summaries +from .summarize_code import summarize_code +from .summarize_text import summarize_text diff --git a/cognee/tasks/summarization/models.py b/cognee/tasks/summarization/models.py index 955c0e2fa..66fa4fc60 100644 --- a/cognee/tasks/summarization/models.py +++ b/cognee/tasks/summarization/models.py @@ -1,6 +1,8 @@ from cognee.infrastructure.engine import DataPoint from cognee.modules.chunking.models.DocumentChunk import DocumentChunk from cognee.modules.data.processing.document_types import Document +from cognee.shared.CodeGraphEntities import CodeFile + class TextSummary(DataPoint): __tablename__ = "text_summary" @@ -10,3 +12,12 @@ class TextSummary(DataPoint): _metadata: dict = { "index_fields": ["text"], } + + +class CodeSummary(DataPoint): + text: str + made_from: CodeFile + + _metadata: dict = { + "index_fields": ["text"], + } diff --git a/cognee/tasks/summarization/summarize_code.py b/cognee/tasks/summarization/summarize_code.py new file mode 100644 index 000000000..597bc4026 --- /dev/null +++ b/cognee/tasks/summarization/summarize_code.py @@ -0,0 +1,35 @@ +import asyncio +from typing import Type, Union +from uuid import uuid5 + +from pydantic import BaseModel + +from cognee.modules.data.extraction.extract_summary import extract_summary +from cognee.shared.CodeGraphEntities import CodeFile +from cognee.tasks.storage import add_data_points + +from .models import CodeSummary + + +async def summarize_code( + code_files: list[CodeFile], summarization_model: Type[BaseModel] +) -> list[CodeFile]: + if len(code_files) == 0: + return code_files + + file_summaries = await asyncio.gather( + *[extract_summary(file.source_code, summarization_model) for file in code_files] + ) + + summaries = [ + CodeSummary( + id=uuid5(file.id, "CodeSummary"), + made_from=file, + text=file_summaries[file_index].summary, + ) + for (file_index, file) in enumerate(code_files) + ] + + await add_data_points(summaries) + + return code_files, summaries diff --git a/cognee/tests/tasks/graph/code_graph_test_data_generation.py b/cognee/tests/tasks/graph/code_graph_test_data_generation.py new file mode 100644 index 000000000..74ca2de71 --- /dev/null +++ b/cognee/tests/tasks/graph/code_graph_test_data_generation.py @@ -0,0 +1,51 @@ +import random +import string + +import numpy as np + +from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship + + +def random_str(n, spaces=True): + candidates = string.ascii_letters + string.digits + if spaces: + candidates += " " + return "".join(random.choice(candidates) for _ in range(n)) + + +def code_graph_test_data_generation(): + nodes = [ + CodeFile( + extracted_id=random_str(10, spaces=False), + type="file", + source_code=random_str(random.randrange(50, 500)), + ) + for _ in range(100) + ] + n_nodes = len(nodes) + first_source = np.random.randint(0, n_nodes) + reached_nodes = {first_source} + last_iteration = [first_source] + edges = [] + while len(reached_nodes) < n_nodes: + for source in last_iteration: + last_iteration = [] + tries = 0 + while ((len(last_iteration) == 0 or tries < 500)) and ( + len(reached_nodes) < n_nodes + ): + tries += 1 + target = np.random.randint(n_nodes) + if target not in reached_nodes: + last_iteration.append(target) + edges.append( + CodeRelationship( + source_id=nodes[source].extracted_id, + target_id=nodes[target].extracted_id, + type="files", + relation="depends", + ) + ) + reached_nodes = reached_nodes.union(set(last_iteration)) + + return (nodes, edges) diff --git a/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py b/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py new file mode 100644 index 000000000..755840b01 --- /dev/null +++ b/cognee/tests/tasks/graph/convert_graph_from_code_graph_test.py @@ -0,0 +1,27 @@ +import asyncio + +import pytest + +from cognee.shared.CodeGraphEntities import Repository +from cognee.tasks.graph.convert_graph_from_code_graph import ( + convert_graph_from_code_graph, +) +from cognee.tests.tasks.graph.code_graph_test_data_generation import ( + code_graph_test_data_generation, +) + + +def test_convert_graph_from_code_graph(): + repo = Repository(path="test/repo/path") + nodes, edges = code_graph_test_data_generation() + repo_out, nodes_out, edges_out = asyncio.run( + convert_graph_from_code_graph(repo, nodes, edges) + ) + + assert repo == repo_out, f"{repo = } != {repo_out = }" + + for node_in, node_out in zip(nodes, nodes_out): + assert node_in == node_out, f"{node_in = } != {node_out = }" + + for edge_in, edge_out in zip(edges, edges_out): + assert edge_in == edge_out, f"{edge_in = } != {edge_out = }" diff --git a/cognee/tests/tasks/summarization/summarize_code_test.py b/cognee/tests/tasks/summarization/summarize_code_test.py new file mode 100644 index 000000000..5865a05fd --- /dev/null +++ b/cognee/tests/tasks/summarization/summarize_code_test.py @@ -0,0 +1,15 @@ +import asyncio + +from cognee.shared.data_models import SummarizedContent +from cognee.tasks.summarization import summarize_code +from cognee.tests.tasks.graph.code_graph_test_data_generation import ( + code_graph_test_data_generation, +) + + +def test_summarize_code(): + nodes, _ = code_graph_test_data_generation() + nodes_out = asyncio.run(summarize_code(nodes, SummarizedContent)) + + for node_in, node_out in zip(nodes, nodes_out): + assert node_in == node_out, f"{node_in = } != {node_out = }" diff --git a/examples/python/code_graph_pipeline.py b/examples/python/code_graph_pipeline.py new file mode 100644 index 000000000..e4fe29840 --- /dev/null +++ b/examples/python/code_graph_pipeline.py @@ -0,0 +1,64 @@ +import argparse +import asyncio +import os +import cognee +import json + +import numpy as np +from networkx.classes.digraph import DiGraph + +from cognee.modules.pipelines import Task, run_tasks +from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository +from cognee.shared.data_models import SummarizedContent +from cognee.tasks.code.get_local_dependencies_checker import ( + get_local_script_dependencies, +) +from cognee.tasks.graph.convert_graph_from_code_graph import ( + convert_graph_from_code_graph, +) +from cognee.tasks.repo_processor.get_repo_dependency_graph import ( + get_repo_dependency_graph, +) +from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph +from cognee.tasks.summarization import summarize_code +from cognee.tasks.storage import index_data_points + +async def print_results(pipeline): + async for result in pipeline: + print(result) + +async def write_results(repo, pipeline): + output_dir = os.path.join(repo, "code_pipeline_output", "") + os.makedirs(output_dir, exist_ok = True) + async for code_files, summaries in pipeline: + for summary in summaries: + file_name = os.path.split(summary.made_from.extracted_id)[-1] + relpath = os.path.join(*os.path.split(os.path.relpath(summary.made_from.extracted_id, repo))[:-1]) + output_dir2 = os.path.join(repo, "code_pipeline_output", relpath) + os.makedirs(output_dir2, exist_ok=True) + with open(os.path.join(output_dir2, file_name.replace(".py", ".json")), "w") as f: + f.write(json.dumps({"summary": summary.text, "source_code": summary.made_from.source_code})) + +async def reset_system(): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata=True) + return(True) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Process a file path") + parser.add_argument("path", help="Path to the file") + + args = parser.parse_args() + abspath = os.path.abspath(args.path) + data = abspath + tasks = [ + Task(get_repo_dependency_graph), + Task(enrich_dependency_graph), + Task(convert_graph_from_code_graph, repo_path = abspath), + Task(index_data_points), + Task(summarize_code, summarization_model=SummarizedContent), + ] + pipeline = run_tasks(tasks, data, "cognify_pipeline") + + asyncio.run(write_results(abspath, pipeline))