test: Test for code graph enrichment task

Co-authored-by: lxobr <lazar@topoteretes.com>
0xideas 2024-11-24 19:24:47 +01:00 committed by GitHub
parent 70bdaea8f7
commit 80b06c3acb
20 changed files with 527 additions and 108 deletions


@@ -56,6 +56,12 @@ jobs:
       - name: Run integration tests
         run: poetry run pytest cognee/tests/integration/
+      - name: Run summarize_code test
+        run: poetry run pytest cognee/tests/tasks/summarization/summarize_code_test.py
+        env:
+          ENV: 'dev'
+          LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+
       - name: Run default basic pipeline
         env:
           ENV: 'dev'


.gitignore

@@ -4,6 +4,8 @@
 .prod.env
 cognee/.data/
+code_pipeline_output*/
+*.lance/
 .DS_Store

 # Byte-compiled / optimized / DLL files

cognee/modules/engine/models/Entity.py

@@ -1,13 +1,17 @@
+from typing import Union
+
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
-from .EntityType import EntityType
+from cognee.modules.engine.models.EntityType import EntityType
+from cognee.shared.CodeGraphEntities import Repository


 class Entity(DataPoint):
     __tablename__ = "entity"
     name: str
     is_a: EntityType
     description: str
-    mentioned_in: DocumentChunk
+    mentioned_in: Union[DocumentChunk, Repository]

     _metadata: dict = {
         "index_fields": ["name"],
     }

cognee/modules/engine/models/EntityType.py

@@ -1,12 +1,16 @@
+from typing import Union
+
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
+from cognee.shared.CodeGraphEntities import Repository


 class EntityType(DataPoint):
     __tablename__ = "entity_type"
     name: str
     type: str
     description: str
-    exists_in: DocumentChunk
+    exists_in: Union[DocumentChunk, Repository]

     _metadata: dict = {
         "index_fields": ["name"],
     }
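With these two changes, extracted entities and entity types can be anchored to a Repository rather than a DocumentChunk. A minimal construction sketch, assuming DataPoint assigns an id by default (the field values here are illustrative, not from this commit):

from cognee.modules.engine.models import Entity, EntityType
from cognee.shared.CodeGraphEntities import Repository

repo = Repository(path="/abs/path/to/repo")  # illustrative path
file_type = EntityType(
    name="python_file",
    type="python_file",
    description="python_file",
    exists_in=repo,
)
entity = Entity(
    name="main.py",
    is_a=file_type,
    description="repository entry point",  # illustrative
    mentioned_in=repo,
)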

cognee/modules/graph/utils/__init__.py

@@ -1,2 +1,4 @@
+from .expand_with_nodes_and_edges import expand_with_nodes_and_edges
 from .get_graph_from_model import get_graph_from_model
 from .get_model_instance_from_graph import get_model_instance_from_graph
+from .retrieve_existing_edges import retrieve_existing_edges

cognee/modules/graph/utils/expand_with_nodes_and_edges.py

@@ -0,0 +1,83 @@
from typing import Optional

from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.models import Entity, EntityType
from cognee.modules.engine.utils import (
    generate_edge_name,
    generate_node_id,
    generate_node_name,
)
from cognee.shared.data_models import KnowledgeGraph


def expand_with_nodes_and_edges(
    graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
    existing_edges_map: Optional[dict[str, bool]] = None,
):
    if existing_edges_map is None:
        existing_edges_map = {}

    added_nodes_map = {}
    relationships = []
    data_points = []

    for graph_source, graph in graph_node_index:
        if graph is None:
            continue

        for node in graph.nodes:
            node_id = generate_node_id(node.id)
            node_name = generate_node_name(node.name)

            type_node_id = generate_node_id(node.type)
            type_node_name = generate_node_name(node.type)

            if f"{str(type_node_id)}_type" not in added_nodes_map:
                type_node = EntityType(
                    id=type_node_id,
                    name=type_node_name,
                    type=type_node_name,
                    description=type_node_name,
                    exists_in=graph_source,
                )
                added_nodes_map[f"{str(type_node_id)}_type"] = type_node
            else:
                type_node = added_nodes_map[f"{str(type_node_id)}_type"]

            if f"{str(node_id)}_entity" not in added_nodes_map:
                entity_node = Entity(
                    id=node_id,
                    name=node_name,
                    is_a=type_node,
                    description=node.description,
                    mentioned_in=graph_source,
                )
                data_points.append(entity_node)
                added_nodes_map[f"{str(node_id)}_entity"] = entity_node

        # Add relationships that came from the extracted graphs.
        for edge in graph.edges:
            source_node_id = generate_node_id(edge.source_node_id)
            target_node_id = generate_node_id(edge.target_node_id)
            relationship_name = generate_edge_name(edge.relationship_name)

            edge_key = str(source_node_id) + str(target_node_id) + relationship_name

            if edge_key not in existing_edges_map:
                relationships.append(
                    (
                        source_node_id,
                        target_node_id,
                        edge.relationship_name,
                        dict(
                            relationship_name=generate_edge_name(edge.relationship_name),
                            source_node_id=source_node_id,
                            target_node_id=target_node_id,
                        ),
                    )
                )
                existing_edges_map[edge_key] = True

    return (data_points, relationships)

cognee/modules/graph/utils/retrieve_existing_edges.py

@@ -0,0 +1,55 @@
from cognee.infrastructure.databases.graph.graph_db_interface import GraphDBInterface
from cognee.infrastructure.engine import DataPoint
from cognee.modules.engine.utils import generate_node_id
from cognee.shared.data_models import KnowledgeGraph


async def retrieve_existing_edges(
    graph_node_index: list[tuple[DataPoint, KnowledgeGraph]],
    graph_engine: GraphDBInterface,
) -> dict[str, bool]:
    processed_nodes = {}
    type_node_edges = []
    entity_node_edges = []
    type_entity_edges = []

    for graph_source, graph in graph_node_index:
        for node in graph.nodes:
            type_node_id = generate_node_id(node.type)
            entity_node_id = generate_node_id(node.id)

            if str(type_node_id) not in processed_nodes:
                type_node_edges.append(
                    (str(graph_source), str(type_node_id), "exists_in")
                )
                processed_nodes[str(type_node_id)] = True

            if str(entity_node_id) not in processed_nodes:
                entity_node_edges.append(
                    (str(graph_source), entity_node_id, "mentioned_in")
                )
                type_entity_edges.append(
                    (str(entity_node_id), str(type_node_id), "is_a")
                )
                processed_nodes[str(entity_node_id)] = True

        graph_node_edges = [
            (edge.target_node_id, edge.source_node_id, edge.relationship_name)
            for edge in graph.edges
        ]

    existing_edges = await graph_engine.has_edges(
        [
            *type_node_edges,
            *entity_node_edges,
            *type_entity_edges,
            *graph_node_edges,
        ]
    )

    existing_edges_map = {}
    for edge in existing_edges:
        existing_edges_map[edge[0] + edge[1] + edge[2]] = True

    return existing_edges_map
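Taken together with expand_with_nodes_and_edges above, the intended calling pattern looks roughly like this (a sketch mirroring how extract_graph_from_data composes the two helpers further down; `chunks` and `graphs` are assumed inputs, and the code runs inside an async function):

graph_engine = await get_graph_engine()
pairs = [(chunk, graph) for chunk, graph in zip(chunks, graphs)]

existing_edges_map = await retrieve_existing_edges(pairs, graph_engine)
nodes, edges = expand_with_nodes_and_edges(pairs, existing_edges_map)

if len(nodes) > 0:
    await add_data_points(nodes)
if len(edges) > 0:
    await graph_engine.add_edges(edges)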

cognee/shared/CodeGraphEntities.py

@@ -0,0 +1,23 @@
from typing import Any, List, Literal, Optional, Union

from cognee.infrastructure.engine import DataPoint


class Repository(DataPoint):
    path: str


class CodeFile(DataPoint):
    extracted_id: str  # actually the file path
    type: str
    source_code: str

    _metadata: dict = {
        "index_fields": ["source_code"],
    }


class CodeRelationship(DataPoint):
    source_id: str
    target_id: str
    type: str  # file type shared by source and target, e.g. "python_file"
    relation: str  # "depends on" or "depends directly"

cognee/tasks/graph/convert_graph_from_code_graph.py

@@ -0,0 +1,105 @@
from uuid import UUID, uuid4
import os

import networkx as nx

from cognee.infrastructure.databases.graph import get_graph_engine
from cognee.modules.graph.utils import (
    expand_with_nodes_and_edges,
    retrieve_existing_edges,
)
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
from cognee.shared.data_models import Edge, KnowledgeGraph, Node
from cognee.tasks.storage import add_data_points


async def convert_graph_from_code_graph(
    graph: nx.DiGraph, repo_path: str
) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]:
    repo, nodes, edges = code_objects_from_di_graph(graph, repo_path)

    graph_engine = await get_graph_engine()

    code_knowledge_graph = build_code_knowledge_graph(nodes, edges)
    repo_and_knowledge_graph = [(repo, code_knowledge_graph)]

    existing_edges_map = await retrieve_existing_edges(
        repo_and_knowledge_graph, graph_engine
    )

    graph_nodes, graph_edges = expand_with_nodes_and_edges(
        repo_and_knowledge_graph, existing_edges_map
    )

    if len(graph_nodes) > 0:
        await add_data_points(graph_nodes)

    if len(graph_edges) > 0:
        await graph_engine.add_edges(graph_edges)

    return (repo, nodes, edges)


def convert_node(node: CodeFile) -> Node:
    return Node(
        id=str(node.id),
        name=node.extracted_id,
        type=node.type,
        description=f"{node.source_code = }",
        properties={},
    )


def convert_edge(edge: CodeRelationship, extracted_ids_to_ids: dict[str, UUID]) -> Edge:
    return Edge(
        source_node_id=str(extracted_ids_to_ids[edge.source_id]),
        target_node_id=str(extracted_ids_to_ids[edge.target_id]),
        relationship_name=f"{edge.type}_{edge.relation}",
    )


def build_code_knowledge_graph(nodes: list[CodeFile], edges: list[CodeRelationship]):
    extracted_ids_to_ids = {node.extracted_id: node.id for node in nodes}
    graph_nodes = [convert_node(node) for node in nodes]
    graph_edges = [convert_edge(edge, extracted_ids_to_ids) for edge in edges]
    return KnowledgeGraph(nodes=graph_nodes, edges=graph_edges)


def create_code_file(path, type):
    abspath = os.path.abspath(path)
    print(f"{path = } - {abspath = }")
    with open(abspath, "r") as f:
        source_code = f.read()
    code_file = CodeFile(extracted_id=abspath, type=type, source_code=source_code)
    return (code_file, abspath)


def create_code_relationship(
    source_path: str, target_path: str, type: str, relation: str
):
    return CodeRelationship(
        source_id=source_path, target_id=target_path, type=type, relation=relation
    )


def code_objects_from_di_graph(
    graph: nx.DiGraph, repo_path: str
) -> tuple[Repository, list[CodeFile], list[CodeRelationship]]:
    repo = Repository(path=repo_path)

    code_files = [
        create_code_file(os.path.join(repo_path, path), "python_file")[0]
        for path in graph.nodes
    ]

    code_relationships = [
        create_code_relationship(
            os.path.join(repo_path, source),
            os.path.join(repo_path, target),
            "python_file",
            graph.get_edge_data(source, target, v)["relation"],
        )
        for source, target, v in graph.edges
    ]

    return (repo, code_files, code_relationships)
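To see what build_code_knowledge_graph produces, consider two files where b.py imports a.py. A hypothetical sketch; the relation string is illustrative (in the pipeline it comes from the enriched dependency graph's edge data), and CodeFile ids are assumed to be assigned by DataPoint:

a = CodeFile(extracted_id="/repo/a.py", type="python_file", source_code="A = 1")
b = CodeFile(extracted_id="/repo/b.py", type="python_file", source_code="from a import A")

dep = CodeRelationship(
    source_id=b.extracted_id,
    target_id=a.extracted_id,
    type="python_file",
    relation="depends_on",  # illustrative value
)

kg = build_code_knowledge_graph([a, b], [dep])
# kg.edges[0].relationship_name == "python_file_depends_on"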

cognee/tasks/graph/extract_graph_from_data.py

@@ -1,119 +1,38 @@
 import asyncio
 from typing import Type

 from pydantic import BaseModel

 from cognee.infrastructure.databases.graph import get_graph_engine
-from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
-from cognee.modules.engine.models import EntityType, Entity
-from cognee.modules.engine.utils import generate_edge_name, generate_node_id, generate_node_name
+from cognee.modules.data.extraction.knowledge_graph import extract_content_graph
+from cognee.modules.graph.utils import (
+    expand_with_nodes_and_edges,
+    retrieve_existing_edges,
+)
 from cognee.tasks.storage import add_data_points


-async def extract_graph_from_data(data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]):
+async def extract_graph_from_data(
+    data_chunks: list[DocumentChunk], graph_model: Type[BaseModel]
+):
     chunk_graphs = await asyncio.gather(
         *[extract_content_graph(chunk.text, graph_model) for chunk in data_chunks]
     )

-    processed_nodes = {}
-    type_node_edges = []
-    entity_node_edges = []
-    type_entity_edges = []
-
-    for (chunk_index, chunk) in enumerate(data_chunks):
-        chunk_graph = chunk_graphs[chunk_index]
-        for node in chunk_graph.nodes:
-            type_node_id = generate_node_id(node.type)
-            entity_node_id = generate_node_id(node.id)
-
-            if str(type_node_id) not in processed_nodes:
-                type_node_edges.append((str(chunk.id), str(type_node_id), "exists_in"))
-                processed_nodes[str(type_node_id)] = True
-
-            if str(entity_node_id) not in processed_nodes:
-                entity_node_edges.append((str(chunk.id), entity_node_id, "mentioned_in"))
-                type_entity_edges.append((str(entity_node_id), str(type_node_id), "is_a"))
-                processed_nodes[str(entity_node_id)] = True
-
-        graph_node_edges = [
-            (edge.target_node_id, edge.source_node_id, edge.relationship_name) \
-                for edge in chunk_graph.edges
-        ]
-
     graph_engine = await get_graph_engine()

+    chunk_and_chunk_graphs = [
+        (chunk, chunk_graph) for chunk, chunk_graph in zip(data_chunks, chunk_graphs)
+    ]
+
+    existing_edges_map = await retrieve_existing_edges(
+        chunk_and_chunk_graphs, graph_engine
+    )
-    existing_edges = await graph_engine.has_edges([
-        *type_node_edges,
-        *entity_node_edges,
-        *type_entity_edges,
-        *graph_node_edges,
-    ])
+    graph_nodes, graph_edges = expand_with_nodes_and_edges(
+        chunk_and_chunk_graphs, existing_edges_map
+    )
-    existing_edges_map = {}
-
-    for edge in existing_edges:
-        existing_edges_map[edge[0] + edge[1] + edge[2]] = True
-
-    added_nodes_map = {}
-    graph_edges = []
-    data_points = []
-
-    for (chunk_index, chunk) in enumerate(data_chunks):
-        graph = chunk_graphs[chunk_index]
-        if graph is None:
-            continue
-
-        for node in graph.nodes:
-            node_id = generate_node_id(node.id)
-            node_name = generate_node_name(node.name)
-
-            type_node_id = generate_node_id(node.type)
-            type_node_name = generate_node_name(node.type)
-
-            if f"{str(type_node_id)}_type" not in added_nodes_map:
-                type_node = EntityType(
-                    id = type_node_id,
-                    name = type_node_name,
-                    type = type_node_name,
-                    description = type_node_name,
-                    exists_in = chunk,
-                )
-                added_nodes_map[f"{str(type_node_id)}_type"] = type_node
-            else:
-                type_node = added_nodes_map[f"{str(type_node_id)}_type"]
-
-            if f"{str(node_id)}_entity" not in added_nodes_map:
-                entity_node = Entity(
-                    id = node_id,
-                    name = node_name,
-                    is_a = type_node,
-                    description = node.description,
-                    mentioned_in = chunk,
-                )
-                data_points.append(entity_node)
-                added_nodes_map[f"{str(node_id)}_entity"] = entity_node
-
-        # Add relationship that came from graphs.
-        for edge in graph.edges:
-            source_node_id = generate_node_id(edge.source_node_id)
-            target_node_id = generate_node_id(edge.target_node_id)
-            relationship_name = generate_edge_name(edge.relationship_name)
-
-            edge_key = str(source_node_id) + str(target_node_id) + relationship_name
-
-            if edge_key not in existing_edges_map:
-                graph_edges.append((
-                    source_node_id,
-                    target_node_id,
-                    edge.relationship_name,
-                    dict(
-                        relationship_name = generate_edge_name(edge.relationship_name),
-                        source_node_id = source_node_id,
-                        target_node_id = target_node_id,
-                    ),
-                ))
-
-                existing_edges_map[edge_key] = True
-
-    if len(data_points) > 0:
-        await add_data_points(data_points)
+    if len(graph_nodes) > 0:
+        await add_data_points(graph_nodes)

     if len(graph_edges) > 0:
         await graph_engine.add_edges(graph_edges)

cognee/tasks/repo_processor/get_repo_dependency_graph.py

@@ -55,7 +55,7 @@ async def get_repo_dependency_graph(repo_path: str) -> nx.DiGraph:
         if source_code is None:
             continue

-        dependencies = await get_local_script_dependencies(file_path, repo_path)
+        dependencies = await get_local_script_dependencies(os.path.join(repo_path, file_path), repo_path)

         dependency_edges = [get_edge(file_path, dependency, repo_path) for dependency in dependencies]
         dependency_graph.add_edges_from(dependency_edges)

     return dependency_graph
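The one-line fix matters because file_path is repo-relative while the dependency scanner needs a resolvable path. A quick illustration with hypothetical paths:

import os

repo_path = "/home/user/repo"       # hypothetical
file_path = "pkg/module.py"         # repo-relative, as stored in the graph
os.path.join(repo_path, file_path)  # '/home/user/repo/pkg/module.py'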

cognee/tasks/summarization/__init__.py

@@ -1,2 +1,3 @@
-from .summarize_text import summarize_text
 from .query_summaries import query_summaries
+from .summarize_code import summarize_code
+from .summarize_text import summarize_text

cognee/tasks/summarization/models.py

@@ -1,6 +1,8 @@
 from cognee.infrastructure.engine import DataPoint
 from cognee.modules.chunking.models.DocumentChunk import DocumentChunk
 from cognee.modules.data.processing.document_types import Document
+from cognee.shared.CodeGraphEntities import CodeFile


 class TextSummary(DataPoint):
     __tablename__ = "text_summary"
@@ -10,3 +12,12 @@ class TextSummary(DataPoint):
     _metadata: dict = {
         "index_fields": ["text"],
     }
+
+
+class CodeSummary(DataPoint):
+    text: str
+    made_from: CodeFile
+
+    _metadata: dict = {
+        "index_fields": ["text"],
+    }

cognee/tasks/summarization/summarize_code.py

@@ -0,0 +1,35 @@
import asyncio
from typing import Type, Union
from uuid import uuid5

from pydantic import BaseModel

from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.tasks.storage import add_data_points

from .models import CodeSummary


async def summarize_code(
    code_files: list[CodeFile], summarization_model: Type[BaseModel]
) -> tuple[list[CodeFile], list[CodeSummary]]:
    if len(code_files) == 0:
        return (code_files, [])

    file_summaries = await asyncio.gather(
        *[extract_summary(file.source_code, summarization_model) for file in code_files]
    )

    summaries = [
        CodeSummary(
            id=uuid5(file.id, "CodeSummary"),
            made_from=file,
            text=file_summaries[file_index].summary,
        )
        for (file_index, file) in enumerate(code_files)
    ]

    await add_data_points(summaries)

    return code_files, summaries
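Because the summary id is derived via uuid5 from the file's id, re-summarizing the same CodeFile yields the same CodeSummary id, which keeps ids stable across runs. A small sketch (the UUID is a stand-in for file.id):

from uuid import UUID, uuid5

file_id = UUID("12345678-1234-5678-1234-567812345678")  # stand-in for file.id
assert uuid5(file_id, "CodeSummary") == uuid5(file_id, "CodeSummary")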

cognee/tests/tasks/graph/code_graph_test_data_generation.py

@@ -0,0 +1,51 @@
import random
import string

import numpy as np

from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship


def random_str(n, spaces=True):
    candidates = string.ascii_letters + string.digits
    if spaces:
        candidates += " "
    return "".join(random.choice(candidates) for _ in range(n))


def code_graph_test_data_generation():
    nodes = [
        CodeFile(
            extracted_id=random_str(10, spaces=False),
            type="file",
            source_code=random_str(random.randrange(50, 500)),
        )
        for _ in range(100)
    ]

    n_nodes = len(nodes)
    first_source = np.random.randint(0, n_nodes)
    reached_nodes = {first_source}
    last_iteration = [first_source]
    edges = []

    while len(reached_nodes) < n_nodes:
        for source in last_iteration:
            last_iteration = []
            tries = 0
            while (len(last_iteration) == 0 or tries < 500) and (
                len(reached_nodes) < n_nodes
            ):
                tries += 1
                target = np.random.randint(n_nodes)
                if target not in reached_nodes:
                    last_iteration.append(target)
                    edges.append(
                        CodeRelationship(
                            source_id=nodes[source].extracted_id,
                            target_id=nodes[target].extracted_id,
                            type="files",
                            relation="depends",
                        )
                    )
        reached_nodes = reached_nodes.union(set(last_iteration))

    return (nodes, edges)
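The generator keeps sampling targets until every node has been reached, so the resulting relationship graph should connect all 100 files. A sanity check one could run against its output (illustrative, using networkx, which the codebase already depends on):

import networkx as nx

nodes, edges = code_graph_test_data_generation()

g = nx.DiGraph()
g.add_nodes_from(node.extracted_id for node in nodes)
g.add_edges_from((edge.source_id, edge.target_id) for edge in edges)

assert nx.is_weakly_connected(g)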


@@ -0,0 +1,27 @@
import asyncio

import pytest

from cognee.shared.CodeGraphEntities import Repository
from cognee.tasks.graph.convert_graph_from_code_graph import (
    convert_graph_from_code_graph,
)
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
    code_graph_test_data_generation,
)


def test_convert_graph_from_code_graph():
    repo = Repository(path="test/repo/path")
    nodes, edges = code_graph_test_data_generation()

    repo_out, nodes_out, edges_out = asyncio.run(
        convert_graph_from_code_graph(repo, nodes, edges)
    )

    assert repo == repo_out, f"{repo = } != {repo_out = }"

    for node_in, node_out in zip(nodes, nodes_out):
        assert node_in == node_out, f"{node_in = } != {node_out = }"

    for edge_in, edge_out in zip(edges, edges_out):
        assert edge_in == edge_out, f"{edge_in = } != {edge_out = }"

cognee/tests/tasks/summarization/summarize_code_test.py

@@ -0,0 +1,15 @@
import asyncio

from cognee.shared.data_models import SummarizedContent
from cognee.tasks.summarization import summarize_code
from cognee.tests.tasks.graph.code_graph_test_data_generation import (
    code_graph_test_data_generation,
)


def test_summarize_code():
    nodes, _ = code_graph_test_data_generation()

    nodes_out, _ = asyncio.run(summarize_code(nodes, SummarizedContent))

    for node_in, node_out in zip(nodes, nodes_out):
        assert node_in == node_out, f"{node_in = } != {node_out = }"


@@ -0,0 +1,64 @@
import argparse
import asyncio
import os
import json

import cognee
import numpy as np
from networkx.classes.digraph import DiGraph

from cognee.modules.pipelines import Task, run_tasks
from cognee.shared.CodeGraphEntities import CodeFile, CodeRelationship, Repository
from cognee.shared.data_models import SummarizedContent
from cognee.tasks.code.get_local_dependencies_checker import (
    get_local_script_dependencies,
)
from cognee.tasks.graph.convert_graph_from_code_graph import (
    convert_graph_from_code_graph,
)
from cognee.tasks.repo_processor.get_repo_dependency_graph import (
    get_repo_dependency_graph,
)
from cognee.tasks.repo_processor.enrich_dependency_graph import enrich_dependency_graph
from cognee.tasks.summarization import summarize_code
from cognee.tasks.storage import index_data_points


async def print_results(pipeline):
    async for result in pipeline:
        print(result)


async def write_results(repo, pipeline):
    output_dir = os.path.join(repo, "code_pipeline_output", "")
    os.makedirs(output_dir, exist_ok=True)

    async for code_files, summaries in pipeline:
        for summary in summaries:
            file_name = os.path.split(summary.made_from.extracted_id)[-1]
            relpath = os.path.join(
                *os.path.split(os.path.relpath(summary.made_from.extracted_id, repo))[:-1]
            )
            output_dir2 = os.path.join(repo, "code_pipeline_output", relpath)
            os.makedirs(output_dir2, exist_ok=True)
            with open(os.path.join(output_dir2, file_name.replace(".py", ".json")), "w") as f:
                f.write(json.dumps({
                    "summary": summary.text,
                    "source_code": summary.made_from.source_code,
                }))


async def reset_system():
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process a file path")
    parser.add_argument("path", help="Path to the file")
    args = parser.parse_args()

    abspath = os.path.abspath(args.path)
    data = abspath

    tasks = [
        Task(get_repo_dependency_graph),
        Task(enrich_dependency_graph),
        Task(convert_graph_from_code_graph, repo_path=abspath),
        Task(index_data_points),
        Task(summarize_code, summarization_model=SummarizedContent),
    ]

    pipeline = run_tasks(tasks, data, "cognify_pipeline")

    asyncio.run(write_results(abspath, pipeline))