Merge branch 'dev' into COG-912-search-by-dataset

This commit is contained in:
Igor Ilic 2024-12-17 13:22:13 +01:00 committed by GitHub
commit 8b09358552
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 185 additions and 98 deletions

View file

@ -1,22 +1,35 @@
# NOTICE: This module contains deprecated functions.
# Use only the run_code_graph_pipeline function; all other functions are deprecated.
# Related issue: COG-906
import asyncio
import logging
from pathlib import Path
from typing import Union
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.models import Data, Dataset
from cognee.modules.pipelines import run_tasks
from cognee.modules.pipelines.models import PipelineRunStatus
from cognee.modules.pipelines.operations.get_pipeline_status import \
get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import \
log_pipeline_status
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.users.methods import get_default_user
from cognee.modules.users.models import User
from cognee.shared.SourceCodeGraph import SourceCodeGraph
from cognee.shared.utils import send_telemetry
from cognee.modules.data.models import Dataset, Data
from cognee.modules.data.methods.get_dataset_data import get_dataset_data
from cognee.modules.data.methods import get_datasets, get_datasets_by_name
from cognee.modules.pipelines.tasks.Task import Task
from cognee.modules.pipelines import run_tasks
from cognee.modules.users.models import User
from cognee.modules.users.methods import get_default_user
from cognee.modules.pipelines.models import PipelineRunStatus
from cognee.modules.pipelines.operations.get_pipeline_status import get_pipeline_status
from cognee.modules.pipelines.operations.log_pipeline_status import log_pipeline_status
from cognee.tasks.documents import classify_documents, check_permissions_on_documents, extract_chunks_from_documents
from cognee.tasks.documents import (check_permissions_on_documents,
classify_documents,
extract_chunks_from_documents)
from cognee.tasks.graph import extract_graph_from_code
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
from cognee.tasks.summarization import summarize_code
logger = logging.getLogger("code_graph_pipeline")
@ -51,6 +64,7 @@ async def code_graph_pipeline(datasets: Union[str, list[str]] = None, user: User
async def run_pipeline(dataset: Dataset, user: User):
'''DEPRECATED: Use `run_code_graph_pipeline` instead. This function will be removed.'''
data_documents: list[Data] = await get_dataset_data(dataset_id = dataset.id)
document_ids_str = [str(document.id) for document in data_documents]
@ -103,3 +117,30 @@ async def run_pipeline(dataset: Dataset, user: User):
def generate_dataset_name(dataset_name: str) -> str:
    """Normalize a dataset name by replacing dots and spaces with underscores."""
    normalization_table = str.maketrans({".": "_", " ": "_"})
    return dataset_name.translate(normalization_table)
async def run_code_graph_pipeline(repo_path):
    """Build the code knowledge graph for *repo_path* and return the running pipeline.

    NOTE(review): this is destructive — it prunes all previously stored data and
    system metadata before rebuilding, so every call starts from a clean slate.
    The return value is whatever `run_tasks` produces (presumably an async
    iterable of pipeline results — callers iterate it; confirm against run_tasks).
    """
    # Local imports keep the heavy cognee package out of module import time.
    import os
    import pathlib
    import cognee
    from cognee.infrastructure.databases.relational import create_db_and_tables
    # Anchor data/system storage in sub-directories next to this module.
    file_path = Path(__file__).parent
    data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
    cognee.config.data_root_directory(data_directory_path)
    cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
    cognee.config.system_root_directory(cognee_directory_path)
    # Wipe prior state, then recreate the relational schema.
    await cognee.prune.prune_data()
    await cognee.prune.prune_system(metadata=True)
    await create_db_and_tables()
    # Pipeline stages run in order; batch_size throttles each batched task.
    tasks = [
        Task(get_repo_file_dependencies),
        Task(enrich_dependency_graph, task_config={"batch_size": 50}),
        Task(expand_dependency_graph, task_config={"batch_size": 50}),
        Task(summarize_code, task_config={"batch_size": 50}),
        Task(add_data_points, task_config={"batch_size": 50}),
    ]
    return run_tasks(tasks, repo_path, "cognify_code_pipeline")

View file

@ -0,0 +1,10 @@
You are an expert Python programmer and technical writer. Your task is to summarize the given Python code snippet or file.
The code may contain multiple imports, classes, functions, constants and logic. Provide a clear, structured explanation of its components
and their relationships.
Instructions:
Provide an overview: Start with a high-level summary of what the code does as a whole.
Break it down: Summarize each class and function individually, explaining their purpose and how they interact.
Describe the workflow: Outline how the classes and functions work together. Mention any control flow (e.g., main functions, entry points, loops).
Key features: Highlight important elements like arguments, return values, or unique logic.
Maintain clarity: Write in plain English for someone familiar with Python but unfamiliar with this code.

View file

@ -1,7 +1,11 @@
from typing import Type
from pydantic import BaseModel
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.shared.data_models import SummarizedCode
async def extract_summary(content: str, response_model: Type[BaseModel]):
llm_client = get_llm_client()
@ -11,3 +15,7 @@ async def extract_summary(content: str, response_model: Type[BaseModel]):
llm_output = await llm_client.acreate_structured_output(content, system_prompt, response_model)
return llm_output
async def extract_code_summary(content: str):
    """Summarize source code into the structured SummarizedCode model."""
    summary = await extract_summary(content, response_model=SummarizedCode)
    return summary

View file

@ -1,6 +1,6 @@
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from .ChunkerMapping import ChunkerConfig
class AudioDocument(Document):
type: str = "audio"
@ -9,11 +9,12 @@ class AudioDocument(Document):
result = get_llm_client().create_transcript(self.raw_data_location)
return(result.text)
def read(self, chunk_size: int):
def read(self, chunk_size: int, chunker: str):
# Transcribe the audio file
text = self.create_transcript()
chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
yield from chunker.read()

View file

@ -0,0 +1,15 @@
from cognee.modules.chunking.TextChunker import TextChunker
class ChunkerConfig:
    """Registry that resolves chunker names to their implementing classes."""

    # Known chunker implementations, keyed by their public registry name.
    chunker_mapping = {"text_chunker": TextChunker}

    @classmethod
    def get_chunker(cls, chunker_name: str):
        """Return the chunker class registered under *chunker_name*.

        Raises:
            NotImplementedError: when no chunker is registered for the name.
        """
        resolved = cls.chunker_mapping.get(chunker_name)
        if resolved is not None:
            return resolved
        raise NotImplementedError(
            f"Chunker '{chunker_name}' is not implemented. Available options: {list(cls.chunker_mapping.keys())}"
        )

View file

@ -13,5 +13,5 @@ class Document(DataPoint):
"type": "Document"
}
def read(self, chunk_size: int) -> str:
pass
def read(self, chunk_size: int, chunker: str = "text_chunker") -> str:
    """Read the document as a stream of chunks (abstract hook).

    Concrete Document subclasses override this to yield chunks of the
    underlying content.

    Args:
        chunk_size: Maximum size of each produced chunk.
        chunker: Registry name of the chunker implementation to use
            (resolved via ChunkerConfig by subclasses).
    """
    # BUG FIX: the original signature read `chunker = str`, which makes the
    # builtin *type* `str` the default value instead of annotating the
    # parameter. Every subclass declares `chunker: str`; the base now matches,
    # and the default keeps existing chunker-less callers working.
    pass

View file

@ -1,6 +1,6 @@
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from .ChunkerMapping import ChunkerConfig
class ImageDocument(Document):
type: str = "image"
@ -10,10 +10,11 @@ class ImageDocument(Document):
result = get_llm_client().transcribe_image(self.raw_data_location)
return(result.choices[0].message.content)
def read(self, chunk_size: int):
def read(self, chunk_size: int, chunker: str):
# Transcribe the image file
text = self.transcribe_image()
chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
yield from chunker.read()

View file

@ -1,11 +1,11 @@
from pypdf import PdfReader
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from .ChunkerMapping import ChunkerConfig
class PdfDocument(Document):
type: str = "pdf"
def read(self, chunk_size: int):
def read(self, chunk_size: int, chunker: str):
file = PdfReader(self.raw_data_location)
def get_text():
@ -13,7 +13,8 @@ class PdfDocument(Document):
page_text = page.extract_text()
yield page_text
chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
yield from chunker.read()

View file

@ -1,10 +1,10 @@
from cognee.modules.chunking.TextChunker import TextChunker
from .Document import Document
from .ChunkerMapping import ChunkerConfig
class TextDocument(Document):
type: str = "text"
def read(self, chunk_size: int):
def read(self, chunk_size: int, chunker: str):
def get_text():
with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
while True:
@ -15,6 +15,8 @@ class TextDocument(Document):
yield text
chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
chunker_func = ChunkerConfig.get_chunker(chunker)
chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
yield from chunker.read()

View file

@ -1,9 +1,11 @@
"""Data models for the cognitive architecture."""
from enum import Enum, auto
from typing import Optional, List, Union, Dict, Any
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field
class Node(BaseModel):
"""Node in a knowledge graph."""
id: str
@ -194,6 +196,29 @@ class SummarizedContent(BaseModel):
summary: str
description: str
class SummarizedFunction(BaseModel):
    """LLM-produced summary of a single function or method."""
    name: str
    description: str
    # Optional signature/decorator details; None when the LLM omits them.
    inputs: Optional[List[str]] = None
    outputs: Optional[List[str]] = None
    decorators: Optional[List[str]] = None
class SummarizedClass(BaseModel):
    """LLM-produced summary of a single class, including its methods."""
    name: str
    description: str
    methods: Optional[List[SummarizedFunction]] = None
    decorators: Optional[List[str]] = None
class SummarizedCode(BaseModel):
    """Structured summary of one source file, the response model for code summarization."""
    file_name: str
    high_level_summary: str
    key_features: List[str]
    # Mutable defaults are safe here: pydantic deep-copies field defaults per instance.
    imports: List[str] = []
    constants: List[str] = []
    classes: List[SummarizedClass] = []
    functions: List[SummarizedFunction] = []
    workflow_description: Optional[str] = None
class GraphDBType(Enum):
NETWORKX = auto()

View file

@ -1,7 +1,7 @@
from cognee.modules.data.processing.document_types.Document import Document
async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024):
async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'):
for document in documents:
for document_chunk in document.read(chunk_size = chunk_size):
for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker):
yield document_chunk

View file

@ -1,39 +1,40 @@
import asyncio
from typing import Type
from typing import AsyncGenerator, Union
from uuid import uuid5
from pydantic import BaseModel
from typing import Type
from cognee.infrastructure.engine import DataPoint
from cognee.modules.data.extraction.extract_summary import extract_summary
from cognee.shared.CodeGraphEntities import CodeFile
from cognee.tasks.storage import add_data_points
from cognee.modules.data.extraction.extract_summary import extract_code_summary
from .models import CodeSummary
async def summarize_code(
code_files: list[DataPoint],
summarization_model: Type[BaseModel],
) -> list[DataPoint]:
if len(code_files) == 0:
return code_files
code_graph_nodes: list[DataPoint],
) -> AsyncGenerator[Union[DataPoint, CodeSummary], None]:
if len(code_graph_nodes) == 0:
return
code_files_data_points = [file for file in code_files if isinstance(file, CodeFile)]
code_data_points = [file for file in code_graph_nodes if hasattr(file, "source_code")]
file_summaries = await asyncio.gather(
*[extract_summary(file.source_code, summarization_model) for file in code_files_data_points]
*[extract_code_summary(file.source_code) for file in code_data_points]
)
summaries = [
CodeSummary(
id = uuid5(file.id, "CodeSummary"),
made_from = file,
text = file_summaries[file_index].summary,
file_summaries_map = {
code_data_point.extracted_id: str(file_summary)
for code_data_point, file_summary in zip(code_data_points, file_summaries)
}
for node in code_graph_nodes:
if not isinstance(node, DataPoint):
continue
yield node
if not hasattr(node, "source_code"):
continue
yield CodeSummary(
id=uuid5(node.id, "CodeSummary"),
made_from=node,
text=file_summaries_map[node.extracted_id],
)
for (file_index, file) in enumerate(code_files_data_points)
]
await add_data_points(summaries)
return code_files

View file

@ -31,7 +31,7 @@ def test_AudioDocument():
)
with patch.object(AudioDocument, "create_transcript", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64)
GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker')
):
assert (
ground_truth["word_count"] == paragraph_data.word_count

View file

@ -21,7 +21,7 @@ def test_ImageDocument():
with patch.object(ImageDocument, "transcribe_image", return_value=TEST_TEXT):
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=64)
GROUND_TRUTH, document.read(chunk_size=64, chunker='text_chunker')
):
assert (
ground_truth["word_count"] == paragraph_data.word_count

View file

@ -22,7 +22,7 @@ def test_PdfDocument():
)
for ground_truth, paragraph_data in zip(
GROUND_TRUTH, document.read(chunk_size=1024)
GROUND_TRUTH, document.read(chunk_size=1024, chunker='text_chunker')
):
assert (
ground_truth["word_count"] == paragraph_data.word_count

View file

@ -33,7 +33,7 @@ def test_TextDocument(input_file, chunk_size):
)
for ground_truth, paragraph_data in zip(
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size)
GROUND_TRUTH[input_file], document.read(chunk_size=chunk_size, chunker='text_chunker')
):
assert (
ground_truth["word_count"] == paragraph_data.word_count

View file

@ -7,19 +7,13 @@ from pathlib import Path
from swebench.harness.utils import load_swebench_dataset
from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
from cognee.api.v1.search import SearchType
from cognee.infrastructure.llm.get_llm_client import get_llm_client
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.modules.pipelines import Task, run_tasks
from cognee.modules.retrieval.brute_force_triplet_search import \
brute_force_triplet_search
# from cognee.shared.data_models import SummarizedContent
from cognee.shared.utils import render_graph
from cognee.tasks.repo_processor import (enrich_dependency_graph,
expand_dependency_graph,
get_repo_file_dependencies)
from cognee.tasks.storage import add_data_points
# from cognee.tasks.summarization import summarize_code
from evals.eval_utils import download_github_repo, retrieved_edges_to_string
@ -42,48 +36,22 @@ def check_install_package(package_name):
async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
import os
import pathlib
import cognee
from cognee.infrastructure.databases.relational import create_db_and_tables
file_path = Path(__file__).parent
data_directory_path = str(pathlib.Path(os.path.join(file_path, ".data_storage/code_graph")).resolve())
cognee.config.data_root_directory(data_directory_path)
cognee_directory_path = str(pathlib.Path(os.path.join(file_path, ".cognee_system/code_graph")).resolve())
cognee.config.system_root_directory(cognee_directory_path)
await cognee.prune.prune_data()
await cognee.prune.prune_system(metadata = True)
await create_db_and_tables()
# repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
repo_path = '/Users/borisarzentar/Projects/graphrag'
tasks = [
Task(get_repo_file_dependencies),
Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
Task(add_data_points, task_config = { "batch_size": 50 }),
# Task(summarize_code, summarization_model = SummarizedContent),
]
pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
pipeline = await run_code_graph_pipeline(repo_path)
async for result in pipeline:
print(result)
print('Here we have the repo under the repo_path')
await render_graph(None, include_labels = True, include_nodes = True)
await render_graph(None, include_labels=True, include_nodes=True)
problem_statement = instance['problem_statement']
instructions = read_query_prompt("patch_gen_kg_instructions.txt")
retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3, collections = ["data_point_source_code", "data_point_text"])
retrieved_edges = await brute_force_triplet_search(problem_statement, top_k=3,
collections=["data_point_source_code", "data_point_text"])
retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
prompt = "\n".join([
@ -171,7 +139,6 @@ async def main():
with open(predictions_path, "w") as file:
json.dump(preds, file)
subprocess.run(
[
"python",

View file

@ -0,0 +1,15 @@
import argparse
import asyncio
from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
async def main(repo_path):
    """Run the code-graph pipeline over *repo_path* and print each result."""
    pipeline = await run_code_graph_pipeline(repo_path)
    async for result in pipeline:
        print(result)


if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
    cli_args = arg_parser.parse_args()
    asyncio.run(main(cli_args.repo_path))