From 8edfe7c5a4e27c4b499ce98c164ce84342057bc0 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Thu, 28 Nov 2024 16:52:54 +0100
Subject: [PATCH 1/3] feat/connect code graph pipeline to benchmarking

---
 evals/eval_swe_bench.py | 69 ++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py
index 1dd0e58ab..c16e821fa 100644
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@@ -8,26 +8,35 @@
 from swebench.harness.utils import load_swebench_dataset
 from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
 import cognee
-
-from cognee.shared.data_models import SummarizedContent
-from cognee.shared.utils import render_graph
-from cognee.tasks.repo_processor import (
-    enrich_dependency_graph,
-    expand_dependency_graph,
-    get_repo_file_dependencies,
-)
-from cognee.tasks.storage import add_data_points
-from cognee.tasks.summarization import summarize_code
-from cognee.modules.pipelines import Task, run_tasks
 from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline
 from cognee.api.v1.search import SearchType
 from cognee.infrastructure.databases.graph import get_graph_engine
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from cognee.infrastructure.llm.prompts import read_query_prompt
-from evals.eval_utils import download_instances
-from evals.eval_utils import ingest_repos
-from evals.eval_utils import download_github_repo
-from evals.eval_utils import delete_repo
+from cognee.modules.pipelines import Task, run_tasks
+from cognee.modules.retrieval.brute_force_triplet_search import \
+    brute_force_triplet_search
+from cognee.shared.data_models import SummarizedContent
+from cognee.shared.utils import render_graph
+from cognee.tasks.repo_processor import (enrich_dependency_graph,
+                                         expand_dependency_graph,
+                                         get_repo_file_dependencies)
+from cognee.tasks.storage import add_data_points
+from cognee.tasks.summarization import summarize_code
+from evals.eval_utils import (delete_repo, download_github_repo,
+                              download_instances, ingest_repos)
+
+
+def node_to_string(node):
+    text = node.attributes["text"]
+    return f"Node({node.id}, {text})"
+def retrieved_edges_to_string(retrieved_edges):
+    edge_strings = []
+    for edge in retrieved_edges:
+        relationship_type = edge.attributes["relationship_type"]
+        edge_str = f"{node_to_string(edge.node1)} {relationship_type} {node_to_string(edge.node2)}"
+        edge_strings.append(edge_str)
+    return "\n".join(edge_strings)
 
 async def generate_patch_with_cognee(instance):
     await cognee.prune.prune_data()
@@ -39,19 +48,18 @@ async def generate_patch_with_cognee(instance):
 
     # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS')
-    repo_path = '/Users/borisarzentar/Projects/graphrag'
-
+    repo_path = '../minimal_repo'
 
     tasks = [
         Task(get_repo_file_dependencies),
         Task(add_data_points, task_config = { "batch_size": 50 }),
         Task(enrich_dependency_graph, task_config = { "batch_size": 50 }),
         Task(expand_dependency_graph, task_config = { "batch_size": 50 }),
         Task(add_data_points, task_config = { "batch_size": 50 }),
-        # Task(summarize_code, summarization_model = SummarizedContent),
+        Task(summarize_code, summarization_model = SummarizedContent),
     ]
 
     pipeline = run_tasks(tasks, repo_path, "cognify_code_pipeline")
-
+
     async for result in pipeline:
         print(result)
 
@@ -62,29 +70,27 @@ async def generate_patch_with_cognee(instance):
     problem_statement = instance['problem_statement']
     instructions = read_query_prompt("patch_gen_instructions.txt")
 
-    graph_str = 'HERE WE SHOULD PASS THE TRIPLETS FROM GRAPHRAG'
+    retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3)
+
+    retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
 
     prompt = "\n".join([
-        instructions,
         "",
         PATCH_EXAMPLE,
         "",
-        "This is the knowledge graph:",
-        graph_str
+        "These are the retrieved edges:",
+        retrieved_edges_str
     ])
 
-    return 0
-
-    ''' :TODO: We have to find out how do we do the generation
     llm_client = get_llm_client()
 
     answer_prediction = await llm_client.acreate_structured_output(
-        text_input=problem_statement,
-        system_prompt=prompt,
+        text_input=prompt,
+        system_prompt=instructions,
         response_model=str,
     )
     return answer_prediction
-    '''
+
 
 async def generate_patch_without_cognee(instance):
     problem_statement = instance['problem_statement']
@@ -111,12 +117,11 @@ async def get_preds(dataset, with_cognee=True):
     for instance in dataset:
         await pred_func(instance)
 
-    '''
     preds = [{"instance_id": instance["instance_id"],
               "model_patch": await pred_func(instance),
               "model_name_or_path": model_name} for instance in dataset]
-    '''
-    return 0
+
+    return preds
 
 
 async def main():
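The two helpers this patch introduces turn retrieved graph triplets into plain text that can be pasted into a prompt. A minimal, self-contained sketch of what they produce; StubNode and StubEdge are hypothetical stand-ins for cognee's real node and edge objects, which expose the same id and attributes fields among others:

    from dataclasses import dataclass, field

    @dataclass
    class StubNode:
        # Hypothetical stand-in for a cognee graph node.
        id: str
        attributes: dict = field(default_factory=dict)

    @dataclass
    class StubEdge:
        # Hypothetical stand-in for a retrieved edge (triplet).
        node1: StubNode
        node2: StubNode
        attributes: dict = field(default_factory=dict)

    def node_to_string(node):
        # Same logic as the helper added in patch 1.
        text = node.attributes["text"]
        return f"Node({node.id}, {text})"

    def retrieved_edges_to_string(retrieved_edges):
        # One "node1 relationship node2" line per retrieved triplet.
        edge_strings = []
        for edge in retrieved_edges:
            relationship_type = edge.attributes["relationship_type"]
            edge_str = f"{node_to_string(edge.node1)} {relationship_type} {node_to_string(edge.node2)}"
            edge_strings.append(edge_str)
        return "\n".join(edge_strings)

    caller = StubNode("parse_config", {"text": "def parse_config(path): ..."})
    callee = StubNode("read_file", {"text": "def read_file(path): ..."})
    edge = StubEdge(caller, callee, {"relationship_type": "calls"})
    print(retrieved_edges_to_string([edge]))
    # Node(parse_config, def parse_config(path): ...) calls Node(read_file, def read_file(path): ...)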
read_query_prompt("patch_gen_instructions.txt") - graph_str = 'HERE WE SHOULD PASS THE TRIPLETS FROM GRAPHRAG' + retrieved_edges = await brute_force_triplet_search(problem_statement, top_k = 3) + + retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) prompt = "\n".join([ - instructions, "", PATCH_EXAMPLE, "", - "This is the knowledge graph:", - graph_str + "These are the retrieved edges:", + retrieved_edges_str ]) - return 0 - - ''' :TODO: We have to find out how do we do the generation llm_client = get_llm_client() answer_prediction = await llm_client.acreate_structured_output( - text_input=problem_statement, - system_prompt=prompt, + text_input=prompt, + system_prompt=instructions, response_model=str, ) return answer_prediction - ''' + async def generate_patch_without_cognee(instance): problem_statement = instance['problem_statement'] @@ -111,12 +117,11 @@ async def get_preds(dataset, with_cognee=True): for instance in dataset: await pred_func(instance) - ''' preds = [{"instance_id": instance["instance_id"], "model_patch": await pred_func(instance), "model_name_or_path": model_name} for instance in dataset] - ''' - return 0 + + return preds async def main(): From 8f241fa6c51d7fe9db3a7b6d84cf0b45aa2938a1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 29 Nov 2024 12:01:01 +0100 Subject: [PATCH 2/3] convert edge to string --- cognee/infrastructure/llm/prompts/patch_gen_instructions.txt | 4 ++-- evals/eval_swe_bench.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt index 1553753ab..ebbb03f75 100644 --- a/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt +++ b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt @@ -1,3 +1,3 @@ -I need you to solve this issue by looking at the provided knowledge graph and -generating a single patch file that I can apply directly to this repository using git apply. +I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and +generate a single patch file that I can apply directly to this repository using git apply. Please respond with a single patch file in the following format. 
From a4c56f118d77d0ddc44c8f4071d2560ad6b19cb6 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev
Date: Fri, 29 Nov 2024 15:24:49 +0100
Subject: [PATCH 3/3] Connect code graph pipeline + retriever + benchmarking

---
 .gitignore                                         |   2 +-
 .../llm/prompts/patch_gen_kg_instructions.txt      |   4 +-
 .../retrieval/brute_force_triplet_search.py        |  16 +--
 evals/eval_swe_bench.py                            |  58 +++------
 evals/eval_utils.py                                | 119 +++---------------
 5 files changed, 45 insertions(+), 154 deletions(-)

diff --git a/.gitignore b/.gitignore
index 47fa54130..edaa94cd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,7 @@ __pycache__/
 *$py.class
 
 full_run.ipynb
-evals/
+logs/
 
 # C extensions
 *.so
diff --git a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt
index 1553753ab..ebbb03f75 100644
--- a/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt
+++ b/cognee/infrastructure/llm/prompts/patch_gen_kg_instructions.txt
@@ -1,3 +1,3 @@
-I need you to solve this issue by looking at the provided knowledge graph and
-generating a single patch file that I can apply directly to this repository using git apply.
+I need you to solve this issue by looking at the provided edges retrieved from a knowledge graph and
+generating a single patch file that I can apply directly to this repository using git apply.
 Please respond with a single patch file in the following format.
\ No newline at end of file
diff --git a/cognee/modules/retrieval/brute_force_triplet_search.py b/cognee/modules/retrieval/brute_force_triplet_search.py
index 0a4e9dea5..b5ee5b612 100644
--- a/cognee/modules/retrieval/brute_force_triplet_search.py
+++ b/cognee/modules/retrieval/brute_force_triplet_search.py
@@ -1,13 +1,15 @@
 import asyncio
 import logging
 from typing import List
-from cognee.modules.users.models import User
-from cognee.modules.users.methods import get_default_user
-from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
-from cognee.infrastructure.databases.vector import get_vector_engine
+
 from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.infrastructure.databases.vector import get_vector_engine
+from cognee.modules.graph.cognee_graph.CogneeGraph import CogneeGraph
+from cognee.modules.users.methods import get_default_user
+from cognee.modules.users.models import User
 from cognee.shared.utils import send_telemetry
 
+
 def format_triplets(edges):
     print("\n\n\n")
     def filter_attributes(obj, attributes):
@@ -48,16 +50,14 @@ def format_triplets(edges):
     return "".join(triplets)
 
 
-async def brute_force_triplet_search(query: str, user: User = None, top_k = 5) -> list:
+async def brute_force_triplet_search(query: str, user: User = None, top_k = 5, collections = None) -> list:
     if user is None:
         user = await get_default_user()
 
     if user is None:
        raise PermissionError("No user found in the system. Please create a user.")
 
-    retrieved_results = await brute_force_search(query, user, top_k)
-
-
+    retrieved_results = await brute_force_search(query, user, top_k, collections=collections)
 
     return retrieved_results
Please create a user.") - retrieved_results = await brute_force_search(query, user, top_k) - - + retrieved_results = await brute_force_search(query, user, top_k, collections=collections) return retrieved_results diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 694d5c8ea..8e6cfec8e 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -4,17 +4,24 @@ import subprocess import sys from pathlib import Path -from datasets import Dataset from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE import cognee -from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType -from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client from cognee.infrastructure.llm.prompts import read_query_prompt -from evals.eval_utils import download_instances +from cognee.modules.pipelines import Task, run_tasks +from cognee.modules.retrieval.brute_force_triplet_search import \ + brute_force_triplet_search +from cognee.shared.data_models import SummarizedContent +from cognee.shared.utils import render_graph +from cognee.tasks.repo_processor import (enrich_dependency_graph, + expand_dependency_graph, + get_repo_file_dependencies) +from cognee.tasks.storage import add_data_points +from cognee.tasks.summarization import summarize_code +from evals.eval_utils import download_github_repo, retrieved_edges_to_string def check_install_package(package_name): @@ -33,44 +40,17 @@ def check_install_package(package_name): return True except subprocess.CalledProcessError: return False -from cognee.modules.pipelines import Task, run_tasks -from cognee.modules.retrieval.brute_force_triplet_search import \ - brute_force_triplet_search -from cognee.shared.data_models import SummarizedContent -from cognee.shared.utils import render_graph -from cognee.tasks.repo_processor import (enrich_dependency_graph, - expand_dependency_graph, - get_repo_file_dependencies) -from cognee.tasks.storage import add_data_points -from cognee.tasks.summarization import summarize_code -from evals.eval_utils import (delete_repo, download_github_repo, - download_instances, ingest_repos) -def node_to_string(node): - text = node.attributes["text"] - type = node.attributes["type"] - return f"Node(id: {node.id}, type: {type}, description: {text})" -def retrieved_edges_to_string(retrieved_edges): - edge_strings = [] - for edge in retrieved_edges: - relationship_type = edge.attributes["relationship_type"] - edge_str = f"{node_to_string(edge.node1)} {relationship_type} {node_to_string(edge.node2)}" - edge_strings.append(edge_str) - return "\n".join(edge_strings) - async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): await cognee.prune.prune_data() await cognee.prune.prune_system() - #dataset_name = "SWE_test_data" - - #await cognee.add('', dataset_name = dataset_name) - # repo_path = download_github_repo(instance, '../RAW_GIT_REPOS') - - repo_path = '../minimal_repo' + + repo_path = '/Users/borisarzentar/Projects/graphrag' + tasks = [ Task(get_repo_file_dependencies), Task(add_data_points, task_config = { "batch_size": 50 }), @@ -92,11 +72,12 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp problem_statement = instance['problem_statement'] instructions = read_query_prompt("patch_gen_kg_instructions.txt") - retrieved_edges = await 
diff --git a/evals/eval_utils.py b/evals/eval_utils.py
index 3192127dc..26c4ec2b8 100644
--- a/evals/eval_utils.py
+++ b/evals/eval_utils.py
@@ -1,107 +1,7 @@
 import os
-from copy import deepcopy
-from pathlib import Path
-from tempfile import TemporaryDirectory
-
-from datasets import Dataset
-from swebench.inference.make_datasets.create_instance import make_code_text
-from swebench.inference.make_datasets.utils import (AutoContextManager,
-                                                    ingest_directory_contents)
-from tqdm.auto import tqdm
-from git import Repo
 import shutil
 
-def ingest_files(filenames):
-    files_dict = dict()
-    for filename in filenames:
-        with open(filename) as f:
-            content = f.read()
-        files_dict[filename] = content
-    return files_dict
-
-
-def ingest_repos(input_instances):
-    orig_dir = os.getcwd()
-    with TemporaryDirectory(
-        dir="/scratch" if os.path.exists("/scratch") else "/tmp"
-    ) as root_dir:
-        for instance in tqdm(
-            input_instances.values(),
-            total=len(input_instances),
-            desc="Downloading repos on specific commits",
-        ):
-            try:
-                with AutoContextManager(
-                    instance, root_dir
-                ) as cm:
-                    readmes = cm.get_readme_files()
-                    instance["readmes"] = ingest_files(readmes)
-                    instance["file_contents"] = ingest_directory_contents(
-                        cm.repo_path
-                    )
-            finally:
-                # if AutoContextManager fails to exit properly future exits will return the wrong directory
-                os.chdir(orig_dir)
-
-    return input_instances
-
-
-def extract_fields(instance):
-    readmes_text = make_code_text(instance["readmes"])
-    code_text = make_code_text(
-        instance["file_contents"], add_line_numbers=False)
-
-    text_inputs = "\n".join([readmes_text, code_text])
-    text_inputs = text_inputs.strip() + "\n\n"
-    # text_inputs = code_text
-    patch = "\n".join(["", instance["patch"], ""])
-    return {**instance, "text": text_inputs, "patch": patch}
-
-
-def create_dataset(input_instances):
-    columns = [
-        "instance_id",
-        "text",
-        "repo",
-        "base_commit",
-        "problem_statement",
-        "hints_text",
-        "created_at",
-        "patch",
-        "test_patch",
-        "version",
-        "FAIL_TO_PASS",
-        "PASS_TO_PASS",
-        "environment_setup_commit",
-    ]
-
-    data_table = {key: list() for key in columns}
-    for instance in input_instances.values():
-        datum = extract_fields(instance)
-        for key in columns:
-            data_table[key].append(datum[key] if key in datum else "")
-    dataset = Dataset.from_dict(data_table)
-
-    return dataset
-
-
-def download_instances(
-    input_data,
-    path=Path("SWE-bench_testsample"),
-    verbose=False,
-):
-    """Downloads code from github.
-
-    Args:
-    - input_data: dictionary with unprocessed input instances.
-    - verbose: set ContextManager verbose to True
-    """
-    input_instances = {x["instance_id"]: x for x in input_data}
-    input_instances_copy = deepcopy(input_instances)
-    input_instances_with_text = ingest_repos(input_instances_copy)
-    dataset = create_dataset(input_instances_with_text)
-    dataset.save_to_disk(path)
-    return dataset
+from git import Repo
 
 
 def download_github_repo(instance, output_dir):
@@ -154,4 +54,19 @@ def delete_repo(repo_path):
         else:
             print(f"Repository path {repo_path} does not exist. Nothing to delete.")
     except Exception as e:
-        print(f"Error deleting repository at {repo_path}: {e}")
\ No newline at end of file
+        print(f"Error deleting repository at {repo_path}: {e}")
+
+
+def node_to_string(node):
+    text = node.attributes["text"]
+    node_type = node.attributes["type"]
+    return f"Node(id: {node.id}, type: {node_type}, description: {text})"
+
+
+def retrieved_edges_to_string(retrieved_edges):
+    edge_strings = []
+    for edge in retrieved_edges:
+        relationship_type = edge.attributes["relationship_type"]
+        edge_str = f"{node_to_string(edge.node1)} {relationship_type} {node_to_string(edge.node2)}"
+        edge_strings.append(edge_str)
+    return "\n".join(edge_strings)
\ No newline at end of file
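Taken together, the series leaves generate_patch_with_cognee doing three things: build the code graph with the task pipeline, retrieve triplets for the issue text, and prompt the model for a patch. A condensed sketch of the retrieval-plus-generation half, assuming the cognee APIs exactly as they are used in the patches above (patch_for is an illustrative name, the repository is assumed to be already ingested by the cognify_code_pipeline task sequence, and PATCH_EXAMPLE is omitted for brevity):

    from cognee.infrastructure.llm.get_llm_client import get_llm_client
    from cognee.infrastructure.llm.prompts import read_query_prompt
    from cognee.modules.retrieval.brute_force_triplet_search import \
        brute_force_triplet_search
    from evals.eval_utils import retrieved_edges_to_string


    async def patch_for(instance):
        # Retrieve the triplets most relevant to the issue text, scoped to
        # the vector collections introduced in patch 3.
        retrieved_edges = await brute_force_triplet_search(
            instance["problem_statement"],
            top_k = 3,
            collections = ["data_point_source_code", "data_point_text"],
        )

        # Assemble the prompt the way the final eval script does: problem
        # statement first, then the stringified edges; the instruction file
        # is passed separately as the system prompt.
        prompt = "\n".join([
            instance["problem_statement"],
            "These are the retrieved edges:",
            retrieved_edges_to_string(retrieved_edges),
        ])

        llm_client = get_llm_client()
        return await llm_client.acreate_structured_output(
            text_input = prompt,
            system_prompt = read_query_prompt("patch_gen_kg_instructions.txt"),
            response_model = str,
        )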