Feat/cog 544 eval on swe bench (#5)

Evaluation script for SWE-bench benchmarking with and without cognee
2024-11-20 16:30:28 +01:00 · 2024-11-20 16:30:28 +01:00 · ad08b53ed7
commit ad08b53ed7
parent acd88e83e2 98e693ad97
3 changed files with 224 additions and 0 deletions
--- a/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt
+++ b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt
@ -0,0 +1,3 @@
+I need you to solve this issue by looking at the provided knowledge graph and 
+generating a single patch file that I can apply directly to this repository using git apply. 
+Please respond with a single patch file in the following format.
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@ -0,0 +1,118 @@
+import argparse
+import json
+import subprocess
+from pathlib import Path
+
+from datasets import Dataset
+from swebench.harness.utils import load_swebench_dataset
+from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
+
+import cognee
+from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline
+from cognee.api.v1.search import SearchType
+from cognee.infrastructure.databases.graph import get_graph_engine
+from cognee.infrastructure.llm.get_llm_client import get_llm_client
+from cognee.infrastructure.llm.prompts import read_query_prompt
+from evals.eval_utils import download_instances
+
+
+async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS):
+    """Ask the LLM for a patch, using a cognee code graph as context.
+
+    Prunes cognee data and system state, adds ``instance["text"]`` to the
+    ``SWE_test_data`` dataset, runs the code graph pipeline on that dataset
+    and places the raw contents of the graph engine's file in the system
+    prompt, after the patch-generation instructions and ``PATCH_EXAMPLE``.
+
+    Args:
+        instance: Mapping with at least the ``"text"`` and
+            ``"problem_statement"`` keys.
+        search_type: Not read by the current implementation.
+
+    Returns:
+        Whatever ``acreate_structured_output`` returns when called with
+        ``response_model=str``.
+    """
+    await cognee.prune.prune_data()
+    await cognee.prune.prune_system(metadata=True)
+
+    dataset_name = "SWE_test_data"
+    await cognee.add([instance["text"]], dataset_name)
+    await code_graph_pipeline([dataset_name])
+
+    # NOTE(review): relies on the graph engine exposing a ``filename``
+    # attribute, i.e. presumably a file-backed engine -- confirm.
+    engine = await get_graph_engine()
+    with open(engine.filename, "r") as graph_file:
+        serialized_graph = graph_file.read()
+
+    issue_text = instance['problem_statement']
+    system_prompt = "\n".join((
+        read_query_prompt("patch_gen_instructions.txt"),
+        "<patch>",
+        PATCH_EXAMPLE,
+        "</patch>",
+        "This is the knowledge graph:",
+        serialized_graph,
+    ))
+
+    # The issue text is the user input; the graph travels in the system prompt.
+    return await get_llm_client().acreate_structured_output(
+        text_input=issue_text,
+        system_prompt=system_prompt,
+        response_model=str,
+    )
+
+
+async def generate_patch_without_cognee(instance):
+    """Ask the LLM for a patch using only the text stored on the instance.
+
+    Args:
+        instance: Mapping with at least the ``"problem_statement"`` and
+            ``"text"`` keys. ``instance["text"]`` is sent as the system
+            prompt and ``instance["problem_statement"]`` as the text input.
+
+    Returns:
+        Whatever ``acreate_structured_output`` returns when called with
+        ``response_model=str``.
+    """
+    issue_text = instance['problem_statement']
+    system_prompt = instance["text"]
+
+    return await get_llm_client().acreate_structured_output(
+        text_input=issue_text,
+        system_prompt=system_prompt,
+        response_model=str,
+    )
+
+
+async def get_preds(dataset, with_cognee=True):
+    """Build one prediction record per instance of ``dataset``.
+
+    Args:
+        dataset: Iterable of instances; each must provide ``"instance_id"``
+            plus whatever the selected patch generator reads.
+        with_cognee: When true, patches come from
+            ``generate_patch_with_cognee`` and records are labelled
+            ``"with_cognee"``; otherwise ``generate_patch_without_cognee``
+            is used and the label is ``"without_cognee"``.
+
+    Returns:
+        A list of dicts with the keys ``"instance_id"``, ``"model_patch"``
+        and ``"model_name_or_path"``.
+    """
+    model_name, pred_func = (
+        ("with_cognee", generate_patch_with_cognee)
+        if with_cognee
+        else ("without_cognee", generate_patch_without_cognee)
+    )
+
+    preds = []
+    # Instances are handled one at a time: each patch is awaited before the
+    # next instance is looked at.
+    for instance in dataset:
+        preds.append({
+            "instance_id": instance["instance_id"],
+            "model_patch": await pred_func(instance),
+            "model_name_or_path": model_name,
+        })
+
+    return preds
+
+
+async def main():
+    """Generate SWE-bench predictions and run the SWE-bench harness on them.
+
+    Command line:
+        --cognee_off  Predict from the ``SWE-bench_Lite_bm25_13K`` dataset
+                      without cognee. Predictions go to
+                      ``preds_nocognee.json`` and are only generated when
+                      that file does not exist yet.
+
+    Without the flag, the first test instance of ``SWE-bench_Lite`` is used.
+    Its prepared dataset is loaded from ``SWE-bench_testsample`` when that
+    path exists and built with ``download_instances`` otherwise. Predictions
+    are always regenerated and written to ``preds.json``.
+
+    Finally ``swebench.harness.run_evaluation`` is run as a subprocess on
+    the predictions file.
+
+    Raises:
+        subprocess.CalledProcessError: If the evaluation harness exits with
+            a non-zero status.
+    """
+    # Local import: only this function needs it.
+    import sys
+
+    parser = argparse.ArgumentParser(
+        description="Run LLM predictions on SWE-bench dataset")
+    parser.add_argument('--cognee_off', action='store_true')
+    args = parser.parse_args()
+
+    if args.cognee_off:
+        dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K'
+        dataset = load_swebench_dataset(dataset_name, split='test')
+        predictions_path = "preds_nocognee.json"
+        # Existing predictions are reused instead of being regenerated.
+        if not Path(predictions_path).exists():
+            preds = await get_preds(dataset, with_cognee=False)
+            with open(predictions_path, "w") as file:
+                json.dump(preds, file)
+    else:
+        dataset_name = 'princeton-nlp/SWE-bench_Lite'
+        # Only the first test instance is evaluated in this mode.
+        swe_dataset = load_swebench_dataset(
+            dataset_name, split='test')[:1]
+        filepath = Path("SWE-bench_testsample")
+        if filepath.exists():
+            dataset = Dataset.load_from_disk(filepath)
+        else:
+            dataset = download_instances(swe_dataset, filepath)
+        predictions_path = "preds.json"
+        # This branch is only reached when --cognee_off was not given.
+        preds = await get_preds(dataset, with_cognee=True)
+        with open(predictions_path, "w") as file:
+            json.dump(preds, file)
+
+    # sys.executable makes the harness run under the same interpreter (and
+    # therefore the same environment) as this script, rather than whatever
+    # "python" happens to resolve to on PATH. check=True surfaces a failed
+    # evaluation instead of letting the script finish as if it had succeeded.
+    subprocess.run(
+        [sys.executable, "-m", "swebench.harness.run_evaluation",
+         "--dataset_name", dataset_name,
+         "--split", "test",
+         "--predictions_path", predictions_path,
+         "--max_workers", "1",
+         "--run_id", "test_run"],
+        check=True,
+    )
+
+if __name__ == "__main__":
+    import asyncio
+    # Script entry point: run the async main(); debug=True turns on asyncio's
+    # debug mode for the run.
+    asyncio.run(main(), debug=True)
--- a/evals/eval_utils.py
+++ b/evals/eval_utils.py
@ -0,0 +1,103 @@
+import os
+from copy import deepcopy
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from datasets import Dataset
+from swebench.inference.make_datasets.create_instance import make_code_text
+from swebench.inference.make_datasets.utils import (AutoContextManager,
+                                                    ingest_directory_contents)
+from tqdm.auto import tqdm
+
+
+def ingest_files(filenames):
+    """Read every file listed in ``filenames`` as text.
+
+    Args:
+        filenames: Iterable of paths to read.
+
+    Returns:
+        A dict mapping each entry of ``filenames``, unchanged, to the full
+        text content of that file.
+    """
+    return {name: Path(name).read_text() for name in filenames}
+
+
+def ingest_repos(input_instances):
+    """Attach README and repository file contents to every instance.
+
+    For each instance an ``AutoContextManager`` is opened on a shared
+    temporary root directory (created under ``/scratch`` when that path
+    exists, otherwise under ``/tmp``). The instance then receives two new
+    keys: ``"readmes"`` (README paths mapped to their text) and
+    ``"file_contents"`` (the result of ``ingest_directory_contents`` on the
+    context manager's ``repo_path``).
+
+    Args:
+        input_instances: Dict of instances; its values are modified in place.
+
+    Returns:
+        The same ``input_instances`` object that was passed in.
+    """
+    start_dir = os.getcwd()
+    scratch_parent = "/scratch" if os.path.exists("/scratch") else "/tmp"
+
+    with TemporaryDirectory(dir=scratch_parent) as workspace:
+        progress = tqdm(
+            input_instances.values(),
+            total=len(input_instances),
+            desc="Downloading repos on specific commits",
+        )
+        for instance in progress:
+            try:
+                with AutoContextManager(instance, workspace) as repo:
+                    instance["readmes"] = ingest_files(
+                        repo.get_readme_files()
+                    )
+                    instance["file_contents"] = ingest_directory_contents(
+                        repo.repo_path
+                    )
+            finally:
+                # Always return to the starting directory: if the context
+                # manager does not exit cleanly, later exits would otherwise
+                # restore the wrong working directory.
+                os.chdir(start_dir)
+
+    return input_instances
+
+
+def extract_fields(instance):
+    """Return a copy of ``instance`` with its prompt text and wrapped patch.
+
+    ``"text"`` is the README text followed by the code text (the latter
+    rendered with ``add_line_numbers=False``), joined by a newline, stripped
+    and terminated with two newlines. ``"patch"`` is the original patch
+    wrapped in ``<patch>`` / ``</patch>`` lines.
+
+    Args:
+        instance: Mapping with at least the ``"readmes"``,
+            ``"file_contents"`` and ``"patch"`` keys.
+
+    Returns:
+        A new dict with every key of ``instance`` plus the ``"text"`` and
+        ``"patch"`` values described above.
+    """
+    readme_section = make_code_text(instance["readmes"])
+    code_section = make_code_text(
+        instance["file_contents"], add_line_numbers=False)
+
+    combined_text = "\n".join((readme_section, code_section)).strip()
+    wrapped_patch = "\n".join(("<patch>", instance["patch"], "</patch>"))
+
+    fields = dict(instance)
+    fields.update(text=combined_text + "\n\n", patch=wrapped_patch)
+    return fields
+
+
+def create_dataset(input_instances):
+    """Turn ingested instances into a column-oriented ``Dataset``.
+
+    Every instance is passed through ``extract_fields``; a column that an
+    instance does not provide is filled with an empty string.
+
+    Args:
+        input_instances: Dict whose values are the instances to include.
+
+    Returns:
+        The ``Dataset`` built with ``Dataset.from_dict`` from the fixed set
+        of columns listed below.
+    """
+    columns = (
+        "instance_id",
+        "text",
+        "repo",
+        "base_commit",
+        "problem_statement",
+        "hints_text",
+        "created_at",
+        "patch",
+        "test_patch",
+        "version",
+        "FAIL_TO_PASS",
+        "PASS_TO_PASS",
+        "environment_setup_commit",
+    )
+
+    rows = [extract_fields(instance) for instance in input_instances.values()]
+    # Pivot the per-instance rows into one list of values per column.
+    data_table = {
+        column: [row.get(column, "") for row in rows]
+        for column in columns
+    }
+
+    return Dataset.from_dict(data_table)
+
+
+def download_instances(
+    input_data,
+    path=Path("SWE-bench_testsample"),
+    verbose=False,
+):
+    """Build a dataset with repository text for the given instances.
+
+    The instances are indexed by ``"instance_id"`` and deep-copied, so the
+    objects in ``input_data`` are not modified. The copies are enriched by
+    ``ingest_repos``, converted with ``create_dataset`` and the result is
+    saved to ``path``.
+
+    Args:
+    - input_data: iterable of unprocessed input instances, each with an
+      "instance_id" key.
+    - path: location the resulting dataset is saved to.
+    - verbose: accepted but not read by the current implementation.
+
+    Returns:
+    - the dataset that was saved to ``path``.
+    """
+    instances_by_id = deepcopy(
+        {item["instance_id"]: item for item in input_data}
+    )
+    dataset = create_dataset(ingest_repos(instances_by_id))
+    dataset.save_to_disk(path)
+    return dataset