From 26d914eebc68a3c9c6c72df96525e3c15b1510c6 Mon Sep 17 00:00:00 2001 From: Rita Date: Mon, 11 Nov 2024 16:48:06 +0000 Subject: [PATCH 01/10] Loading SWE-bench dataset and converting it to DeepEval dataset --- evals/eval_on_swe_bench.py | 167 +++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 evals/eval_on_swe_bench.py diff --git a/evals/eval_on_swe_bench.py b/evals/eval_on_swe_bench.py new file mode 100644 index 000000000..b5b3f14b6 --- /dev/null +++ b/evals/eval_on_swe_bench.py @@ -0,0 +1,167 @@ +from typing import List, Dict, Type +from swebench.harness.utils import load_swebench_dataset +from deepeval.dataset import EvaluationDataset +from deepeval.test_case import LLMTestCase +from pydantic import BaseModel + +from deepeval.synthesizer import Synthesizer + + +# DeepEval dataset for reference +# synthesizer = Synthesizer() +# synthesizer.generate_goldens_from_docs( +# document_paths=['/app/.data/short_stories/soldiers_home.pdf'], +# include_expected_output=True +# ) + +def convert_swe_to_deepeval(swe_dataset: List[Dict]): + deepeval_dataset = EvaluationDataset() + for datum in swe_dataset: + input = datum["problem_statement"] + expected_output = datum["patch"] + context = [datum["text"]] + # retrieval_context = datum.get(retrieval_context_key_name) + # tools_called = datum.get(tools_called_key_name) + # expected_tools = json_obj.get(expected_tools_key_name) + + deepeval_dataset.add_test_case( + LLMTestCase( + input=input, + actual_output=None, + expected_output=expected_output, + context=context, + # retrieval_context=retrieval_context, + # tools_called=tools_called, + # expected_tools=expected_tools, + ) + ) + return deepeval_dataset + + +from cognee.infrastructure.llm.get_llm_client import get_llm_client + +swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +deepeval_dataset = convert_swe_to_deepeval(swe_dataset) + +import logging + +logger = logging.getLogger(__name__) + +class 
AnswerModel(BaseModel): + response:str + +def get_answer_base(content: str, context:str, response_model: Type[BaseModel]): + llm_client = get_llm_client() + + system_prompt = "THIS IS YOUR CONTEXT:" + str(context) + + return llm_client.create_structured_output(content, system_prompt, response_model) + +def get_answer(content: str,context, model: Type[BaseModel]= AnswerModel): + + try: + return (get_answer_base( + content, + context, + model + )) + except Exception as error: + logger.error("Error extracting cognitive layers from content: %s", error, exc_info = True) + raise error + +async def run_cognify_base_rag(): + from cognee.api.v1.add import add + from cognee.api.v1.prune import prune + from cognee.api.v1.cognify.cognify import cognify + + await prune.prune_system() + + await add("data://test_datasets", "initial_test") + + graph = await cognify("initial_test") + pass + + +import os +from cognee.base_config import get_base_config +from cognee.infrastructure.databases.vector import get_vector_engine + +async def cognify_search_base_rag(content:str, context:str): + base_config = get_base_config() + + cognee_directory_path = os.path.abspath(".cognee_system") + base_config.system_root_directory = cognee_directory_path + + vector_engine = get_vector_engine() + + return_ = await vector_engine.search(collection_name="basic_rag", query_text=content, limit=10) + + print("results", return_) + return return_ + +async def cognify_search_graph(content:str, context:str): + from cognee.api.v1.search import search, SearchType + params = {'query': 'Donald Trump'} + + results = await search(SearchType.INSIGHTS, params) + print("results", results) + return results + + +def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLMTestCase]: + test_cases = [] + for case in test_cases_raw: + test_case = LLMTestCase( + input=case.input, + # Generate actual output using the 'input' and 'additional_metadata' + actual_output= str(get_answer(case.input, 
case.context).model_dump()['response']), + expected_output=case.expected_output, + context=case.context, + retrieval_context=["retrieval_context"], + ) + test_cases.append(test_case) + return test_cases + +def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): + deepeval_dataset = EvaluationDataset() + for datum in swe_dataset[:4]: + input = datum["problem_statement"] + expected_output = datum["patch"] + context = [datum["text"]] + # retrieval_context = datum.get(retrieval_context_key_name) + # tools_called = datum.get(tools_called_key_name) + # expected_tools = json_obj.get(expected_tools_key_name) + + deepeval_dataset.add_test_case( + LLMTestCase( + input=input, + actual_output= str(get_answer(input, context).model_dump()['response']), + expected_output=expected_output, + context=context, + # retrieval_context=retrieval_context, + # tools_called=tools_called, + # expected_tools=expected_tools, + ) + ) + return deepeval_dataset + +swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) + +if __name__ == "__main__": + + import asyncio + + async def main(): + # await run_cognify_base_rag() + # await cognify_search_base_rag("show_all_processes", "context") + await cognify_search_graph("show_all_processes", "context") + asyncio.run(main()) + # run_cognify_base_rag_and_search() + # # Data preprocessing before setting the dataset test cases + swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + test_dataset = convert_swe_to_deepeval_testcases(dataset.test_cases) + from deepeval.metrics import HallucinationMetric + metric = HallucinationMetric() + evalresult = dataset.evaluate([metric]) + pass \ No newline at end of file From aa95aa21afc585f643ae9cd6c732dcb1ba2d3a9b Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 12 Nov 2024 17:40:42 +0100 Subject: [PATCH 02/10] downloading example repo for eval --- evals/{eval_on_swe_bench.py 
=> deepeval_on_swe_bench.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename evals/{eval_on_swe_bench.py => deepeval_on_swe_bench.py} (97%) diff --git a/evals/eval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py similarity index 97% rename from evals/eval_on_swe_bench.py rename to evals/deepeval_on_swe_bench.py index b5b3f14b6..b191a08d4 100644 --- a/evals/eval_on_swe_bench.py +++ b/evals/deepeval_on_swe_bench.py @@ -160,8 +160,8 @@ if __name__ == "__main__": # run_cognify_base_rag_and_search() # # Data preprocessing before setting the dataset test cases swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') - test_dataset = convert_swe_to_deepeval_testcases(dataset.test_cases) + test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) from deepeval.metrics import HallucinationMetric metric = HallucinationMetric() - evalresult = dataset.evaluate([metric]) + evalresult = test_dataset.evaluate([metric]) pass \ No newline at end of file From 094ba7233e27583c97baf4b2a87e2c642348a8c6 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Thu, 14 Nov 2024 16:28:03 +0100 Subject: [PATCH 03/10] Running inference with and without cognee --- evals/eval_swe_bench.py | 81 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 evals/eval_swe_bench.py diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py new file mode 100644 index 000000000..8e8327fd5 --- /dev/null +++ b/evals/eval_swe_bench.py @@ -0,0 +1,81 @@ +from swebench.harness.utils import load_swebench_dataset +from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE +from evals.eval_utils import download_instances +import cognee +from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline +from cognee.api.v1.search import SearchType +import os +from pathlib import Path +from cognee.infrastructure.databases.graph import get_graph_engine +from cognee.infrastructure.llm.get_llm_client import 
get_llm_client +from cognee.shared.data_models import Answer + +async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): + await cognee.prune.prune_data() + await cognee.prune.prune_system(metadata = True) + + dataset_name = "SWE_test_data" + code_text = dataset[0]["text"][:100000] + await cognee.add([code_text], dataset_name) + await cognee.cognify([dataset_name]) + graph_engine = await get_graph_engine() + with open(graph_engine.filename, "r") as f: + graph_str = f.read() + + problem_statement = dataset[0]['problem_statement'] + instructions = ( + f"I need you to solve this issue by looking at the provided knowledge graph and by " + + f"generating a single patch file that I can apply directly to this repository " + + f"using git apply. Please respond with a single patch " + + f"file in the following format." + ) + + prompt = "\n".join([ + instructions, + "", + PATCH_EXAMPLE, + "", + "This is the knowledge graph:", + graph_str + ]) + + llm_client = get_llm_client() + answer_prediction = llm_client.create_structured_output( + text_input = problem_statement, + system_prompt = prompt, + response_model = str, + ) + return answer_prediction + +def llm_on_preprocessed_data(dataset): + problem_statement = dataset[0]['problem_statement'] + prompt = dataset[0]["text"] + + llm_client = get_llm_client() + answer_prediction = llm_client.create_structured_output( + text_input = problem_statement, + system_prompt = prompt, # TODO check if this is correct + response_model = str, + ) + return answer_prediction + + +async def main(): + swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') + swe_dataset_preprocessed = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + test_data = swe_dataset[:1] + test_data_preprocessed = swe_dataset_preprocessed[:1] + assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] + filepath = Path("SWE-bench_testsample") + if filepath.exists(): + from datasets import Dataset 
+ dataset = Dataset.load_from_disk(filepath) + else: + dataset = download_instances(test_data, filepath) + + llm_output_with_cognee = await cognee_and_llm(dataset) + llm_output_without_cognee = llm_on_preprocessed_data(test_data_preprocessed) + +if __name__ == "__main__": + import asyncio + asyncio.run(main(), debug=True) From 721fde3d602f527fa1944e8c9ef006c4555c9f24 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Fri, 15 Nov 2024 17:14:43 +0100 Subject: [PATCH 04/10] generating testspecs for data --- evals/eval_swe_bench.py | 57 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 8e8327fd5..3aabfcba3 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,14 +1,17 @@ from swebench.harness.utils import load_swebench_dataset +from swebench.harness.run_evaluation import get_dataset_from_preds +from swebench.harness.run_evaluation import run_instances +from swebench.harness.test_spec import make_test_spec, TestSpec + +import subprocess from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE from evals.eval_utils import download_instances import cognee from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType -import os from pathlib import Path from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client -from cognee.shared.data_models import Answer async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): await cognee.prune.prune_data() @@ -47,7 +50,8 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): ) return answer_prediction -def llm_on_preprocessed_data(dataset): + +async def llm_on_preprocessed_data(dataset): problem_statement = dataset[0]['problem_statement'] prompt = dataset[0]["text"] @@ -59,6 +63,47 @@ def llm_on_preprocessed_data(dataset): ) return 
answer_prediction +async def get_preds(dataset, with_cognee): + if with_cognee: + text_output = await cognee_and_llm(dataset) + model_name = "with_cognee" + else: + text_output = await llm_on_preprocessed_data(dataset) + model_name = "without_cognee" + + preds = {dataset[0]["instance_id"]: + {"instance_id": dataset[0]["instance_id"], + "model_patch": text_output, + "model_name_or_path": model_name}} + + dataset_name = 'princeton-nlp/SWE-bench' if with_cognee else 'princeton-nlp/SWE-bench_bm25_13K' + preds_dataset = get_dataset_from_preds(dataset_name, + "test", + [dataset[0]["instance_id"]], + preds, + model_name) + + return preds, preds_dataset + +async def evaluate(test_specs: list[TestSpec], + preds: dict, + ): + for test_spec in test_specs: + pred = preds[test_spec.instance_id] + log_dir = Path("logs") + log_dir.mkdir(parents=True, exist_ok=True) + + patch_file = Path(log_dir / "patch.diff") + patch_file.write_text(pred["model_patch"] or "") + for command in test_spec.repo_script_list: + if "/testbed" in command: + command = command.replace("/testbed", "./testbed") + result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) + print(result) + + subprocess.run("git apply --allow-empty -v logs/patch.diff", shell=True, capture_output=True, text=True) + + async def main(): swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') @@ -73,8 +118,10 @@ async def main(): else: dataset = download_instances(test_data, filepath) - llm_output_with_cognee = await cognee_and_llm(dataset) - llm_output_without_cognee = llm_on_preprocessed_data(test_data_preprocessed) + cognee_preds, cognee_preds_dataset = await get_preds(dataset, with_cognee=True) + # nocognee_preds = await get_preds(dataset, with_cognee=False) + test_specs = list(map(make_test_spec, test_data)) + results = await evaluate(test_specs, cognee_preds) if __name__ == "__main__": import asyncio From ed08cdb9f9cff6a3f33d1a5f7f81cc66835a4a1a Mon Sep 17 00:00:00 2001 
From: Rita Aleksziev Date: Fri, 15 Nov 2024 17:54:41 +0100 Subject: [PATCH 05/10] using the code graph pipeline instead of cognify --- evals/eval_swe_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 3aabfcba3..9acb176b7 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -20,7 +20,7 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): dataset_name = "SWE_test_data" code_text = dataset[0]["text"][:100000] await cognee.add([code_text], dataset_name) - await cognee.cognify([dataset_name]) + await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: graph_str = f.read() @@ -63,7 +63,7 @@ async def llm_on_preprocessed_data(dataset): ) return answer_prediction -async def get_preds(dataset, with_cognee): +async def get_preds(dataset, with_cognee=True): if with_cognee: text_output = await cognee_and_llm(dataset) model_name = "with_cognee" From 98e3445c2cc815fa2aff7a9046253a34ce4603ed Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 18 Nov 2024 15:02:16 +0100 Subject: [PATCH 06/10] running swebench evaluation as subprocess --- evals/deepeval_on_swe_bench.py | 59 +++++++++-------- evals/eval_swe_bench.py | 116 ++++++++++++++------------------- evals/eval_utils.py | 107 ++++++++++++++++++++++++++++++ 3 files changed, 190 insertions(+), 92 deletions(-) create mode 100644 evals/eval_utils.py diff --git a/evals/deepeval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py index b191a08d4..8cb94abb3 100644 --- a/evals/deepeval_on_swe_bench.py +++ b/evals/deepeval_on_swe_bench.py @@ -1,3 +1,8 @@ +from cognee.infrastructure.databases.vector import get_vector_engine +from cognee.base_config import get_base_config +import os +import logging +from cognee.infrastructure.llm.get_llm_client import get_llm_client from typing import List, Dict, Type from swebench.harness.utils import 
load_swebench_dataset from deepeval.dataset import EvaluationDataset @@ -21,8 +26,6 @@ def convert_swe_to_deepeval(swe_dataset: List[Dict]): expected_output = datum["patch"] context = [datum["text"]] # retrieval_context = datum.get(retrieval_context_key_name) - # tools_called = datum.get(tools_called_key_name) - # expected_tools = json_obj.get(expected_tools_key_name) deepeval_dataset.add_test_case( LLMTestCase( @@ -31,33 +34,32 @@ def convert_swe_to_deepeval(swe_dataset: List[Dict]): expected_output=expected_output, context=context, # retrieval_context=retrieval_context, - # tools_called=tools_called, - # expected_tools=expected_tools, ) ) return deepeval_dataset -from cognee.infrastructure.llm.get_llm_client import get_llm_client - -swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') +swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') deepeval_dataset = convert_swe_to_deepeval(swe_dataset) -import logging logger = logging.getLogger(__name__) -class AnswerModel(BaseModel): - response:str -def get_answer_base(content: str, context:str, response_model: Type[BaseModel]): +class AnswerModel(BaseModel): + response: str + + +def get_answer_base(content: str, context: str, response_model: Type[BaseModel]): llm_client = get_llm_client() system_prompt = "THIS IS YOUR CONTEXT:" + str(context) - return llm_client.create_structured_output(content, system_prompt, response_model) + return llm_client.create_structured_output(content, system_prompt, response_model) -def get_answer(content: str,context, model: Type[BaseModel]= AnswerModel): + +def get_answer(content: str, context, model: Type[BaseModel] = AnswerModel): try: return (get_answer_base( @@ -66,9 +68,11 @@ def get_answer(content: str,context, model: Type[BaseModel]= AnswerModel): model )) except Exception as error: - logger.error("Error extracting cognitive layers from content: %s", error, exc_info = True) + logger.error( + "Error extracting 
cognitive layers from content: %s", error, exc_info=True) raise error + async def run_cognify_base_rag(): from cognee.api.v1.add import add from cognee.api.v1.prune import prune @@ -82,11 +86,7 @@ async def run_cognify_base_rag(): pass -import os -from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector import get_vector_engine - -async def cognify_search_base_rag(content:str, context:str): +async def cognify_search_base_rag(content: str, context: str): base_config = get_base_config() cognee_directory_path = os.path.abspath(".cognee_system") @@ -99,7 +99,8 @@ async def cognify_search_base_rag(content:str, context:str): print("results", return_) return return_ -async def cognify_search_graph(content:str, context:str): + +async def cognify_search_graph(content: str, context: str): from cognee.api.v1.search import search, SearchType params = {'query': 'Donald Trump'} @@ -114,7 +115,8 @@ def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLM test_case = LLMTestCase( input=case.input, # Generate actual output using the 'input' and 'additional_metadata' - actual_output= str(get_answer(case.input, case.context).model_dump()['response']), + actual_output=str(get_answer( + case.input, case.context).model_dump()['response']), expected_output=case.expected_output, context=case.context, retrieval_context=["retrieval_context"], @@ -122,6 +124,7 @@ def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLM test_cases.append(test_case) return test_cases + def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): deepeval_dataset = EvaluationDataset() for datum in swe_dataset[:4]: @@ -135,7 +138,8 @@ def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): deepeval_dataset.add_test_case( LLMTestCase( input=input, - actual_output= str(get_answer(input, context).model_dump()['response']), + actual_output=str(get_answer( + input, context).model_dump()['response']), 
expected_output=expected_output, context=context, # retrieval_context=retrieval_context, @@ -145,9 +149,11 @@ def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): ) return deepeval_dataset -swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + +swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - + if __name__ == "__main__": import asyncio @@ -159,9 +165,10 @@ if __name__ == "__main__": asyncio.run(main()) # run_cognify_base_rag_and_search() # # Data preprocessing before setting the dataset test cases - swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') + swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) from deepeval.metrics import HallucinationMetric metric = HallucinationMetric() evalresult = test_dataset.evaluate([metric]) - pass \ No newline at end of file + pass diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 9acb176b7..2cb221576 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,38 +1,38 @@ -from swebench.harness.utils import load_swebench_dataset -from swebench.harness.run_evaluation import get_dataset_from_preds -from swebench.harness.run_evaluation import run_instances -from swebench.harness.test_spec import make_test_spec, TestSpec - +import json import subprocess +from pathlib import Path + +from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE -from evals.eval_utils import download_instances + import cognee from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType -from pathlib import Path from cognee.infrastructure.databases.graph import get_graph_engine from 
cognee.infrastructure.llm.get_llm_client import get_llm_client +from evals.eval_utils import download_instances -async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): + +async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): await cognee.prune.prune_data() - await cognee.prune.prune_system(metadata = True) + await cognee.prune.prune_system(metadata=True) dataset_name = "SWE_test_data" - code_text = dataset[0]["text"][:100000] + code_text = dataset[0]["text"] await cognee.add([code_text], dataset_name) await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: - graph_str = f.read() - + graph_str = f.read() + problem_statement = dataset[0]['problem_statement'] instructions = ( - f"I need you to solve this issue by looking at the provided knowledge graph and by " - + f"generating a single patch file that I can apply directly to this repository " - + f"using git apply. Please respond with a single patch " - + f"file in the following format." + "I need you to solve this issue by looking at the provided knowledge graph and by " + + "generating a single patch file that I can apply directly to this repository " + + "using git apply. Please respond with a single patch " + + "file in the following format." 
) - + prompt = "\n".join([ instructions, "", @@ -41,28 +41,29 @@ async def cognee_and_llm(dataset, search_type = SearchType.CHUNKS): "This is the knowledge graph:", graph_str ]) - + llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( - text_input = problem_statement, - system_prompt = prompt, - response_model = str, - ) + text_input=problem_statement, + system_prompt=prompt, + response_model=str, + ) return answer_prediction async def llm_on_preprocessed_data(dataset): problem_statement = dataset[0]['problem_statement'] prompt = dataset[0]["text"] - + llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( - text_input = problem_statement, - system_prompt = prompt, # TODO check if this is correct - response_model = str, - ) + text_input=problem_statement, + system_prompt=prompt, + response_model=str, + ) return answer_prediction + async def get_preds(dataset, with_cognee=True): if with_cognee: text_output = await cognee_and_llm(dataset) @@ -70,46 +71,21 @@ async def get_preds(dataset, with_cognee=True): else: text_output = await llm_on_preprocessed_data(dataset) model_name = "without_cognee" - - preds = {dataset[0]["instance_id"]: - {"instance_id": dataset[0]["instance_id"], - "model_patch": text_output, - "model_name_or_path": model_name}} - - dataset_name = 'princeton-nlp/SWE-bench' if with_cognee else 'princeton-nlp/SWE-bench_bm25_13K' - preds_dataset = get_dataset_from_preds(dataset_name, - "test", - [dataset[0]["instance_id"]], - preds, - model_name) - - return preds, preds_dataset -async def evaluate(test_specs: list[TestSpec], - preds: dict, - ): - for test_spec in test_specs: - pred = preds[test_spec.instance_id] - log_dir = Path("logs") - log_dir.mkdir(parents=True, exist_ok=True) + preds = [{"instance_id": dataset[0]["instance_id"], + "model_patch": text_output, + "model_name_or_path": model_name}] - patch_file = Path(log_dir / "patch.diff") - patch_file.write_text(pred["model_patch"] or 
"") - for command in test_spec.repo_script_list: - if "/testbed" in command: - command = command.replace("/testbed", "./testbed") - result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) - print(result) - - subprocess.run("git apply --allow-empty -v logs/patch.diff", shell=True, capture_output=True, text=True) + return preds - async def main(): - swe_dataset = load_swebench_dataset('princeton-nlp/SWE-bench', split='test') - swe_dataset_preprocessed = load_swebench_dataset('princeton-nlp/SWE-bench_bm25_13K', split='test') - test_data = swe_dataset[:1] - test_data_preprocessed = swe_dataset_preprocessed[:1] + swe_dataset = load_swebench_dataset( + 'princeton-nlp/SWE-bench', split='test') + swe_dataset_preprocessed = load_swebench_dataset( + 'princeton-nlp/SWE-bench_bm25_13K', split='test') + test_data = swe_dataset[:1] + test_data_preprocessed = swe_dataset_preprocessed[:1] assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] filepath = Path("SWE-bench_testsample") if filepath.exists(): @@ -117,11 +93,19 @@ async def main(): dataset = Dataset.load_from_disk(filepath) else: dataset = download_instances(test_data, filepath) - - cognee_preds, cognee_preds_dataset = await get_preds(dataset, with_cognee=True) + + cognee_preds = await get_preds(dataset, with_cognee=True) # nocognee_preds = await get_preds(dataset, with_cognee=False) - test_specs = list(map(make_test_spec, test_data)) - results = await evaluate(test_specs, cognee_preds) + with open("withcognee.json", "w") as file: + json.dump(cognee_preds, file) + + subprocess.run(["python", "-m", "swebench.harness.run_evaluation", + "--dataset_name", 'princeton-nlp/SWE-bench', + "--split", "test", + "--predictions_path", "withcognee.json", + "--max_workers", "1", + "--instance_ids", test_data[0]["instance_id"], + "--run_id", "with_cognee"]) if __name__ == "__main__": import asyncio diff --git a/evals/eval_utils.py b/evals/eval_utils.py new file mode 100644 index 
000000000..1c2785737 --- /dev/null +++ b/evals/eval_utils.py @@ -0,0 +1,107 @@ +import json +import logging +import os +import traceback +from copy import deepcopy +from pathlib import Path +from tempfile import TemporaryDirectory + +import unidiff +from datasets import Dataset +from swebench.inference.make_datasets.create_instance import make_code_text +from swebench.inference.make_datasets.utils import (AutoContextManager, + ingest_directory_contents) +from tqdm.auto import tqdm + + +def ingest_files(filenames): + files_dict = dict() + for filename in filenames: + with open(filename) as f: + content = f.read() + files_dict[filename] = content + return files_dict + + +def ingest_repos(input_instances): + orig_dir = os.getcwd() + with TemporaryDirectory( + dir="/scratch" if os.path.exists("/scratch") else "/tmp" + ) as root_dir: + for instance in tqdm( + input_instances.values(), + total=len(input_instances), + desc="Downloading repos on specific commits", + ): + try: + with AutoContextManager( + instance, root_dir + ) as cm: + readmes = cm.get_readme_files() + instance["readmes"] = ingest_files(readmes) + instance["file_contents"] = ingest_directory_contents( + cm.repo_path + ) + finally: + # if AutoContextManager fails to exit properly future exits will return the wrong directory + os.chdir(orig_dir) + + return input_instances + + +def extract_fields(instance): + readmes_text = make_code_text(instance["readmes"]) + code_text = make_code_text( + instance["file_contents"], add_line_numbers=False) + + text_inputs = "\n".join([readmes_text, code_text]) + text_inputs = text_inputs.strip() + "\n\n" + # text_inputs = code_text + patch = "\n".join([f"", instance["patch"], ""]) + return {**instance, "text": text_inputs, "patch": patch} + + +def create_dataset(input_instances): + columns = [ + "instance_id", + "text", + "repo", + "base_commit", + "problem_statement", + "hints_text", + "created_at", + "patch", + "test_patch", + "version", + "FAIL_TO_PASS", + "PASS_TO_PASS", 
+ "environment_setup_commit", + ] + + data_table = {key: list() for key in columns} + for instance in input_instances.values(): + datum = extract_fields(instance) + for key in columns: + data_table[key].append(datum[key] if key in datum else "") + dataset = Dataset.from_dict(data_table) + + return dataset + + +def download_instances( + input_data, + path=Path("SWE-bench_testsample"), + verbose=False, +): + """Downloads code from github. + + Args: + - input_data: dictionary with unprocessed input instances. + - verbose: set ContextManager verbose to True + """ + input_instances = {x["instance_id"]: x for x in input_data} + input_instances_copy = deepcopy(input_instances) + input_instances_with_text = ingest_repos(input_instances_copy) + dataset = create_dataset(input_instances_with_text) + dataset.save_to_disk(path) + return dataset From d986e7c981fc7cb2cfb1534c94b93e28dba0dea1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Mon, 18 Nov 2024 15:59:18 +0100 Subject: [PATCH 07/10] minor code cleanup --- evals/eval_utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/evals/eval_utils.py b/evals/eval_utils.py index 1c2785737..e4f070ffd 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -1,12 +1,8 @@ -import json -import logging import os -import traceback from copy import deepcopy from pathlib import Path from tempfile import TemporaryDirectory -import unidiff from datasets import Dataset from swebench.inference.make_datasets.create_instance import make_code_text from swebench.inference.make_datasets.utils import (AutoContextManager, From 838d98238a8b4869d1e4aaf24f461902a17d04bc Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 19 Nov 2024 13:32:35 +0100 Subject: [PATCH 08/10] Code cleanup --- evals/deepeval_on_swe_bench.py | 174 --------------------------------- evals/eval_swe_bench.py | 75 ++++++++------ evals/eval_utils.py | 2 +- 3 files changed, 44 insertions(+), 207 deletions(-) delete mode 100644 evals/deepeval_on_swe_bench.py diff 
--git a/evals/deepeval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py deleted file mode 100644 index 8cb94abb3..000000000 --- a/evals/deepeval_on_swe_bench.py +++ /dev/null @@ -1,174 +0,0 @@ -from cognee.infrastructure.databases.vector import get_vector_engine -from cognee.base_config import get_base_config -import os -import logging -from cognee.infrastructure.llm.get_llm_client import get_llm_client -from typing import List, Dict, Type -from swebench.harness.utils import load_swebench_dataset -from deepeval.dataset import EvaluationDataset -from deepeval.test_case import LLMTestCase -from pydantic import BaseModel - -from deepeval.synthesizer import Synthesizer - - -# DeepEval dataset for reference -# synthesizer = Synthesizer() -# synthesizer.generate_goldens_from_docs( -# document_paths=['/app/.data/short_stories/soldiers_home.pdf'], -# include_expected_output=True -# ) - -def convert_swe_to_deepeval(swe_dataset: List[Dict]): - deepeval_dataset = EvaluationDataset() - for datum in swe_dataset: - input = datum["problem_statement"] - expected_output = datum["patch"] - context = [datum["text"]] - # retrieval_context = datum.get(retrieval_context_key_name) - - deepeval_dataset.add_test_case( - LLMTestCase( - input=input, - actual_output=None, - expected_output=expected_output, - context=context, - # retrieval_context=retrieval_context, - ) - ) - return deepeval_dataset - - -swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') -deepeval_dataset = convert_swe_to_deepeval(swe_dataset) - - -logger = logging.getLogger(__name__) - - -class AnswerModel(BaseModel): - response: str - - -def get_answer_base(content: str, context: str, response_model: Type[BaseModel]): - llm_client = get_llm_client() - - system_prompt = "THIS IS YOUR CONTEXT:" + str(context) - - return llm_client.create_structured_output(content, system_prompt, response_model) - - -def get_answer(content: str, context, model: Type[BaseModel] = AnswerModel): - - try: - 
return (get_answer_base( - content, - context, - model - )) - except Exception as error: - logger.error( - "Error extracting cognitive layers from content: %s", error, exc_info=True) - raise error - - -async def run_cognify_base_rag(): - from cognee.api.v1.add import add - from cognee.api.v1.prune import prune - from cognee.api.v1.cognify.cognify import cognify - - await prune.prune_system() - - await add("data://test_datasets", "initial_test") - - graph = await cognify("initial_test") - pass - - -async def cognify_search_base_rag(content: str, context: str): - base_config = get_base_config() - - cognee_directory_path = os.path.abspath(".cognee_system") - base_config.system_root_directory = cognee_directory_path - - vector_engine = get_vector_engine() - - return_ = await vector_engine.search(collection_name="basic_rag", query_text=content, limit=10) - - print("results", return_) - return return_ - - -async def cognify_search_graph(content: str, context: str): - from cognee.api.v1.search import search, SearchType - params = {'query': 'Donald Trump'} - - results = await search(SearchType.INSIGHTS, params) - print("results", results) - return results - - -def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLMTestCase]: - test_cases = [] - for case in test_cases_raw: - test_case = LLMTestCase( - input=case.input, - # Generate actual output using the 'input' and 'additional_metadata' - actual_output=str(get_answer( - case.input, case.context).model_dump()['response']), - expected_output=case.expected_output, - context=case.context, - retrieval_context=["retrieval_context"], - ) - test_cases.append(test_case) - return test_cases - - -def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]): - deepeval_dataset = EvaluationDataset() - for datum in swe_dataset[:4]: - input = datum["problem_statement"] - expected_output = datum["patch"] - context = [datum["text"]] - # retrieval_context = datum.get(retrieval_context_key_name) - # tools_called = 
datum.get(tools_called_key_name) - # expected_tools = json_obj.get(expected_tools_key_name) - - deepeval_dataset.add_test_case( - LLMTestCase( - input=input, - actual_output=str(get_answer( - input, context).model_dump()['response']), - expected_output=expected_output, - context=context, - # retrieval_context=retrieval_context, - # tools_called=tools_called, - # expected_tools=expected_tools, - ) - ) - return deepeval_dataset - - -swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') -test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - -if __name__ == "__main__": - - import asyncio - - async def main(): - # await run_cognify_base_rag() - # await cognify_search_base_rag("show_all_processes", "context") - await cognify_search_graph("show_all_processes", "context") - asyncio.run(main()) - # run_cognify_base_rag_and_search() - # # Data preprocessing before setting the dataset test cases - swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') - test_dataset = convert_swe_to_deepeval_testcases(swe_dataset) - from deepeval.metrics import HallucinationMetric - metric = HallucinationMetric() - evalresult = test_dataset.evaluate([metric]) - pass diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 2cb221576..c0ab6d67e 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -1,7 +1,9 @@ +import argparse import json import subprocess from pathlib import Path +from datasets import Dataset from swebench.harness.utils import load_swebench_dataset from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE @@ -13,19 +15,20 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client from evals.eval_utils import download_instances -async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): +async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): + await cognee.prune.prune_data() await 
cognee.prune.prune_system(metadata=True) dataset_name = "SWE_test_data" - code_text = dataset[0]["text"] + code_text = instance["text"] await cognee.add([code_text], dataset_name) await code_graph_pipeline([dataset_name]) graph_engine = await get_graph_engine() with open(graph_engine.filename, "r") as f: graph_str = f.read() - problem_statement = dataset[0]['problem_statement'] + problem_statement = instance['problem_statement'] instructions = ( "I need you to solve this issue by looking at the provided knowledge graph and by " + "generating a single patch file that I can apply directly to this repository " @@ -51,9 +54,9 @@ async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS): return answer_prediction -async def llm_on_preprocessed_data(dataset): - problem_statement = dataset[0]['problem_statement'] - prompt = dataset[0]["text"] +async def generate_patch_without_cognee(instance): + problem_statement = instance['problem_statement'] + prompt = instance["text"] llm_client = get_llm_client() answer_prediction = llm_client.create_structured_output( @@ -66,46 +69,54 @@ async def llm_on_preprocessed_data(dataset): async def get_preds(dataset, with_cognee=True): if with_cognee: - text_output = await cognee_and_llm(dataset) model_name = "with_cognee" + pred_func = generate_patch_with_cognee else: - text_output = await llm_on_preprocessed_data(dataset) model_name = "without_cognee" + pred_func = generate_patch_without_cognee - preds = [{"instance_id": dataset[0]["instance_id"], - "model_patch": text_output, - "model_name_or_path": model_name}] + preds = [{"instance_id": instance["instance_id"], + "model_patch": await pred_func(instance), + "model_name_or_path": model_name} for instance in dataset] return preds async def main(): - swe_dataset = load_swebench_dataset( - 'princeton-nlp/SWE-bench', split='test') - swe_dataset_preprocessed = load_swebench_dataset( - 'princeton-nlp/SWE-bench_bm25_13K', split='test') - test_data = swe_dataset[:1] - 
test_data_preprocessed = swe_dataset_preprocessed[:1] - assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"] - filepath = Path("SWE-bench_testsample") - if filepath.exists(): - from datasets import Dataset - dataset = Dataset.load_from_disk(filepath) - else: - dataset = download_instances(test_data, filepath) + parser = argparse.ArgumentParser( + description="Run LLM predictions on SWE-bench dataset") + parser.add_argument('--cognee_off', action='store_true') + args = parser.parse_args() - cognee_preds = await get_preds(dataset, with_cognee=True) - # nocognee_preds = await get_preds(dataset, with_cognee=False) - with open("withcognee.json", "w") as file: - json.dump(cognee_preds, file) + if args.cognee_off: + dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K' + dataset = load_swebench_dataset(dataset_name, split='test') + predictions_path = "preds_nocognee.json" + if Path(predictions_path).exists(): + with open(predictions_path, "r") as file: + preds = json.load(file) + else: + preds = await get_preds(dataset, with_cognee=False) + with open(predictions_path, "w") as file: + json.dump(preds, file) + else: + dataset_name = 'princeton-nlp/SWE-bench_Lite' + swe_dataset = load_swebench_dataset( + dataset_name, split='test')[:1] + filepath = Path("SWE-bench_testsample") + if filepath.exists(): + dataset = Dataset.load_from_disk(filepath) + else: + dataset = download_instances(swe_dataset, filepath) + predictions_path = "preds.json" + preds = await get_preds(dataset, with_cognee=not args.cognee_off) subprocess.run(["python", "-m", "swebench.harness.run_evaluation", - "--dataset_name", 'princeton-nlp/SWE-bench', + "--dataset_name", dataset_name, "--split", "test", - "--predictions_path", "withcognee.json", + "--predictions_path", predictions_path, "--max_workers", "1", - "--instance_ids", test_data[0]["instance_id"], - "--run_id", "with_cognee"]) + "--run_id", "test_run"]) if __name__ == "__main__": import asyncio diff --git 
a/evals/eval_utils.py b/evals/eval_utils.py index e4f070ffd..e95a84cec 100644 --- a/evals/eval_utils.py +++ b/evals/eval_utils.py @@ -53,7 +53,7 @@ def extract_fields(instance): text_inputs = "\n".join([readmes_text, code_text]) text_inputs = text_inputs.strip() + "\n\n" # text_inputs = code_text - patch = "\n".join([f"<patch>", instance["patch"], "</patch>"]) + patch = "\n".join(["<patch>", instance["patch"], "</patch>"]) return {**instance, "text": text_inputs, "patch": patch} From 2948089806d4465dc7cae264e0f4951cb6af2230 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Tue, 19 Nov 2024 14:07:53 +0100 Subject: [PATCH 09/10] Read patch generation instructions from file --- .../infrastructure/llm/prompts/patch_gen_instructions.txt | 3 +++ evals/eval_swe_bench.py | 8 ++------ 2 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 cognee/infrastructure/llm/prompts/patch_gen_instructions.txt diff --git a/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt new file mode 100644 index 000000000..1553753ab --- /dev/null +++ b/cognee/infrastructure/llm/prompts/patch_gen_instructions.txt @@ -0,0 +1,3 @@ +I need you to solve this issue by looking at the provided knowledge graph and +generating a single patch file that I can apply directly to this repository using git apply. +Please respond with a single patch file in the following format.
\ No newline at end of file diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index c0ab6d67e..e5b8643c1 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -12,6 +12,7 @@ from cognee.api.v1.cognify.code_graph_pipeline import code_graph_pipeline from cognee.api.v1.search import SearchType from cognee.infrastructure.databases.graph import get_graph_engine from cognee.infrastructure.llm.get_llm_client import get_llm_client +from cognee.infrastructure.llm.prompts import read_query_prompt from evals.eval_utils import download_instances @@ -29,12 +30,7 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): graph_str = f.read() problem_statement = instance['problem_statement'] - instructions = ( - "I need you to solve this issue by looking at the provided knowledge graph and by " - + "generating a single patch file that I can apply directly to this repository " - + "using git apply. Please respond with a single patch " - + "file in the following format." 
- ) + instructions = read_query_prompt("patch_gen_instructions.txt") prompt = "\n".join([ instructions, From e1d8f3ea8658713ee549cdc215fd2a8bcc34d1bd Mon Sep 17 00:00:00 2001 From: Rita Aleksziev Date: Wed, 20 Nov 2024 16:02:15 +0100 Subject: [PATCH 10/10] use acreate_structured_output instead of create_structured_output in eval script --- evals/eval_swe_bench.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index e5b8643c1..ec93bda07 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -42,7 +42,7 @@ async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS): ]) llm_client = get_llm_client() - answer_prediction = llm_client.create_structured_output( + answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, @@ -55,7 +55,7 @@ async def generate_patch_without_cognee(instance): prompt = instance["text"] llm_client = get_llm_client() - answer_prediction = llm_client.create_structured_output( + answer_prediction = await llm_client.acreate_structured_output( text_input=problem_statement, system_prompt=prompt, response_model=str, @@ -88,10 +88,7 @@ async def main(): dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K' dataset = load_swebench_dataset(dataset_name, split='test') predictions_path = "preds_nocognee.json" - if Path(predictions_path).exists(): - with open(predictions_path, "r") as file: - preds = json.load(file) - else: + if not Path(predictions_path).exists(): preds = await get_preds(dataset, with_cognee=False) with open(predictions_path, "w") as file: json.dump(preds, file) @@ -106,6 +103,8 @@ async def main(): dataset = download_instances(swe_dataset, filepath) predictions_path = "preds.json" preds = await get_preds(dataset, with_cognee=not args.cognee_off) + with open(predictions_path, "w") as file: + json.dump(preds, file) subprocess.run(["python", "-m", 
"swebench.harness.run_evaluation", "--dataset_name", dataset_name,