diff --git a/evals/deepeval_on_swe_bench.py b/evals/deepeval_on_swe_bench.py
deleted file mode 100644
index 8cb94abb3..000000000
--- a/evals/deepeval_on_swe_bench.py
+++ /dev/null
@@ -1,174 +0,0 @@
-from cognee.infrastructure.databases.vector import get_vector_engine
-from cognee.base_config import get_base_config
-import os
-import logging
-from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from typing import List, Dict, Type
-from swebench.harness.utils import load_swebench_dataset
-from deepeval.dataset import EvaluationDataset
-from deepeval.test_case import LLMTestCase
-from pydantic import BaseModel
-
-from deepeval.synthesizer import Synthesizer
-
-
-# DeepEval dataset for reference
-# synthesizer = Synthesizer()
-# synthesizer.generate_goldens_from_docs(
-#     document_paths=['/app/.data/short_stories/soldiers_home.pdf'],
-#     include_expected_output=True
-# )
-
-def convert_swe_to_deepeval(swe_dataset: List[Dict]):
-    deepeval_dataset = EvaluationDataset()
-    for datum in swe_dataset:
-        input = datum["problem_statement"]
-        expected_output = datum["patch"]
-        context = [datum["text"]]
-        # retrieval_context = datum.get(retrieval_context_key_name)
-
-        deepeval_dataset.add_test_case(
-            LLMTestCase(
-                input=input,
-                actual_output=None,
-                expected_output=expected_output,
-                context=context,
-                # retrieval_context=retrieval_context,
-            )
-        )
-    return deepeval_dataset
-
-
-swe_dataset = load_swebench_dataset(
-    'princeton-nlp/SWE-bench_bm25_13K', split='test')
-deepeval_dataset = convert_swe_to_deepeval(swe_dataset)
-
-
-logger = logging.getLogger(__name__)
-
-
-class AnswerModel(BaseModel):
-    response: str
-
-
-def get_answer_base(content: str, context: str, response_model: Type[BaseModel]):
-    llm_client = get_llm_client()
-
-    system_prompt = "THIS IS YOUR CONTEXT:" + str(context)
-
-    return llm_client.create_structured_output(content, system_prompt, response_model)
-
-
-def get_answer(content: str, context, model: Type[BaseModel] = AnswerModel):
-
-    try:
-        return (get_answer_base(
-            content,
-            context,
-            model
-        ))
-    except Exception as error:
-        logger.error(
-            "Error extracting cognitive layers from content: %s", error, exc_info=True)
-        raise error
-
-
-async def run_cognify_base_rag():
-    from cognee.api.v1.add import add
-    from cognee.api.v1.prune import prune
-    from cognee.api.v1.cognify.cognify import cognify
-
-    await prune.prune_system()
-
-    await add("data://test_datasets", "initial_test")
-
-    graph = await cognify("initial_test")
-    pass
-
-
-async def cognify_search_base_rag(content: str, context: str):
-    base_config = get_base_config()
-
-    cognee_directory_path = os.path.abspath(".cognee_system")
-    base_config.system_root_directory = cognee_directory_path
-
-    vector_engine = get_vector_engine()
-
-    return_ = await vector_engine.search(collection_name="basic_rag", query_text=content, limit=10)
-
-    print("results", return_)
-    return return_
-
-
-async def cognify_search_graph(content: str, context: str):
-    from cognee.api.v1.search import search, SearchType
-    params = {'query': 'Donald Trump'}
-
-    results = await search(SearchType.INSIGHTS, params)
-    print("results", results)
-    return results
-
-
-def convert_goldens_to_test_cases(test_cases_raw: List[LLMTestCase]) -> List[LLMTestCase]:
-    test_cases = []
-    for case in test_cases_raw:
-        test_case = LLMTestCase(
-            input=case.input,
-            # Generate actual output using the 'input' and 'additional_metadata'
-            actual_output=str(get_answer(
-                case.input, case.context).model_dump()['response']),
-            expected_output=case.expected_output,
-            context=case.context,
-            retrieval_context=["retrieval_context"],
-        )
-        test_cases.append(test_case)
-    return test_cases
-
-
-def convert_swe_to_deepeval_testcases(swe_dataset: List[Dict]):
-    deepeval_dataset = EvaluationDataset()
-    for datum in swe_dataset[:4]:
-        input = datum["problem_statement"]
-        expected_output = datum["patch"]
-        context = [datum["text"]]
-        # retrieval_context = datum.get(retrieval_context_key_name)
-        # tools_called = datum.get(tools_called_key_name)
-        # expected_tools = json_obj.get(expected_tools_key_name)
-
-        deepeval_dataset.add_test_case(
-            LLMTestCase(
-                input=input,
-                actual_output=str(get_answer(
-                    input, context).model_dump()['response']),
-                expected_output=expected_output,
-                context=context,
-                # retrieval_context=retrieval_context,
-                # tools_called=tools_called,
-                # expected_tools=expected_tools,
-            )
-        )
-    return deepeval_dataset
-
-
-swe_dataset = load_swebench_dataset(
-    'princeton-nlp/SWE-bench_bm25_13K', split='test')
-test_dataset = convert_swe_to_deepeval_testcases(swe_dataset)
-
-if __name__ == "__main__":
-
-    import asyncio
-
-    async def main():
-        # await run_cognify_base_rag()
-        # await cognify_search_base_rag("show_all_processes", "context")
-        await cognify_search_graph("show_all_processes", "context")
-    asyncio.run(main())
-    # run_cognify_base_rag_and_search()
-    # # Data preprocessing before setting the dataset test cases
-    swe_dataset = load_swebench_dataset(
-        'princeton-nlp/SWE-bench_bm25_13K', split='test')
-    test_dataset = convert_swe_to_deepeval_testcases(swe_dataset)
-    from deepeval.metrics import HallucinationMetric
-    metric = HallucinationMetric()
-    evalresult = test_dataset.evaluate([metric])
-    pass
diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py
index 2cb221576..c0ab6d67e 100644
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@@ -1,7 +1,9 @@
+import argparse
 import json
 import subprocess
 from pathlib import Path
 
+from datasets import Dataset
 from swebench.harness.utils import load_swebench_dataset
 from swebench.inference.make_datasets.create_instance import PATCH_EXAMPLE
 
@@ -13,19 +15,20 @@ from cognee.infrastructure.llm.get_llm_client import get_llm_client
 from evals.eval_utils import download_instances
 
 
-async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS):
+async def generate_patch_with_cognee(instance, search_type=SearchType.CHUNKS):
+    await cognee.prune.prune_data()
     await cognee.prune.prune_system(metadata=True)
 
     dataset_name = "SWE_test_data"
-    code_text = dataset[0]["text"]
+    code_text = instance["text"]
     await cognee.add([code_text], dataset_name)
     await code_graph_pipeline([dataset_name])
     graph_engine = await get_graph_engine()
     with open(graph_engine.filename, "r") as f:
         graph_str = f.read()
 
-    problem_statement = dataset[0]['problem_statement']
+    problem_statement = instance['problem_statement']
     instructions = (
         "I need you to solve this issue by looking at the provided knowledge graph and by "
         + "generating a single patch file that I can apply directly to this repository "
@@ -51,9 +54,9 @@ async def cognee_and_llm(dataset, search_type=SearchType.CHUNKS):
     return answer_prediction
 
 
-async def llm_on_preprocessed_data(dataset):
-    problem_statement = dataset[0]['problem_statement']
-    prompt = dataset[0]["text"]
+async def generate_patch_without_cognee(instance):
+    problem_statement = instance['problem_statement']
+    prompt = instance["text"]
 
     llm_client = get_llm_client()
     answer_prediction = llm_client.create_structured_output(
@@ -66,46 +69,54 @@ async def llm_on_preprocessed_data(dataset):
 
 async def get_preds(dataset, with_cognee=True):
     if with_cognee:
-        text_output = await cognee_and_llm(dataset)
         model_name = "with_cognee"
+        pred_func = generate_patch_with_cognee
     else:
-        text_output = await llm_on_preprocessed_data(dataset)
         model_name = "without_cognee"
+        pred_func = generate_patch_without_cognee
 
-    preds = [{"instance_id": dataset[0]["instance_id"],
-              "model_patch": text_output,
-              "model_name_or_path": model_name}]
+    preds = [{"instance_id": instance["instance_id"],
+              "model_patch": await pred_func(instance),
+              "model_name_or_path": model_name} for instance in dataset]
 
     return preds
 
 
 async def main():
-    swe_dataset = load_swebench_dataset(
-        'princeton-nlp/SWE-bench', split='test')
-    swe_dataset_preprocessed = load_swebench_dataset(
-        'princeton-nlp/SWE-bench_bm25_13K', split='test')
-    test_data = swe_dataset[:1]
-    test_data_preprocessed = swe_dataset_preprocessed[:1]
-    assert test_data[0]["instance_id"] == test_data_preprocessed[0]["instance_id"]
-    filepath = Path("SWE-bench_testsample")
-    if filepath.exists():
-        from datasets import Dataset
-        dataset = Dataset.load_from_disk(filepath)
-    else:
-        dataset = download_instances(test_data, filepath)
+    parser = argparse.ArgumentParser(
+        description="Run LLM predictions on SWE-bench dataset")
+    parser.add_argument('--cognee_off', action='store_true')
+    args = parser.parse_args()
 
-    cognee_preds = await get_preds(dataset, with_cognee=True)
-    # nocognee_preds = await get_preds(dataset, with_cognee=False)
-    with open("withcognee.json", "w") as file:
-        json.dump(cognee_preds, file)
+    if args.cognee_off:
+        dataset_name = 'princeton-nlp/SWE-bench_Lite_bm25_13K'
+        dataset = load_swebench_dataset(dataset_name, split='test')
+        predictions_path = "preds_nocognee.json"
+        if Path(predictions_path).exists():
+            with open(predictions_path, "r") as file:
+                preds = json.load(file)
+        else:
+            preds = await get_preds(dataset, with_cognee=False)
+            with open(predictions_path, "w") as file:
+                json.dump(preds, file)
+    else:
+        dataset_name = 'princeton-nlp/SWE-bench_Lite'
+        swe_dataset = load_swebench_dataset(
+            dataset_name, split='test')[:1]
+        filepath = Path("SWE-bench_testsample")
+        if filepath.exists():
+            dataset = Dataset.load_from_disk(filepath)
+        else:
+            dataset = download_instances(swe_dataset, filepath)
+        predictions_path = "preds.json"
+        preds = await get_preds(dataset, with_cognee=not args.cognee_off)
 
     subprocess.run(["python", "-m", "swebench.harness.run_evaluation",
-                    "--dataset_name", 'princeton-nlp/SWE-bench',
+                    "--dataset_name", dataset_name,
                     "--split", "test",
-                    "--predictions_path", "withcognee.json",
+                    "--predictions_path", predictions_path,
                     "--max_workers", "1",
-                    "--instance_ids", test_data[0]["instance_id"],
-                    "--run_id", "with_cognee"])
+                    "--run_id", "test_run"])
 
 
 if __name__ == "__main__":
     import asyncio
diff --git a/evals/eval_utils.py b/evals/eval_utils.py
index e4f070ffd..e95a84cec 100644
--- a/evals/eval_utils.py
+++ b/evals/eval_utils.py
@@ -53,7 +53,7 @@ def extract_fields(instance):
     text_inputs = "\n".join([readmes_text, code_text])
     text_inputs = text_inputs.strip() + "\n\n"
     # text_inputs = code_text
-    patch = "\n".join([f"", instance["patch"], ""])
+    patch = "\n".join(["", instance["patch"], ""])
 
     return {**instance, "text": text_inputs, "patch": patch}