diff --git a/cognee/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py
index dbc32150f..a3087f4e8 100644
--- a/cognee/eval_framework/eval_config.py
+++ b/cognee/eval_framework/eval_config.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing import List
+from typing import List, Optional


 class EvalConfig(BaseSettings):
@@ -43,6 +43,9 @@ class EvalConfig(BaseSettings):
     dashboard_path: str = "dashboard.html"
     direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
     direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
+    name_of_answers_file: str = "not_set.json"
+    name_of_html: str = "not_set.html"
+    instance_filter: Optional[List[str]] = None

     model_config = SettingsConfigDict(env_file=".env", extra="allow")

@@ -68,6 +71,9 @@ class EvalConfig(BaseSettings):
             "task_getter_type": self.task_getter_type,
             "direct_llm_system_prompt": self.direct_llm_system_prompt,
             "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
+            "name_of_answers_file": self.name_of_answers_file,
+            "name_of_html": self.name_of_html,
+            "instance_filter": self.instance_filter,
         }


diff --git a/cognee/eval_framework/metrics_dashboard.py b/cognee/eval_framework/metrics_dashboard.py
index eb4d2ed8e..e7ae7c2aa 100644
--- a/cognee/eval_framework/metrics_dashboard.py
+++ b/cognee/eval_framework/metrics_dashboard.py
@@ -167,4 +167,4 @@ def create_dashboard(
     with open(output_file, "w", encoding="utf-8") as f:
         f.write(dashboard_html)

-    return output_file
+    return dashboard_html
diff --git a/cognee/eval_framework/modal_run_eval.py b/cognee/eval_framework/modal_run_eval.py
index b2cfd4065..408549e54 100644
--- a/cognee/eval_framework/modal_run_eval.py
+++ b/cognee/eval_framework/modal_run_eval.py
@@ -10,8 +10,10 @@ from cognee.eval_framework.answer_generation.run_question_answering_module impor
     run_question_answering,
 )
 from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.metrics_dashboard import create_dashboard

 logger = get_logger()
+vol = modal.Volume.from_name("baseline_results", create_if_missing=True)


 def read_and_combine_metrics(eval_params: dict) -> dict:
@@ -51,8 +53,8 @@ image = (
 )


-@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
-async def modal_run_eval(eval_params=None):
+@app.function(image=image, concurrency_limit=6, timeout=86400, retries=1, volumes={"/data": vol})
+async def modal_run_eval2(eval_params=None):
     """Runs evaluation pipeline and returns combined metrics results."""
     if eval_params is None:
         eval_params = EvalConfig().to_dict()
@@ -60,9 +62,23 @@ async def modal_run_eval(eval_params=None):
     logger.info(f"Running evaluation with params: {eval_params}")

     # Run the evaluation pipeline
-    await run_corpus_builder(eval_params)
+    await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
     await run_question_answering(eval_params)
-    await run_evaluation(eval_params)
+    answers = await run_evaluation(eval_params)
+    with open("/data/" + str(eval_params.get("name_of_answers_file")), "w") as f:
+        json.dump(answers, f, ensure_ascii=False, indent=4)
+    vol.commit()
+    if eval_params.get("dashboard"):
+        logger.info("Generating dashboard...")
+        html = create_dashboard(
+            metrics_path=eval_params["metrics_path"],
+            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+        with open("/data/" + str(eval_params.get("name_of_html")), "w") as f:
+            f.write(html)
+        vol.commit()

     # Early return if metrics calculation wasn't requested
     if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
@@ -71,7 +87,9 @@ async def modal_run_eval(eval_params=None):
         )
         return None

-    return read_and_combine_metrics(eval_params)
+    logger.info("Everything finished...")
+
+    return True


 @app.local_entrypoint()
@@ -80,37 +98,204 @@ async def main():
     configs = [
         EvalConfig(
             task_getter_type="Default",
-            number_of_samples_in_corpus=2,
+            benchmark="HotPotQA",
+            number_of_samples_in_corpus=24,
             building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
             answering_questions=True,
             evaluating_answers=True,
             calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_train3.json",
+            name_of_html="Hotpot_train3.html",
+            instance_filter=[
+                "5a8e341c5542995085b373d6",
+                "5ab2a308554299340b52553b",
+                "5a79c1095542996c55b2dc62",
+                "5a8c52685542995e66a475bb",
+                "5a734dad5542994cef4bc522",
+                "5a74ab6055429916b01641b9",
+                "5ae2661d554299495565da60",
+                "5a88dcf9554299206df2b383",
+                "5ab8179f5542990e739ec817",
+                "5a812d0555429938b61422e1",
+                "5a79be0e5542994f819ef084",
+                "5a875b755542996e4f308796",
+                "5ae675245542991bbc9760dc",
+                "5ab819065542995dae37ea3c",
+                "5a74d64055429916b0164223",
+                "5abfea825542994516f45527",
+                "5ac279345542990b17b153b0",
+                "5ab3c48755429969a97a81b8",
+                "5adf35935542993344016c36",
+                "5a83d0845542996488c2e4e6",
+                "5a7af32e55429931da12c99c",
+                "5a7c9ead5542990527d554e4",
+                "5ae12aa6554299422ee99617",
+                "5a710a915542994082a3e504",
+            ],
         ),
         EvalConfig(
             task_getter_type="Default",
-            number_of_samples_in_corpus=10,
+            number_of_samples_in_corpus=12,
+            benchmark="HotPotQA",
             building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
             answering_questions=True,
             evaluating_answers=True,
             calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_test3.json",
+            name_of_html="Hotpot_test3.html",
+            instance_filter=[
+                "5ae27df25542992decbdcd2a",
+                "5a72224755429971e9dc92be",
+                "5a8900c75542997e5c09a6ed",
+                "5ae1412a55429920d523434c",
+                "5ab2342a5542993be8fa98c3",
+                "5adde2475542997545bbbdc1",
+                "5ac434cb5542997ea680ca2f",
+                "5a8aed1755429950cd6afbf1",
+                "5ae328f45542991a06ce993c",
+                "5ae17f1e5542990adbacf7a6",
+                "5ac42f42554299076e296d88",
+                "5ab7484c5542992aa3b8c80d",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_train3.json",
+            name_of_html="TwoWiki_train3.html",
+            instance_filter=[
+                "37af9394085111ebbd58ac1f6bf848b6",
+                "2102541508ac11ebbd82ac1f6bf848b6",
+                "b249aa840bdc11eba7f7acde48001122",
+                "feb4b9dc0bdb11eba7f7acde48001122",
+                "13d3552e0bde11eba7f7acde48001122",
+                "cc6c68e4096511ebbdafac1f6bf848b6",
+                "10776f4508a211ebbd7aac1f6bf848b6",
+                "c096ef9e086d11ebbd62ac1f6bf848b6",
+                "20c7b59608db11ebbd9cac1f6bf848b6",
+                "7f3724780baf11ebab90acde48001122",
+                "482773fc0baf11ebab90acde48001122",
+                "e519fa3c0bae11ebab90acde48001122",
+                "d956416e086711ebbd5eac1f6bf848b6",
+                "89024aba08a411ebbd7dac1f6bf848b6",
+                "19a3ad5008c811ebbd91ac1f6bf848b6",
+                "ee484526089f11ebbd78ac1f6bf848b6",
+                "53625784086511ebbd5eac1f6bf848b6",
+                "f02d1c2208b811ebbd88ac1f6bf848b6",
+                "a2f105fa088511ebbd6dac1f6bf848b6",
+                "52618be00bb011ebab90acde48001122",
+                "ec70a8a208a311ebbd7cac1f6bf848b6",
+                "42b3c0b80bde11eba7f7acde48001122",
+                "c807422a0bda11eba7f7acde48001122",
+                "4e7c40ed08ea11ebbda7ac1f6bf848b6",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_test3.json",
+            name_of_html="TwoWiki_test3.html",
+            instance_filter=[
+                "5211d89a095011ebbdaeac1f6bf848b6",
+                "fe105e54089411ebbd75ac1f6bf848b6",
+                "bd6f350408d311ebbd96ac1f6bf848b6",
+                "57f2630e08ae11ebbd83ac1f6bf848b6",
+                "8d9cf88009b311ebbdb0ac1f6bf848b6",
+                "eafb6d960bae11ebab90acde48001122",
+                "45153f740bdb11eba7f7acde48001122",
+                "385457c20bde11eba7f7acde48001122",
+                "45a16d5a0bdb11eba7f7acde48001122",
+                "7253afc808c711ebbd91ac1f6bf848b6",
+                "d03449820baf11ebab90acde48001122",
+                "0ea215140bdd11eba7f7acde48001122",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_train3.json",
+            name_of_html="Musique_train3.html",
+            instance_filter=[
+                "2hop__374495_68633",
+                "2hop__735014_83837",
+                "2hop__108158_83769",
+                "2hop__92051_827343",
+                "2hop__55552_158105",
+                "2hop__81825_49084",
+                "2hop__91667_81007",
+                "2hop__696442_51329",
+                "3hop1__516535_834494_34099",
+                "3hop1__57186_237521_291682",
+                "3hop1__475351_160713_77246",
+                "3hop2__304722_397371_63959",
+                "3hop1__135392_87694_64412",
+                "3hop1__354480_834494_33939",
+                "3hop1__446612_160545_34751",
+                "3hop1__232315_831637_91775",
+                "3hop2__222979_132536_40768",
+                "3hop2__304722_330033_63959",
+                "3hop1__488744_443779_52195",
+                "3hop1__146155_131905_41948",
+                "4hop1__788226_32392_823060_610794",
+                "4hop1__236903_153080_33897_81096",
+                "4hop1__199881_378185_282674_759393",
+                "4hop1__726391_153080_33952_34109",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_test3.json",
+            name_of_html="Musique_test3.html",
+            instance_filter=[
+                "2hop__272714_113442",
+                "2hop__6827_49664",
+                "2hop__24648_192417",
+                "2hop__85958_87295",
+                "3hop2__222979_840908_40768",
+                "3hop1__640171_228453_10972",
+                "3hop1__92991_78276_68042",
+                "3hop1__147162_131905_41948",
+                "4hop1__813171_153080_159767_81096",
+                "4hop1__726391_153080_33952_33939",
+                "4hop1__707078_765799_282674_759393",
+                "4hop1__408432_32392_823060_610794",
+            ],
         ),
     ]

     # Run evaluations in parallel with different configurations
-    modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
-    results = await asyncio.gather(*modal_tasks)
-
-    # Filter out None results and save combined results
-    results = [r for r in results if r is not None]
-    if results:
-        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_file = f"combined_results_{timestamp}.json"
-
-        with open(output_file, "w") as f:
-            json.dump(results, f, indent=2)
-
-        logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
-    else:
-        logger.info("No metrics were collected from any of the evaluation runs")
+    modal_tasks = [modal_run_eval2.remote.aio(config.to_dict()) for config in configs]
+    await asyncio.gather(*modal_tasks)