Individual baseline runs

2025-05-13 14:33:40 +02:00
3 changed files with 217 additions and 26 deletions
--- a/cognee/eval_framework/eval_config.py
+++ b/cognee/eval_framework/eval_config.py
@ -1,6 +1,6 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing import List
+from typing import List, Optional


 class EvalConfig(BaseSettings):
@ -43,6 +43,9 @@ class EvalConfig(BaseSettings):
    dashboard_path: str = "dashboard.html"
    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
+    name_of_answers_file: str = "not_set.json"
+    name_of_html: str = "not_set.html"
+    instance_filter: Optional[List[str]] = None

    model_config = SettingsConfigDict(env_file=".env", extra="allow")

@ -68,6 +71,9 @@ class EvalConfig(BaseSettings):
            "task_getter_type": self.task_getter_type,
            "direct_llm_system_prompt": self.direct_llm_system_prompt,
            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
+            "name_of_answers_file": self.name_of_answers_file,
+            "name_of_html": self.name_of_html,
+            "instance_filter": self.instance_filter,
        }


--- a/cognee/eval_framework/metrics_dashboard.py
+++ b/cognee/eval_framework/metrics_dashboard.py
@ -167,4 +167,4 @@ def create_dashboard(
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dashboard_html)

-    return output_file
+    return dashboard_html
--- a/cognee/eval_framework/modal_run_eval.py
+++ b/cognee/eval_framework/modal_run_eval.py
@ -10,8 +10,10 @@ from cognee.eval_framework.answer_generation.run_question_answering_module impor
    run_question_answering,
 )
 from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.metrics_dashboard import create_dashboard

 logger = get_logger()
+vol = modal.Volume.from_name("baseline_results", create_if_missing=True)


 def read_and_combine_metrics(eval_params: dict) -> dict:
@ -51,8 +53,8 @@ image = (
 )


-@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
-async def modal_run_eval(eval_params=None):
+@app.function(image=image, concurrency_limit=6, timeout=86400, retries=1, volumes={"/data": vol})
+async def modal_run_eval2(eval_params=None):
    """Runs evaluation pipeline and returns combined metrics results."""
    if eval_params is None:
        eval_params = EvalConfig().to_dict()
@ -60,9 +62,23 @@ async def modal_run_eval(eval_params=None):
    logger.info(f"Running evaluation with params: {eval_params}")

    # Run the evaluation pipeline
-    await run_corpus_builder(eval_params)
+    await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
    await run_question_answering(eval_params)
-    await run_evaluation(eval_params)
+    answers = await run_evaluation(eval_params)
+    with open("/data/" + (str)(eval_params.get("name_of_answers_file")), "w") as f:
+        json.dump(answers, f, ensure_ascii=False, indent=4)
+    vol.commit()
+    if eval_params.get("dashboard"):
+        logger.info("Generating dashboard...")
+        html = create_dashboard(
+            metrics_path=eval_params["metrics_path"],
+            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+        with open("/data/" + (str)(eval_params.get("name_of_html")), "w") as f:
+            f.write(html)
+        vol.commit()

    # Early return if metrics calculation wasn't requested
    if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
@ -71,7 +87,9 @@ async def modal_run_eval(eval_params=None):
        )
        return None

-    return read_and_combine_metrics(eval_params)
+    logger.info("Everything finished...")
+
+    return True


@app.local_entrypoint()
@ -80,37 +98,204 @@ async def main():
    configs = [
        EvalConfig(
            task_getter_type="Default",
-            number_of_samples_in_corpus=2,
+            benchmark="HotPotQA",
+            number_of_samples_in_corpus=24,
            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_train3.json",
+            name_of_html="Hotpot_train3.html",
+            instance_filter=[
+                "5a8e341c5542995085b373d6",
+                "5ab2a308554299340b52553b",
+                "5a79c1095542996c55b2dc62",
+                "5a8c52685542995e66a475bb",
+                "5a734dad5542994cef4bc522",
+                "5a74ab6055429916b01641b9",
+                "5ae2661d554299495565da60",
+                "5a88dcf9554299206df2b383",
+                "5ab8179f5542990e739ec817",
+                "5a812d0555429938b61422e1",
+                "5a79be0e5542994f819ef084",
+                "5a875b755542996e4f308796",
+                "5ae675245542991bbc9760dc",
+                "5ab819065542995dae37ea3c",
+                "5a74d64055429916b0164223",
+                "5abfea825542994516f45527",
+                "5ac279345542990b17b153b0",
+                "5ab3c48755429969a97a81b8",
+                "5adf35935542993344016c36",
+                "5a83d0845542996488c2e4e6",
+                "5a7af32e55429931da12c99c",
+                "5a7c9ead5542990527d554e4",
+                "5ae12aa6554299422ee99617",
+                "5a710a915542994082a3e504",
+            ],
        ),
        EvalConfig(
            task_getter_type="Default",
-            number_of_samples_in_corpus=10,
+            number_of_samples_in_corpus=12,
+            benchmark="HotPotQA",
            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
            answering_questions=True,
            evaluating_answers=True,
            calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_test3.json",
+            name_of_html="Hotpot_test3.html",
+            instance_filter=[
+                "5ae27df25542992decbdcd2a",
+                "5a72224755429971e9dc92be",
+                "5a8900c75542997e5c09a6ed",
+                "5ae1412a55429920d523434c",
+                "5ab2342a5542993be8fa98c3",
+                "5adde2475542997545bbbdc1",
+                "5ac434cb5542997ea680ca2f",
+                "5a8aed1755429950cd6afbf1",
+                "5ae328f45542991a06ce993c",
+                "5ae17f1e5542990adbacf7a6",
+                "5ac42f42554299076e296d88",
+                "5ab7484c5542992aa3b8c80d",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_train3.json",
+            name_of_html="TwoWiki_train3.html",
+            instance_filter=[
+                "37af9394085111ebbd58ac1f6bf848b6",
+                "2102541508ac11ebbd82ac1f6bf848b6",
+                "b249aa840bdc11eba7f7acde48001122",
+                "feb4b9dc0bdb11eba7f7acde48001122",
+                "13d3552e0bde11eba7f7acde48001122",
+                "cc6c68e4096511ebbdafac1f6bf848b6",
+                "10776f4508a211ebbd7aac1f6bf848b6",
+                "c096ef9e086d11ebbd62ac1f6bf848b6",
+                "20c7b59608db11ebbd9cac1f6bf848b6",
+                "7f3724780baf11ebab90acde48001122",
+                "482773fc0baf11ebab90acde48001122",
+                "e519fa3c0bae11ebab90acde48001122",
+                "d956416e086711ebbd5eac1f6bf848b6",
+                "89024aba08a411ebbd7dac1f6bf848b6",
+                "19a3ad5008c811ebbd91ac1f6bf848b6",
+                "ee484526089f11ebbd78ac1f6bf848b6",
+                "53625784086511ebbd5eac1f6bf848b6",
+                "f02d1c2208b811ebbd88ac1f6bf848b6",
+                "a2f105fa088511ebbd6dac1f6bf848b6",
+                "52618be00bb011ebab90acde48001122",
+                "ec70a8a208a311ebbd7cac1f6bf848b6",
+                "42b3c0b80bde11eba7f7acde48001122",
+                "c807422a0bda11eba7f7acde48001122",
+                "4e7c40ed08ea11ebbda7ac1f6bf848b6",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_test3.json",
+            name_of_html="TwoWiki_test3.html",
+            instance_filter=[
+                "5211d89a095011ebbdaeac1f6bf848b6",
+                "fe105e54089411ebbd75ac1f6bf848b6",
+                "bd6f350408d311ebbd96ac1f6bf848b6",
+                "57f2630e08ae11ebbd83ac1f6bf848b6",
+                "8d9cf88009b311ebbdb0ac1f6bf848b6",
+                "eafb6d960bae11ebab90acde48001122",
+                "45153f740bdb11eba7f7acde48001122",
+                "385457c20bde11eba7f7acde48001122",
+                "45a16d5a0bdb11eba7f7acde48001122",
+                "7253afc808c711ebbd91ac1f6bf848b6",
+                "d03449820baf11ebab90acde48001122",
+                "0ea215140bdd11eba7f7acde48001122",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_train3.json",
+            name_of_html="Musique_train3.html",
+            instance_filter=[
+                "2hop__374495_68633",
+                "2hop__735014_83837",
+                "2hop__108158_83769",
+                "2hop__92051_827343",
+                "2hop__55552_158105",
+                "2hop__81825_49084",
+                "2hop__91667_81007",
+                "2hop__696442_51329",
+                "3hop1__516535_834494_34099",
+                "3hop1__57186_237521_291682",
+                "3hop1__475351_160713_77246",
+                "3hop2__304722_397371_63959",
+                "3hop1__135392_87694_64412",
+                "3hop1__354480_834494_33939",
+                "3hop1__446612_160545_34751",
+                "3hop1__232315_831637_91775",
+                "3hop2__222979_132536_40768",
+                "3hop2__304722_330033_63959",
+                "3hop1__488744_443779_52195",
+                "3hop1__146155_131905_41948",
+                "4hop1__788226_32392_823060_610794",
+                "4hop1__236903_153080_33897_81096",
+                "4hop1__199881_378185_282674_759393",
+                "4hop1__726391_153080_33952_34109",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_test3.json",
+            name_of_html="Musique_test3.html",
+            instance_filter=[
+                "2hop__272714_113442",
+                "2hop__6827_49664",
+                "2hop__24648_192417",
+                "2hop__85958_87295",
+                "3hop2__222979_840908_40768",
+                "3hop1__640171_228453_10972",
+                "3hop1__92991_78276_68042",
+                "3hop1__147162_131905_41948",
+                "4hop1__813171_153080_159767_81096",
+                "4hop1__726391_153080_33952_33939",
+                "4hop1__707078_765799_282674_759393",
+                "4hop1__408432_32392_823060_610794",
+            ],
        ),
    ]

    # Run evaluations in parallel with different configurations
-    modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
-    results = await asyncio.gather(*modal_tasks)
-
-    # Filter out None results and save combined results
-    results = [r for r in results if r is not None]
-    if results:
-        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_file = f"combined_results_{timestamp}.json"
-
-        with open(output_file, "w") as f:
-            json.dump(results, f, indent=2)
-
-        logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
-    else:
-        logger.info("No metrics were collected from any of the evaluation runs")
+    modal_tasks = [modal_run_eval2.remote.aio(config.to_dict()) for config in configs]
+    await asyncio.gather(*modal_tasks)