cognee/cognee/eval_framework/modal_run_eval.py
2025-05-13 14:33:40 +02:00

301 lines
12 KiB
Python

import modal
import os
import json
import asyncio
import datetime
from cognee.shared.logging_utils import get_logger
from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from cognee.eval_framework.answer_generation.run_question_answering_module import (
run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard
logger = get_logger()
vol = modal.Volume.from_name("baseline_results", create_if_missing=True)
def read_and_combine_metrics(eval_params: dict) -> dict:
"""Read and combine metrics files into a single result dictionary."""
try:
with open(eval_params["metrics_path"], "r") as f:
metrics = json.load(f)
with open(eval_params["aggregate_metrics_path"], "r") as f:
aggregate_metrics = json.load(f)
return {
"task_getter_type": eval_params["task_getter_type"],
"number_of_samples": eval_params["number_of_samples_in_corpus"],
"metrics": metrics,
"aggregate_metrics": aggregate_metrics,
}
except (FileNotFoundError, json.JSONDecodeError) as e:
logger.error(f"Error reading metrics files: {e}")
return None
app = modal.App("modal-run-eval")
image = (
modal.Image.from_dockerfile(path="Dockerfile_modal", force_build=False)
.copy_local_file("pyproject.toml", "pyproject.toml")
.copy_local_file("poetry.lock", "poetry.lock")
.env(
{
"ENV": os.getenv("ENV"),
"LLM_API_KEY": os.getenv("LLM_API_KEY"),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
}
)
.poetry_install_from_file(poetry_pyproject_toml="pyproject.toml")
.pip_install("protobuf", "h2", "deepeval", "gdown", "plotly")
)
@app.function(image=image, concurrency_limit=6, timeout=86400, retries=1, volumes={"/data": vol})
async def modal_run_eval2(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
await run_question_answering(eval_params)
answers = await run_evaluation(eval_params)
with open("/data/" + (str)(eval_params.get("name_of_answers_file")), "w") as f:
json.dump(answers, f, ensure_ascii=False, indent=4)
vol.commit()
if eval_params.get("dashboard"):
logger.info("Generating dashboard...")
html = create_dashboard(
metrics_path=eval_params["metrics_path"],
aggregate_metrics_path=eval_params["aggregate_metrics_path"],
output_file=eval_params["dashboard_path"],
benchmark=eval_params["benchmark"],
)
with open("/data/" + (str)(eval_params.get("name_of_html")), "w") as f:
f.write(html)
vol.commit()
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
logger.info(
"Skipping metrics collection as either evaluating_answers or calculate_metrics is False"
)
return None
logger.info("Everything finished...")
return True
@app.local_entrypoint()
async def main():
# List of configurations to run
configs = [
EvalConfig(
task_getter_type="Default",
benchmark="HotPotQA",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Hotpot_train3.json",
name_of_html="Hotpot_train3.html",
instance_filter=[
"5a8e341c5542995085b373d6",
"5ab2a308554299340b52553b",
"5a79c1095542996c55b2dc62",
"5a8c52685542995e66a475bb",
"5a734dad5542994cef4bc522",
"5a74ab6055429916b01641b9",
"5ae2661d554299495565da60",
"5a88dcf9554299206df2b383",
"5ab8179f5542990e739ec817",
"5a812d0555429938b61422e1",
"5a79be0e5542994f819ef084",
"5a875b755542996e4f308796",
"5ae675245542991bbc9760dc",
"5ab819065542995dae37ea3c",
"5a74d64055429916b0164223",
"5abfea825542994516f45527",
"5ac279345542990b17b153b0",
"5ab3c48755429969a97a81b8",
"5adf35935542993344016c36",
"5a83d0845542996488c2e4e6",
"5a7af32e55429931da12c99c",
"5a7c9ead5542990527d554e4",
"5ae12aa6554299422ee99617",
"5a710a915542994082a3e504",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=12,
benchmark="HotPotQA",
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Hotpot_test3.json",
name_of_html="Hotpot_test3.html",
instance_filter=[
"5ae27df25542992decbdcd2a",
"5a72224755429971e9dc92be",
"5a8900c75542997e5c09a6ed",
"5ae1412a55429920d523434c",
"5ab2342a5542993be8fa98c3",
"5adde2475542997545bbbdc1",
"5ac434cb5542997ea680ca2f",
"5a8aed1755429950cd6afbf1",
"5ae328f45542991a06ce993c",
"5ae17f1e5542990adbacf7a6",
"5ac42f42554299076e296d88",
"5ab7484c5542992aa3b8c80d",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
benchmark="TwoWikiMultiHop",
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="TwoWiki_train3.json",
name_of_html="TwoWiki_train3.html",
instance_filter=[
"37af9394085111ebbd58ac1f6bf848b6",
"2102541508ac11ebbd82ac1f6bf848b6",
"b249aa840bdc11eba7f7acde48001122",
"feb4b9dc0bdb11eba7f7acde48001122",
"13d3552e0bde11eba7f7acde48001122",
"cc6c68e4096511ebbdafac1f6bf848b6",
"10776f4508a211ebbd7aac1f6bf848b6",
"c096ef9e086d11ebbd62ac1f6bf848b6",
"20c7b59608db11ebbd9cac1f6bf848b6",
"7f3724780baf11ebab90acde48001122",
"482773fc0baf11ebab90acde48001122",
"e519fa3c0bae11ebab90acde48001122",
"d956416e086711ebbd5eac1f6bf848b6",
"89024aba08a411ebbd7dac1f6bf848b6",
"19a3ad5008c811ebbd91ac1f6bf848b6",
"ee484526089f11ebbd78ac1f6bf848b6",
"53625784086511ebbd5eac1f6bf848b6",
"f02d1c2208b811ebbd88ac1f6bf848b6",
"a2f105fa088511ebbd6dac1f6bf848b6",
"52618be00bb011ebab90acde48001122",
"ec70a8a208a311ebbd7cac1f6bf848b6",
"42b3c0b80bde11eba7f7acde48001122",
"c807422a0bda11eba7f7acde48001122",
"4e7c40ed08ea11ebbda7ac1f6bf848b6",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=12,
building_corpus_from_scratch=True,
benchmark="TwoWikiMultiHop",
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="TwoWiki_test3.json",
name_of_html="TwoWiki_test3.html",
instance_filter=[
"5211d89a095011ebbdaeac1f6bf848b6",
"fe105e54089411ebbd75ac1f6bf848b6",
"bd6f350408d311ebbd96ac1f6bf848b6",
"57f2630e08ae11ebbd83ac1f6bf848b6",
"8d9cf88009b311ebbdb0ac1f6bf848b6",
"eafb6d960bae11ebab90acde48001122",
"45153f740bdb11eba7f7acde48001122",
"385457c20bde11eba7f7acde48001122",
"45a16d5a0bdb11eba7f7acde48001122",
"7253afc808c711ebbd91ac1f6bf848b6",
"d03449820baf11ebab90acde48001122",
"0ea215140bdd11eba7f7acde48001122",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
benchmark="Musique",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Musique_train3.json",
name_of_html="Musique_train3.html",
instance_filter=[
"2hop__374495_68633",
"2hop__735014_83837",
"2hop__108158_83769",
"2hop__92051_827343",
"2hop__55552_158105",
"2hop__81825_49084",
"2hop__91667_81007",
"2hop__696442_51329",
"3hop1__516535_834494_34099",
"3hop1__57186_237521_291682",
"3hop1__475351_160713_77246",
"3hop2__304722_397371_63959",
"3hop1__135392_87694_64412",
"3hop1__354480_834494_33939",
"3hop1__446612_160545_34751",
"3hop1__232315_831637_91775",
"3hop2__222979_132536_40768",
"3hop2__304722_330033_63959",
"3hop1__488744_443779_52195",
"3hop1__146155_131905_41948",
"4hop1__788226_32392_823060_610794",
"4hop1__236903_153080_33897_81096",
"4hop1__199881_378185_282674_759393",
"4hop1__726391_153080_33952_34109",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=12,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
benchmark="Musique",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Musique_test3.json",
name_of_html="Musique_test3.html",
instance_filter=[
"2hop__272714_113442",
"2hop__6827_49664",
"2hop__24648_192417",
"2hop__85958_87295",
"3hop2__222979_840908_40768",
"3hop1__640171_228453_10972",
"3hop1__92991_78276_68042",
"3hop1__147162_131905_41948",
"4hop1__813171_153080_159767_81096",
"4hop1__726391_153080_33952_33939",
"4hop1__707078_765799_282674_759393",
"4hop1__408432_32392_823060_610794",
],
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval2.remote.aio(config.to_dict()) for config in configs]
await asyncio.gather(*modal_tasks)