Compare commits

...
Sign in to create a new pull request.

1 commit

Author SHA1 Message Date
hajdul88
4d07806e5f Individual baseline runs 2025-05-13 14:33:40 +02:00
3 changed files with 217 additions and 26 deletions

View file

@ -1,6 +1,6 @@
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import List
from typing import List, Optional
class EvalConfig(BaseSettings):
@ -43,6 +43,9 @@ class EvalConfig(BaseSettings):
dashboard_path: str = "dashboard.html"
direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
name_of_answers_file: str = "not_set.json"
name_of_html: str = "not_set.html"
instance_filter: Optional[List[str]] = None
model_config = SettingsConfigDict(env_file=".env", extra="allow")
@ -68,6 +71,9 @@ class EvalConfig(BaseSettings):
"task_getter_type": self.task_getter_type,
"direct_llm_system_prompt": self.direct_llm_system_prompt,
"direct_llm_eval_prompt": self.direct_llm_eval_prompt,
"name_of_answers_file": self.name_of_answers_file,
"name_of_html": self.name_of_html,
"instance_filter": self.instance_filter,
}

View file

@ -167,4 +167,4 @@ def create_dashboard(
with open(output_file, "w", encoding="utf-8") as f:
f.write(dashboard_html)
return output_file
return dashboard_html

View file

@ -10,8 +10,10 @@ from cognee.eval_framework.answer_generation.run_question_answering_module impor
run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard
logger = get_logger()
vol = modal.Volume.from_name("baseline_results", create_if_missing=True)
def read_and_combine_metrics(eval_params: dict) -> dict:
@ -51,8 +53,8 @@ image = (
)
@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
async def modal_run_eval(eval_params=None):
@app.function(image=image, concurrency_limit=6, timeout=86400, retries=1, volumes={"/data": vol})
async def modal_run_eval2(eval_params=None):
"""Runs evaluation pipeline and returns combined metrics results."""
if eval_params is None:
eval_params = EvalConfig().to_dict()
@ -60,9 +62,23 @@ async def modal_run_eval(eval_params=None):
logger.info(f"Running evaluation with params: {eval_params}")
# Run the evaluation pipeline
await run_corpus_builder(eval_params)
await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
await run_question_answering(eval_params)
await run_evaluation(eval_params)
answers = await run_evaluation(eval_params)
with open("/data/" + (str)(eval_params.get("name_of_answers_file")), "w") as f:
json.dump(answers, f, ensure_ascii=False, indent=4)
vol.commit()
if eval_params.get("dashboard"):
logger.info("Generating dashboard...")
html = create_dashboard(
metrics_path=eval_params["metrics_path"],
aggregate_metrics_path=eval_params["aggregate_metrics_path"],
output_file=eval_params["dashboard_path"],
benchmark=eval_params["benchmark"],
)
with open("/data/" + (str)(eval_params.get("name_of_html")), "w") as f:
f.write(html)
vol.commit()
# Early return if metrics calculation wasn't requested
if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
@ -71,7 +87,9 @@ async def modal_run_eval(eval_params=None):
)
return None
return read_and_combine_metrics(eval_params)
logger.info("Everything finished...")
return True
@app.local_entrypoint()
@ -80,37 +98,204 @@ async def main():
configs = [
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=2,
benchmark="HotPotQA",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
dashboard=True,
name_of_answers_file="Hotpot_train3.json",
name_of_html="Hotpot_train3.html",
instance_filter=[
"5a8e341c5542995085b373d6",
"5ab2a308554299340b52553b",
"5a79c1095542996c55b2dc62",
"5a8c52685542995e66a475bb",
"5a734dad5542994cef4bc522",
"5a74ab6055429916b01641b9",
"5ae2661d554299495565da60",
"5a88dcf9554299206df2b383",
"5ab8179f5542990e739ec817",
"5a812d0555429938b61422e1",
"5a79be0e5542994f819ef084",
"5a875b755542996e4f308796",
"5ae675245542991bbc9760dc",
"5ab819065542995dae37ea3c",
"5a74d64055429916b0164223",
"5abfea825542994516f45527",
"5ac279345542990b17b153b0",
"5ab3c48755429969a97a81b8",
"5adf35935542993344016c36",
"5a83d0845542996488c2e4e6",
"5a7af32e55429931da12c99c",
"5a7c9ead5542990527d554e4",
"5ae12aa6554299422ee99617",
"5a710a915542994082a3e504",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=10,
number_of_samples_in_corpus=12,
benchmark="HotPotQA",
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=False,
dashboard=True,
name_of_answers_file="Hotpot_test3.json",
name_of_html="Hotpot_test3.html",
instance_filter=[
"5ae27df25542992decbdcd2a",
"5a72224755429971e9dc92be",
"5a8900c75542997e5c09a6ed",
"5ae1412a55429920d523434c",
"5ab2342a5542993be8fa98c3",
"5adde2475542997545bbbdc1",
"5ac434cb5542997ea680ca2f",
"5a8aed1755429950cd6afbf1",
"5ae328f45542991a06ce993c",
"5ae17f1e5542990adbacf7a6",
"5ac42f42554299076e296d88",
"5ab7484c5542992aa3b8c80d",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
benchmark="TwoWikiMultiHop",
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="TwoWiki_train3.json",
name_of_html="TwoWiki_train3.html",
instance_filter=[
"37af9394085111ebbd58ac1f6bf848b6",
"2102541508ac11ebbd82ac1f6bf848b6",
"b249aa840bdc11eba7f7acde48001122",
"feb4b9dc0bdb11eba7f7acde48001122",
"13d3552e0bde11eba7f7acde48001122",
"cc6c68e4096511ebbdafac1f6bf848b6",
"10776f4508a211ebbd7aac1f6bf848b6",
"c096ef9e086d11ebbd62ac1f6bf848b6",
"20c7b59608db11ebbd9cac1f6bf848b6",
"7f3724780baf11ebab90acde48001122",
"482773fc0baf11ebab90acde48001122",
"e519fa3c0bae11ebab90acde48001122",
"d956416e086711ebbd5eac1f6bf848b6",
"89024aba08a411ebbd7dac1f6bf848b6",
"19a3ad5008c811ebbd91ac1f6bf848b6",
"ee484526089f11ebbd78ac1f6bf848b6",
"53625784086511ebbd5eac1f6bf848b6",
"f02d1c2208b811ebbd88ac1f6bf848b6",
"a2f105fa088511ebbd6dac1f6bf848b6",
"52618be00bb011ebab90acde48001122",
"ec70a8a208a311ebbd7cac1f6bf848b6",
"42b3c0b80bde11eba7f7acde48001122",
"c807422a0bda11eba7f7acde48001122",
"4e7c40ed08ea11ebbda7ac1f6bf848b6",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=12,
building_corpus_from_scratch=True,
benchmark="TwoWikiMultiHop",
qa_engine="cognee_graph_completion",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="TwoWiki_test3.json",
name_of_html="TwoWiki_test3.html",
instance_filter=[
"5211d89a095011ebbdaeac1f6bf848b6",
"fe105e54089411ebbd75ac1f6bf848b6",
"bd6f350408d311ebbd96ac1f6bf848b6",
"57f2630e08ae11ebbd83ac1f6bf848b6",
"8d9cf88009b311ebbdb0ac1f6bf848b6",
"eafb6d960bae11ebab90acde48001122",
"45153f740bdb11eba7f7acde48001122",
"385457c20bde11eba7f7acde48001122",
"45a16d5a0bdb11eba7f7acde48001122",
"7253afc808c711ebbd91ac1f6bf848b6",
"d03449820baf11ebab90acde48001122",
"0ea215140bdd11eba7f7acde48001122",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=24,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
benchmark="Musique",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Musique_train3.json",
name_of_html="Musique_train3.html",
instance_filter=[
"2hop__374495_68633",
"2hop__735014_83837",
"2hop__108158_83769",
"2hop__92051_827343",
"2hop__55552_158105",
"2hop__81825_49084",
"2hop__91667_81007",
"2hop__696442_51329",
"3hop1__516535_834494_34099",
"3hop1__57186_237521_291682",
"3hop1__475351_160713_77246",
"3hop2__304722_397371_63959",
"3hop1__135392_87694_64412",
"3hop1__354480_834494_33939",
"3hop1__446612_160545_34751",
"3hop1__232315_831637_91775",
"3hop2__222979_132536_40768",
"3hop2__304722_330033_63959",
"3hop1__488744_443779_52195",
"3hop1__146155_131905_41948",
"4hop1__788226_32392_823060_610794",
"4hop1__236903_153080_33897_81096",
"4hop1__199881_378185_282674_759393",
"4hop1__726391_153080_33952_34109",
],
),
EvalConfig(
task_getter_type="Default",
number_of_samples_in_corpus=12,
building_corpus_from_scratch=True,
qa_engine="cognee_graph_completion",
benchmark="Musique",
answering_questions=True,
evaluating_answers=True,
calculate_metrics=True,
dashboard=True,
name_of_answers_file="Musique_test3.json",
name_of_html="Musique_test3.html",
instance_filter=[
"2hop__272714_113442",
"2hop__6827_49664",
"2hop__24648_192417",
"2hop__85958_87295",
"3hop2__222979_840908_40768",
"3hop1__640171_228453_10972",
"3hop1__92991_78276_68042",
"3hop1__147162_131905_41948",
"4hop1__813171_153080_159767_81096",
"4hop1__726391_153080_33952_33939",
"4hop1__707078_765799_282674_759393",
"4hop1__408432_32392_823060_610794",
],
),
]
# Run evaluations in parallel with different configurations
modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
results = await asyncio.gather(*modal_tasks)
# Filter out None results and save combined results
results = [r for r in results if r is not None]
if results:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = f"combined_results_{timestamp}.json"
with open(output_file, "w") as f:
json.dump(results, f, indent=2)
logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
else:
logger.info("No metrics were collected from any of the evaluation runs")
modal_tasks = [modal_run_eval2.remote.aio(config.to_dict()) for config in configs]
await asyncio.gather(*modal_tasks)