Individual baseline runs
This commit is contained in:
parent 9c131f0d14
commit 4d07806e5f
3 changed files with 217 additions and 26 deletions
@@ -1,6 +1,6 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
-from typing import List
+from typing import List, Optional


 class EvalConfig(BaseSettings):
@@ -43,6 +43,9 @@ class EvalConfig(BaseSettings):
     dashboard_path: str = "dashboard.html"
     direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
     direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"
+    name_of_answers_file: str = "not_set.json"
+    name_of_html: str = "not_set.html"
+    instance_filter: Optional[List[str]] = None

     model_config = SettingsConfigDict(env_file=".env", extra="allow")

@@ -68,6 +71,9 @@ class EvalConfig(BaseSettings):
             "task_getter_type": self.task_getter_type,
             "direct_llm_system_prompt": self.direct_llm_system_prompt,
             "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
+            "name_of_answers_file": self.name_of_answers_file,
+            "name_of_html": self.name_of_html,
+            "instance_filter": self.instance_filter,
         }
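
A minimal sketch (not part of the diff) of how the three new settings are expected to travel through EvalConfig.to_dict() into eval_params; the import path is a placeholder because the file is not named on this page, and the values come from the configs added further down in this commit:

from eval_config import EvalConfig  # hypothetical module path

config = EvalConfig(
    benchmark="HotPotQA",
    name_of_answers_file="Hotpot_train3.json",
    name_of_html="Hotpot_train3.html",
    instance_filter=["5a8e341c5542995085b373d6"],
)
eval_params = config.to_dict()
# The new keys ride along with the existing ones, so downstream steps can pick
# up the per-run output file names and the optional list of benchmark instance
# ids used to restrict the corpus.
assert eval_params["name_of_html"] == "Hotpot_train3.html"
assert eval_params["instance_filter"] == ["5a8e341c5542995085b373d6"]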
@@ -167,4 +167,4 @@ def create_dashboard(
     with open(output_file, "w", encoding="utf-8") as f:
         f.write(dashboard_html)

-    return output_file
+    return dashboard_html
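
create_dashboard now returns the rendered HTML string rather than the output path. A short usage sketch (outside the diff) that mirrors the Modal caller added later in this commit; eval_params is assumed to be the dict produced by EvalConfig.to_dict():

html = create_dashboard(
    metrics_path=eval_params["metrics_path"],
    aggregate_metrics_path=eval_params["aggregate_metrics_path"],
    output_file=eval_params["dashboard_path"],
    benchmark=eval_params["benchmark"],
)
# create_dashboard still writes output_file itself; returning the HTML lets a
# remote caller persist the same dashboard somewhere durable, e.g. the mounted
# Modal volume.
with open("/data/" + str(eval_params.get("name_of_html")), "w") as f:
    f.write(html)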
@@ -10,8 +10,10 @@ from cognee.eval_framework.answer_generation.run_question_answering_module impor
     run_question_answering,
 )
 from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
 from cognee.eval_framework.metrics_dashboard import create_dashboard

 logger = get_logger()
+vol = modal.Volume.from_name("baseline_results", create_if_missing=True)
+

 def read_and_combine_metrics(eval_params: dict) -> dict:
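
The module now creates a named Modal volume at import time. A minimal, self-contained sketch of the write-and-commit pattern the functions below rely on; the volume calls mirror this diff, while the app name, function, and file name are assumptions for illustration:

import modal

app = modal.App("baseline-eval-sketch")  # hypothetical app name
vol = modal.Volume.from_name("baseline_results", create_if_missing=True)


@app.function(volumes={"/data": vol})
def save_example(payload: str) -> None:
    # Anything written under /data lands on the named volume; commit() persists
    # the change so later runs and other functions can read it.
    with open("/data/example.txt", "w") as f:
        f.write(payload)
    vol.commit()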
@@ -51,8 +53,8 @@ image = (
 )


-@app.function(image=image, concurrency_limit=2, timeout=1800, retries=1)
-async def modal_run_eval(eval_params=None):
+@app.function(image=image, concurrency_limit=6, timeout=86400, retries=1, volumes={"/data": vol})
+async def modal_run_eval2(eval_params=None):
     """Runs evaluation pipeline and returns combined metrics results."""
     if eval_params is None:
         eval_params = EvalConfig().to_dict()
@@ -60,9 +62,23 @@ async def modal_run_eval(eval_params=None):
     logger.info(f"Running evaluation with params: {eval_params}")

     # Run the evaluation pipeline
-    await run_corpus_builder(eval_params)
+    await run_corpus_builder(eval_params, instance_filter=eval_params.get("instance_filter"))
     await run_question_answering(eval_params)
-    await run_evaluation(eval_params)
+    answers = await run_evaluation(eval_params)
+    with open("/data/" + (str)(eval_params.get("name_of_answers_file")), "w") as f:
+        json.dump(answers, f, ensure_ascii=False, indent=4)
+    vol.commit()
+    if eval_params.get("dashboard"):
+        logger.info("Generating dashboard...")
+        html = create_dashboard(
+            metrics_path=eval_params["metrics_path"],
+            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+        with open("/data/" + (str)(eval_params.get("name_of_html")), "w") as f:
+            f.write(html)
+        vol.commit()

     # Early return if metrics calculation wasn't requested
     if not eval_params.get("evaluating_answers") or not eval_params.get("calculate_metrics"):
@@ -71,7 +87,9 @@ async def modal_run_eval(eval_params=None):
         )
         return None

-    return read_and_combine_metrics(eval_params)
+    logger.info("Everything finished...")
+
+    return True


 @app.local_entrypoint()
@@ -80,37 +98,204 @@ async def main():
     configs = [
         EvalConfig(
             task_getter_type="Default",
-            number_of_samples_in_corpus=2,
+            benchmark="HotPotQA",
+            number_of_samples_in_corpus=24,
             building_corpus_from_scratch=True,
             qa_engine="cognee_graph_completion",
             answering_questions=True,
             evaluating_answers=True,
             calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_train3.json",
+            name_of_html="Hotpot_train3.html",
+            instance_filter=[
+                "5a8e341c5542995085b373d6",
+                "5ab2a308554299340b52553b",
+                "5a79c1095542996c55b2dc62",
+                "5a8c52685542995e66a475bb",
+                "5a734dad5542994cef4bc522",
+                "5a74ab6055429916b01641b9",
+                "5ae2661d554299495565da60",
+                "5a88dcf9554299206df2b383",
+                "5ab8179f5542990e739ec817",
+                "5a812d0555429938b61422e1",
+                "5a79be0e5542994f819ef084",
+                "5a875b755542996e4f308796",
+                "5ae675245542991bbc9760dc",
+                "5ab819065542995dae37ea3c",
+                "5a74d64055429916b0164223",
+                "5abfea825542994516f45527",
+                "5ac279345542990b17b153b0",
+                "5ab3c48755429969a97a81b8",
+                "5adf35935542993344016c36",
+                "5a83d0845542996488c2e4e6",
+                "5a7af32e55429931da12c99c",
+                "5a7c9ead5542990527d554e4",
+                "5ae12aa6554299422ee99617",
+                "5a710a915542994082a3e504",
+            ],
         ),
         EvalConfig(
             task_getter_type="Default",
-            number_of_samples_in_corpus=10,
+            number_of_samples_in_corpus=12,
+            benchmark="HotPotQA",
             building_corpus_from_scratch=True,
             qa_engine="cognee_graph_completion",
             answering_questions=True,
             evaluating_answers=True,
             calculate_metrics=True,
-            dashboard=False,
+            dashboard=True,
+            name_of_answers_file="Hotpot_test3.json",
+            name_of_html="Hotpot_test3.html",
+            instance_filter=[
+                "5ae27df25542992decbdcd2a",
+                "5a72224755429971e9dc92be",
+                "5a8900c75542997e5c09a6ed",
+                "5ae1412a55429920d523434c",
+                "5ab2342a5542993be8fa98c3",
+                "5adde2475542997545bbbdc1",
+                "5ac434cb5542997ea680ca2f",
+                "5a8aed1755429950cd6afbf1",
+                "5ae328f45542991a06ce993c",
+                "5ae17f1e5542990adbacf7a6",
+                "5ac42f42554299076e296d88",
+                "5ab7484c5542992aa3b8c80d",
+            ],
         ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_train3.json",
+            name_of_html="TwoWiki_train3.html",
+            instance_filter=[
+                "37af9394085111ebbd58ac1f6bf848b6",
+                "2102541508ac11ebbd82ac1f6bf848b6",
+                "b249aa840bdc11eba7f7acde48001122",
+                "feb4b9dc0bdb11eba7f7acde48001122",
+                "13d3552e0bde11eba7f7acde48001122",
+                "cc6c68e4096511ebbdafac1f6bf848b6",
+                "10776f4508a211ebbd7aac1f6bf848b6",
+                "c096ef9e086d11ebbd62ac1f6bf848b6",
+                "20c7b59608db11ebbd9cac1f6bf848b6",
+                "7f3724780baf11ebab90acde48001122",
+                "482773fc0baf11ebab90acde48001122",
+                "e519fa3c0bae11ebab90acde48001122",
+                "d956416e086711ebbd5eac1f6bf848b6",
+                "89024aba08a411ebbd7dac1f6bf848b6",
+                "19a3ad5008c811ebbd91ac1f6bf848b6",
+                "ee484526089f11ebbd78ac1f6bf848b6",
+                "53625784086511ebbd5eac1f6bf848b6",
+                "f02d1c2208b811ebbd88ac1f6bf848b6",
+                "a2f105fa088511ebbd6dac1f6bf848b6",
+                "52618be00bb011ebab90acde48001122",
+                "ec70a8a208a311ebbd7cac1f6bf848b6",
+                "42b3c0b80bde11eba7f7acde48001122",
+                "c807422a0bda11eba7f7acde48001122",
+                "4e7c40ed08ea11ebbda7ac1f6bf848b6",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            benchmark="TwoWikiMultiHop",
+            qa_engine="cognee_graph_completion",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="TwoWiki_test3.json",
+            name_of_html="TwoWiki_test3.html",
+            instance_filter=[
+                "5211d89a095011ebbdaeac1f6bf848b6",
+                "fe105e54089411ebbd75ac1f6bf848b6",
+                "bd6f350408d311ebbd96ac1f6bf848b6",
+                "57f2630e08ae11ebbd83ac1f6bf848b6",
+                "8d9cf88009b311ebbdb0ac1f6bf848b6",
+                "eafb6d960bae11ebab90acde48001122",
+                "45153f740bdb11eba7f7acde48001122",
+                "385457c20bde11eba7f7acde48001122",
+                "45a16d5a0bdb11eba7f7acde48001122",
+                "7253afc808c711ebbd91ac1f6bf848b6",
+                "d03449820baf11ebab90acde48001122",
+                "0ea215140bdd11eba7f7acde48001122",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=24,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_train3.json",
+            name_of_html="Musique_train3.html",
+            instance_filter=[
+                "2hop__374495_68633",
+                "2hop__735014_83837",
+                "2hop__108158_83769",
+                "2hop__92051_827343",
+                "2hop__55552_158105",
+                "2hop__81825_49084",
+                "2hop__91667_81007",
+                "2hop__696442_51329",
+                "3hop1__516535_834494_34099",
+                "3hop1__57186_237521_291682",
+                "3hop1__475351_160713_77246",
+                "3hop2__304722_397371_63959",
+                "3hop1__135392_87694_64412",
+                "3hop1__354480_834494_33939",
+                "3hop1__446612_160545_34751",
+                "3hop1__232315_831637_91775",
+                "3hop2__222979_132536_40768",
+                "3hop2__304722_330033_63959",
+                "3hop1__488744_443779_52195",
+                "3hop1__146155_131905_41948",
+                "4hop1__788226_32392_823060_610794",
+                "4hop1__236903_153080_33897_81096",
+                "4hop1__199881_378185_282674_759393",
+                "4hop1__726391_153080_33952_34109",
+            ],
+        ),
+        EvalConfig(
+            task_getter_type="Default",
+            number_of_samples_in_corpus=12,
+            building_corpus_from_scratch=True,
+            qa_engine="cognee_graph_completion",
+            benchmark="Musique",
+            answering_questions=True,
+            evaluating_answers=True,
+            calculate_metrics=True,
+            dashboard=True,
+            name_of_answers_file="Musique_test3.json",
+            name_of_html="Musique_test3.html",
+            instance_filter=[
+                "2hop__272714_113442",
+                "2hop__6827_49664",
+                "2hop__24648_192417",
+                "2hop__85958_87295",
+                "3hop2__222979_840908_40768",
+                "3hop1__640171_228453_10972",
+                "3hop1__92991_78276_68042",
+                "3hop1__147162_131905_41948",
+                "4hop1__813171_153080_159767_81096",
+                "4hop1__726391_153080_33952_33939",
+                "4hop1__707078_765799_282674_759393",
+                "4hop1__408432_32392_823060_610794",
+            ],
+        ),
     ]

     # Run evaluations in parallel with different configurations
-    modal_tasks = [modal_run_eval.remote.aio(config.to_dict()) for config in configs]
-    results = await asyncio.gather(*modal_tasks)
-
-    # Filter out None results and save combined results
-    results = [r for r in results if r is not None]
-    if results:
-        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        output_file = f"combined_results_{timestamp}.json"
-
-        with open(output_file, "w") as f:
-            json.dump(results, f, indent=2)
-
-        logger.info(f"Completed parallel evaluation runs. Results saved to {output_file}")
-    else:
-        logger.info("No metrics were collected from any of the evaluation runs")
+    modal_tasks = [modal_run_eval2.remote.aio(config.to_dict()) for config in configs]
+    await asyncio.gather(*modal_tasks)
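
With main() no longer gathering metrics in-process, each run's answers JSON and dashboard HTML end up on the baseline_results volume. One possible way to read a saved file back afterwards, sketched under the assumption that the reader app, function names, and default file name below are illustrative rather than part of this commit:

import json
import os

import modal

app = modal.App("baseline-results-reader")  # hypothetical helper app
vol = modal.Volume.from_name("baseline_results", create_if_missing=True)


@app.function(volumes={"/data": vol})
def load_answers(name_of_answers_file: str):
    # Mount the same volume and read a previously committed answers file.
    path = os.path.join("/data", name_of_answers_file)
    with open(path) as f:
        return json.load(f)


@app.local_entrypoint()
def fetch(name_of_answers_file: str = "Hotpot_train3.json"):
    answers = load_answers.remote(name_of_answers_file)
    print(f"Loaded {len(answers)} records from {name_of_answers_file}")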