cognee/evals/official_hotpot_metrics.py
alekszievr 2e010f8dd1
Incremental eval of cognee pipeline (#445)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.

* Load dataset file by filename, outsource utilities

* restructure metric selection

* Add comprehensiveness, diversity and empowerment metrics

* add promptfoo as an option

* refactor RAG solution in eval

* LLM-as-a-judge metrics implemented in a uniform way

* Use requests.get instead of wget

* clean up promptfoo config template

* minor fixes

* get promptfoo path instead of hardcoding

* minor fixes

* Add LLM-as-a-judge prompts

* Support 4 different RAG options in eval

* Minor refactor and logger usage

* feat: make tasks a configurable argument in the cognify function

* Run eval on a set of parameters and save results as json and png

* fix: add data points task

* script for running all param combinations

* enable context provider to get tasks as param

* bugfix in simple RAG

* Incremental eval of cognee pipeline

* potential fix: single asyncio run

* temp fix: exclude insights

* Remove insights, have single asyncio run, refactor

* minor fixes

* handle pipeline slices in utils

* include all options in params json

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
2025-01-17 14:16:48 +01:00


"""
These are the official evaluation metrics for HotpotQA taken from https://hotpotqa.github.io/
"""
import re
import string
from collections import Counter
def normalize_answer(s):
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
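

# Illustrative example (not from the original script): normalize_answer lowercases,
# strips punctuation and English articles, and collapses whitespace, e.g.
# normalize_answer("The   Quick, Brown Fox!") returns "quick brown fox".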


def f1_score(prediction, ground_truth):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0, 0, 0)

    if (
        normalized_prediction in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return ZERO_METRIC
    if (
        normalized_ground_truth in ["yes", "no", "noanswer"]
        and normalized_prediction != normalized_ground_truth
    ):
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return ZERO_METRIC
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall
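

# Worked example (illustrative, not from the original script): for prediction
# "the cat sat down" and ground truth "a cat sat", the normalized token overlap
# is {"cat", "sat"}, giving precision = 2/3, recall = 2/2 = 1.0 and F1 = 0.8.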


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def update_answer(metrics, prediction, gold):
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)
    metrics["em"] += float(em)
    metrics["f1"] += f1
    metrics["prec"] += prec
    metrics["recall"] += recall
    return em, prec, recall


def update_sp(metrics, prediction, gold):
    cur_sp_pred = set(map(tuple, prediction))
    gold_sp_pred = set(map(tuple, gold))
    tp, fp, fn = 0, 0, 0
    for e in cur_sp_pred:
        if e in gold_sp_pred:
            tp += 1
        else:
            fp += 1
    for e in gold_sp_pred:
        if e not in cur_sp_pred:
            fn += 1
    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    em = 1.0 if fp + fn == 0 else 0.0
    metrics["sp_em"] += em
    metrics["sp_f1"] += f1
    metrics["sp_prec"] += prec
    metrics["sp_recall"] += recall
    return em, prec, recall
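

# Illustrative usage sketch (not part of the original file): callers typically
# accumulate per-example scores into a metrics dict and average at the end.
# The prediction/gold values below are made up for demonstration; HotpotQA
# supporting facts are (title, sentence_id) pairs.
if __name__ == "__main__":
    metrics = {
        "em": 0, "f1": 0, "prec": 0, "recall": 0,
        "sp_em": 0, "sp_f1": 0, "sp_prec": 0, "sp_recall": 0,
    }
    # Answer-level metrics for one QA example.
    update_answer(metrics, "The Eiffel Tower", "Eiffel Tower")
    # Supporting-fact metrics for the same example.
    update_sp(metrics, [["Eiffel Tower", 0]], [["Eiffel Tower", 0], ["Paris", 1]])
    num_examples = 1
    print({key: value / num_examples for key, value in metrics.items()})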