* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Support 4 different RAG options in eval
* Minor refactor and logger usage
* feat: make tasks a configurable argument in the cognify function
* Run eval on a set of parameters and save results as JSON and PNG
* fix: add data points task
* Script for running all param combinations
* Enable context provider to get tasks as param
* Bugfix in simple RAG
* Incremental eval of cognee pipeline
* Potential fix: single asyncio run
* temp fix: exclude insights
* Remove insights, have single asyncio run, refactor
* Minor fixes
* Handle pipeline slices in utils
* Include all options in params JSON

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
Co-authored-by: hajdul88 <52442977+hajdul88@users.noreply.github.com>
"""
|
|
These are the official evaluation metrics for HotpotQA taken from https://hotpotqa.github.io/
|
|
"""
|
|
|
|
import re
|
|
import string
|
|
from collections import Counter
|
|
|
|
|
|
def normalize_answer(s):
|
|
def remove_articles(text):
|
|
return re.sub(r"\b(a|an|the)\b", " ", text)
|
|
|
|
def white_space_fix(text):
|
|
return " ".join(text.split())
|
|
|
|
def remove_punc(text):
|
|
exclude = set(string.punctuation)
|
|
return "".join(ch for ch in text if ch not in exclude)
|
|
|
|
def lower(text):
|
|
return text.lower()
|
|
|
|
return white_space_fix(remove_articles(remove_punc(lower(s))))
|
|
|
|
|
|
def f1_score(prediction, ground_truth):
|
|
normalized_prediction = normalize_answer(prediction)
|
|
normalized_ground_truth = normalize_answer(ground_truth)
|
|
|
|
ZERO_METRIC = (0, 0, 0)
|
|
|
|
if (
|
|
normalized_prediction in ["yes", "no", "noanswer"]
|
|
and normalized_prediction != normalized_ground_truth
|
|
):
|
|
return ZERO_METRIC
|
|
if (
|
|
normalized_ground_truth in ["yes", "no", "noanswer"]
|
|
and normalized_prediction != normalized_ground_truth
|
|
):
|
|
return ZERO_METRIC
|
|
|
|
prediction_tokens = normalized_prediction.split()
|
|
ground_truth_tokens = normalized_ground_truth.split()
|
|
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
|
|
num_same = sum(common.values())
|
|
if num_same == 0:
|
|
return ZERO_METRIC
|
|
precision = 1.0 * num_same / len(prediction_tokens)
|
|
recall = 1.0 * num_same / len(ground_truth_tokens)
|
|
f1 = (2 * precision * recall) / (precision + recall)
|
|
return f1, precision, recall
|
|
|
|
|
|
def exact_match_score(prediction, ground_truth):
|
|
return normalize_answer(prediction) == normalize_answer(ground_truth)
|
|
|
|
|
|
def update_answer(metrics, prediction, gold):
|
|
em = exact_match_score(prediction, gold)
|
|
f1, prec, recall = f1_score(prediction, gold)
|
|
metrics["em"] += float(em)
|
|
metrics["f1"] += f1
|
|
metrics["prec"] += prec
|
|
metrics["recall"] += recall
|
|
return em, prec, recall
|
|
|
|
|
|
def update_sp(metrics, prediction, gold):
|
|
cur_sp_pred = set(map(tuple, prediction))
|
|
gold_sp_pred = set(map(tuple, gold))
|
|
tp, fp, fn = 0, 0, 0
|
|
for e in cur_sp_pred:
|
|
if e in gold_sp_pred:
|
|
tp += 1
|
|
else:
|
|
fp += 1
|
|
for e in gold_sp_pred:
|
|
if e not in cur_sp_pred:
|
|
fn += 1
|
|
prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
|
|
recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
|
|
f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
|
|
em = 1.0 if fp + fn == 0 else 0.0
|
|
metrics["sp_em"] += em
|
|
metrics["sp_f1"] += f1
|
|
metrics["sp_prec"] += prec
|
|
metrics["sp_recall"] += recall
|
|
return em, prec, recall
|
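# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the official HotpotQA script): one
# possible way these per-example helpers could be driven over a small set of
# (prediction, gold) pairs. The `examples` data and the averaging loop below
# are assumptions for demonstration only.
if __name__ == "__main__":
    examples = [
        # (predicted answer, gold answer)
        ("Barack Obama", "Barack Obama"),
        ("the Eiffel Tower", "Eiffel Tower"),
        ("no", "yes"),
    ]
    metrics = {"em": 0.0, "f1": 0.0, "prec": 0.0, "recall": 0.0}
    for prediction, gold in examples:
        # update_answer mutates `metrics` in place with this example's scores.
        update_answer(metrics, prediction, gold)
    # Report per-metric averages over the evaluated examples.
    n = len(examples)
    print({key: value / n for key, value in metrics.items()})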