* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.
* Load dataset file by filename, outsource utilities
* Restructure metric selection
* Add comprehensiveness, diversity and empowerment metrics
* Add promptfoo as an option
* Refactor RAG solution in eval
* LLM-as-a-judge metrics implemented in a uniform way
* Use requests.get instead of wget
* Clean up promptfoo config template
* Minor fixes
* Get promptfoo path instead of hardcoding
* Minor fixes
* Add LLM-as-a-judge prompts
* Support 4 different RAG options in eval
* Minor refactor and logger usage
* Run eval on a set of parameters and save results as json and png
* Script for running all param combinations
* Bugfix in simple RAG
* Potential fix: single asyncio run
* Temp fix: exclude insights
* Remove insights, have single asyncio run, refactor

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
from evals.deepeval_metrics import (
    correctness_metric,
    comprehensiveness_metric,
    diversity_metric,
    empowerment_metric,
    directness_metric,
    f1_score_metric,
    em_score_metric,
)
from deepeval.metrics import AnswerRelevancyMetric
import deepeval.metrics

from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric

# Metrics shipped with deepeval, keyed by the short names used in eval configs.
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}

# Custom LLM-as-a-judge metrics defined in evals.deepeval_metrics.
custom_deepeval_metrics = {
    "Correctness": correctness_metric,
    "Comprehensiveness": comprehensiveness_metric,
    "Diversity": diversity_metric,
    "Empowerment": empowerment_metric,
    "Directness": directness_metric,
    "F1": f1_score_metric,
    "EM": em_score_metric,
}

qa_metrics = native_deepeval_metrics | custom_deepeval_metrics


def get_deepeval_metric(metric_name: str):
    if metric_name in qa_metrics:
        metric = qa_metrics[metric_name]
    else:
        # Fall back to any metric class exposed directly by deepeval.metrics.
        try:
            metric_cls = getattr(deepeval.metrics, metric_name)
            metric = metric_cls()
        except AttributeError:
            raise Exception(f"Metric {metric_name} not supported")

    # Registered entries may be classes or ready-made instances; instantiate classes.
    if isinstance(metric, type):
        metric = metric()

    return metric


def get_metrics(metric_name_list: list[str]):
    metrics = {
        "deepeval_metrics": [],
    }

    promptfoo_metric_names = []

    for metric_name in metric_name_list:
        if (
            (metric_name in native_deepeval_metrics)
            or (metric_name in custom_deepeval_metrics)
            or hasattr(deepeval.metrics, metric_name)
        ):
            metric = get_deepeval_metric(metric_name)
            metrics["deepeval_metrics"].append(metric)
        elif is_valid_promptfoo_metric(metric_name):
            promptfoo_metric_names.append(metric_name)
        # Names that match neither backend are silently skipped.

    if len(promptfoo_metric_names) > 0:
        metrics["promptfoo_metrics"] = PromptfooMetric(promptfoo_metric_names)

    return metrics
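

# Minimal usage sketch (illustrative, not part of the original module): builds the
# metric collection for a QA eval run and prints what was selected. The metric names
# below are assumptions drawn from the dictionaries above; promptfoo names would only
# be picked up if is_valid_promptfoo_metric accepts them.
if __name__ == "__main__":
    selected = get_metrics(["Correctness", "AnswerRelevancy", "F1"])

    for metric in selected["deepeval_metrics"]:
        print(type(metric).__name__)

    # Present only when at least one valid promptfoo metric name was requested.
    if "promptfoo_metrics" in selected:
        print(selected["promptfoo_metrics"])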