cognee/evals/qa_metrics_utils.py
alekszievr 8ec1e48ff6
Run eval on a set of parameters and save them as png and json (#443)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.

* Load dataset file by filename, outsource utilities

* restructure metric selection

* Add comprehensiveness, diversity and empowerment metrics

* add promptfoo as an option

* refactor RAG solution in eval

* LLM as a judge metrics implemented in a uniform way

* Use requests.get instead of wget

* clean up promptfoo config template

* minor fixes

* get promptfoo path instead of hardcoding

* minor fixes

* Add LLM as a judge prompts

* Support 4 different rag options in eval

* Minor refactor and logger usage

* Run eval on a set of parameters and save results as json and png

* script for running all param combinations

* bugfix in simple rag

* potential fix: single asyncio run

* temp fix: exclude insights

* Remove insights, have single asyncio run, refactor

---------

Co-authored-by: lxobr <122801072+lxobr@users.noreply.github.com>
2025-01-17 00:18:51 +01:00


from evals.deepeval_metrics import (
    correctness_metric,
    comprehensiveness_metric,
    diversity_metric,
    empowerment_metric,
    directness_metric,
    f1_score_metric,
    em_score_metric,
)
from deepeval.metrics import AnswerRelevancyMetric
import deepeval.metrics

from evals.promptfoo_metrics import is_valid_promptfoo_metric, PromptfooMetric

# Metrics shipped with deepeval, referenced by short name.
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}

# Custom metrics defined in evals.deepeval_metrics.
custom_deepeval_metrics = {
    "Correctness": correctness_metric,
    "Comprehensiveness": comprehensiveness_metric,
    "Diversity": diversity_metric,
    "Empowerment": empowerment_metric,
    "Directness": directness_metric,
    "F1": f1_score_metric,
    "EM": em_score_metric,
}

qa_metrics = native_deepeval_metrics | custom_deepeval_metrics


def get_deepeval_metric(metric_name: str):
    """Resolve a metric name to a deepeval metric instance."""
    if metric_name in qa_metrics:
        metric = qa_metrics[metric_name]
    else:
        # Fall back to any metric class exposed by deepeval.metrics.
        try:
            metric_cls = getattr(deepeval.metrics, metric_name)
            metric = metric_cls()
        except AttributeError:
            raise Exception(f"Metric {metric_name} not supported")

    # Registry entries may be classes rather than instances; instantiate them.
    if isinstance(metric, type):
        metric = metric()

    return metric


def get_metrics(metric_name_list: list[str]):
    """Split the requested metric names into deepeval and promptfoo metrics."""
    metrics = {
        "deepeval_metrics": [],
    }
    promptfoo_metric_names = []

    for metric_name in metric_name_list:
        if (
            (metric_name in native_deepeval_metrics)
            or (metric_name in custom_deepeval_metrics)
            or hasattr(deepeval.metrics, metric_name)
        ):
            metric = get_deepeval_metric(metric_name)
            metrics["deepeval_metrics"].append(metric)
        elif is_valid_promptfoo_metric(metric_name):
            promptfoo_metric_names.append(metric_name)

    # Group all requested promptfoo metric names into a single PromptfooMetric wrapper.
    if len(promptfoo_metric_names) > 0:
        metrics["promptfoo_metrics"] = PromptfooMetric(promptfoo_metric_names)

    return metrics
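

# Illustrative usage sketch. The metric names below are examples taken from the
# registries defined above; a promptfoo metric name would only be included if
# is_valid_promptfoo_metric accepts it. Constructing some metrics (e.g. the
# LLM-as-a-judge ones) may require LLM credentials to be configured.
if __name__ == "__main__":
    selected = get_metrics(["Correctness", "F1", "AnswerRelevancy"])
    for metric in selected["deepeval_metrics"]:
        print(type(metric).__name__)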