cognee/evals/qa_metrics_utils.py
alekszievr 6653d73556
Feat/cog 950 improve metric selection (#435)
* QA eval dataset as argument, with hotpot and 2wikimultihop as options. JSON schema validation for datasets.

* Load dataset file by filename, outsource utilities

* restructure metric selection

* Add comprehensiveness, diversity and empowerment metrics

* add promptfoo as an option

* refactor RAG solution in eval

* LLM-as-a-judge metrics implemented in a uniform way

* Use requests.get instead of wget

* clean up promptfoo config template

* minor fixes

* get promptfoo path instead of hardcoding

* minor fixes

* Add LLM-as-a-judge prompts

* Minor refactor and logger usage
2025-01-15 10:45:55 +01:00


from evals.deepeval_metrics import (
    correctness_metric,
    comprehensiveness_metric,
    diversity_metric,
    empowerment_metric,
    directness_metric,
    f1_score_metric,
    em_score_metric,
)
from evals.promptfoo_metrics import PromptfooMetric
from deepeval.metrics import AnswerRelevancyMetric
import deepeval.metrics
from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts

# Native deepeval metrics are registered as classes and instantiated on lookup.
native_deepeval_metrics = {"AnswerRelevancy": AnswerRelevancyMetric}

# Custom metrics defined in evals.deepeval_metrics.
custom_deepeval_metrics = {
    "Correctness": correctness_metric,
    "Comprehensiveness": comprehensiveness_metric,
    "Diversity": diversity_metric,
    "Empowerment": empowerment_metric,
    "Directness": directness_metric,
    "F1": f1_score_metric,
    "EM": em_score_metric,
}

# LLM-as-a-judge metrics evaluated through promptfoo, one per judge prompt.
promptfoo_metrics = {
    "promptfoo.correctness": PromptfooMetric(llm_judge_prompts["correctness"]),
    "promptfoo.comprehensiveness": PromptfooMetric(llm_judge_prompts["comprehensiveness"]),
    "promptfoo.diversity": PromptfooMetric(llm_judge_prompts["diversity"]),
    "promptfoo.empowerment": PromptfooMetric(llm_judge_prompts["empowerment"]),
    "promptfoo.directness": PromptfooMetric(llm_judge_prompts["directness"]),
}

# Single registry of every metric name that get_metric can resolve directly.
qa_metrics = native_deepeval_metrics | custom_deepeval_metrics | promptfoo_metrics


def get_metric(metric_name: str):
    """Resolve a metric by name; unknown names fall back to classes exported by deepeval.metrics."""
    if metric_name in qa_metrics:
        metric = qa_metrics[metric_name]
    else:
        try:
            metric_cls = getattr(deepeval.metrics, metric_name)
            metric = metric_cls()
        except AttributeError:
            raise Exception(f"Metric {metric_name} not supported")
    # Registry entries may be classes (e.g. AnswerRelevancyMetric); instantiate them before returning.
    if isinstance(metric, type):
        metric = metric()
    return metric
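
For illustration, a minimal usage sketch under the assumption that this module is importable as evals.qa_metrics_utils. The metric names below are taken from the registry above, except HallucinationMetric, which is assumed to be one of the metric classes exported by deepeval.metrics:

# Hypothetical usage sketch; not part of qa_metrics_utils.py.
from evals.qa_metrics_utils import get_metric

correctness = get_metric("Correctness")  # custom deepeval metric instance from the registry
relevancy = get_metric("AnswerRelevancy")  # registered as a class, instantiated by get_metric
judge = get_metric("promptfoo.comprehensiveness")  # PromptfooMetric wrapping an LLM judge prompt

# Names not in qa_metrics are looked up on deepeval.metrics and instantiated with defaults
# (assumes the class exists there and accepts a no-argument constructor).
hallucination = get_metric("HallucinationMetric")

# Unknown names raise: get_metric("DoesNotExist") -> Exception("Metric DoesNotExist not supported")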