cognee/evals/deepeval_metrics.py
from deepeval.metrics import BaseMetric, GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from evals.official_hotpot_metrics import exact_match_score, f1_score
from cognee.infrastructure.llm.prompts.llm_judge_prompts import llm_judge_prompts
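
# LLM-as-a-judge metrics: each GEval metric below scores a test case with the
# "gpt-4o-mini" judge model, guided by the matching prompt from llm_judge_prompts.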
correctness_metric = GEval(
    name="Correctness",
    model="gpt-4o-mini",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    evaluation_steps=[llm_judge_prompts["correctness"]],
)

comprehensiveness_metric = GEval(
    name="Comprehensiveness",
    model="gpt-4o-mini",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    evaluation_steps=[llm_judge_prompts["comprehensiveness"]],
)

diversity_metric = GEval(
    name="Diversity",
    model="gpt-4o-mini",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    evaluation_steps=[llm_judge_prompts["diversity"]],
)

empowerment_metric = GEval(
    name="Empowerment",
    model="gpt-4o-mini",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    evaluation_steps=[llm_judge_prompts["empowerment"]],
)

directness_metric = GEval(
    name="Directness",
    model="gpt-4o-mini",
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    evaluation_steps=[llm_judge_prompts["directness"]],
)
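

# Lexical metrics: the classes below wrap the official HotpotQA F1 and exact
# match scorers so they can be used as deepeval metrics alongside the
# LLM-judged metrics above.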
class f1_score_metric(BaseMetric):
    """F1 score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        f1, precision, recall = f1_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.score = f1
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async F1 score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot F1 score"
class em_score_metric(BaseMetric):
    """Exact Match score taken directly from the official hotpot benchmark
    implementation and wrapped into a deepeval metric."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.score = exact_match_score(
            prediction=test_case.actual_output,
            ground_truth=test_case.expected_output,
        )
        self.success = self.score >= self.threshold
        return self.score

    # Reusing regular measure as async EM score is not implemented
    async def a_measure(self, test_case: LLMTestCase):
        return self.measure(test_case)

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Official hotpot EM score"