cognee/evals/eval_framework/evaluation/deep_eval_adapter.py
hajdul88 6a0c0e3ef8
feat: Cognee evaluation framework development (#498)

This PR adds the evaluation framework for cognee.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Expanded evaluation framework now integrates asynchronous corpus building, question answering, and performance evaluation with configurable benchmarks and metrics (correctness, exact match, and F1 score).

- **Infrastructure**
  - Added database integration for persistent storage of questions, answers, and metrics.
  - Launched an interactive metrics dashboard featuring advanced visualizations.
  - Introduced an automated testing workflow for continuous quality assurance.

- **Documentation**
  - Updated guidelines for generating concise, clear answers.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
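
The metric names above map onto small metric objects that the adapter below drives through a common shape: `measure(test_case)` populates `score` and `reason`. As a rough illustration of that shape only (not the actual `ExactMatchMetric` or `F1ScoreMetric` shipped in this PR, whose implementations live in separate modules), a naive exact-match metric could look like this:

```python
from deepeval.test_case import LLMTestCase


class NaiveExactMatchMetric:
    """Hypothetical example of the measure()/score/reason interface the adapter uses."""

    def __init__(self):
        self.score = None
        self.reason = None

    def measure(self, test_case: LLMTestCase) -> float:
        # Compare the produced answer against the golden answer, ignoring case and whitespace.
        matched = test_case.actual_output.strip().lower() == test_case.expected_output.strip().lower()
        self.score = 1.0 if matched else 0.0
        self.reason = "Exact match" if matched else "Answer differs from the golden answer"
        return self.score
```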

from typing import Any, Dict, List

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class DeepEvalAdapter(BaseEvalAdapter):
    def __init__(self):
        # Map metric names to metric instances; each exposes measure(), score, and reason.
        self.g_eval_metrics = {
            "correctness": self.g_eval_correctness(),
            "EM": ExactMatchMetric(),
            "f1": F1ScoreMetric(),
        }

    async def evaluate_answers(
        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
    ) -> List[Dict[str, Any]]:
        # evaluator_metrics lists the metric names to compute; reject unsupported names up front.
        for metric in evaluator_metrics:
            if metric not in self.g_eval_metrics:
                raise ValueError(f"Unsupported metric: {metric}")

        results = []
        for answer in answers:
            # Wrap each answer in a deepeval test case comparing it to its golden answer.
            test_case = LLMTestCase(
                input=answer["question"],
                actual_output=answer["answer"],
                expected_output=answer["golden_answer"],
            )
            metric_results = {}
            for metric in evaluator_metrics:
                metric_to_calculate = self.g_eval_metrics[metric]
                metric_to_calculate.measure(test_case)
                metric_results[metric] = {
                    "score": metric_to_calculate.score,
                    "reason": metric_to_calculate.reason,
                }
            results.append({**answer, "metrics": metric_results})
        return results

    def g_eval_correctness(self):
        # LLM-judged correctness metric; the judge model is taken from EvalConfig.
        return GEval(
            name="Correctness",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            model=EvalConfig().to_dict()["deepeval_model"],
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
                "You should also heavily penalize omission of detail",
                "Vague language, or contradicting OPINIONS, are OK",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )
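
For reference, a minimal usage sketch of the adapter above; the record keys and metric names follow what `evaluate_answers` expects, while the question/answer data is invented for illustration. The `correctness` metric is left out here because it would call the judge model configured in `EvalConfig`.

```python
import asyncio

from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter

# Example records use the keys read by evaluate_answers; the content is made up.
answers = [
    {
        "question": "What is the capital of France?",
        "answer": "Paris",
        "golden_answer": "Paris",
    }
]

adapter = DeepEvalAdapter()
results = asyncio.run(adapter.evaluate_answers(answers, evaluator_metrics=["EM", "f1"]))
for result in results:
    # Each result keeps the original record and adds a "metrics" dict,
    # e.g. {"EM": {"score": 1.0, "reason": ...}, "f1": {...}}
    print(result["metrics"])
```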