<!-- .github/pull_request_template.md -->

This PR adds the evaluation framework development for cognee.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Expanded evaluation framework now integrates asynchronous corpus building, question answering, and performance evaluation with adaptive benchmarks for improved metrics (correctness, exact match, and F1 score).
- **Infrastructure**
  - Added database integration for persistent storage of questions, answers, and metrics.
  - Launched an interactive metrics dashboard featuring advanced visualizations.
  - Introduced an automated testing workflow for continuous quality assurance.
- **Documentation**
  - Updated guidelines for generating concise, clear answers.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
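The release notes list exact match and F1 score among the evaluation metrics. The PR's own `ExactMatchMetric` and `F1ScoreMetric` implementations are not visible in this view, so the snippet below is only a minimal, self-contained sketch of the standard QA definitions such metrics usually follow (normalized string equality, and token-overlap F1); the function names and the normalization rule are illustrative assumptions, not cognee code.

```python
# Illustrative sketch only: the usual exact-match and token-level F1 formulas
# for QA evaluation. NOT the repository's ExactMatchMetric / F1ScoreMetric.
from collections import Counter


def normalize(text: str) -> str:
    # Assumed normalization: lowercase and collapse whitespace.
    return " ".join(text.lower().split())


def exact_match(prediction: str, golden: str) -> float:
    return float(normalize(prediction) == normalize(golden))


def f1_score(prediction: str, golden: str) -> float:
    pred_tokens = normalize(prediction).split()
    gold_tokens = normalize(golden).split()
    if not pred_tokens or not gold_tokens:
        return float(pred_tokens == gold_tokens)
    common = Counter(pred_tokens) & Counter(gold_tokens)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
```

Exact match is all-or-nothing, while F1 gives partial credit for overlapping tokens, which is why the two are typically reported side by side.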
The PR includes the following evaluation adapter (Python, 60 lines, 2.5 KiB):
```python
from typing import Any, Dict, List

from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

from evals.eval_framework.eval_config import EvalConfig
from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


class DeepEvalAdapter(BaseEvalAdapter):
    def __init__(self):
        # Registry of supported metrics, keyed by the names callers pass in.
        self.g_eval_metrics = {
            "correctness": self.g_eval_correctness(),
            "EM": ExactMatchMetric(),
            "f1": F1ScoreMetric(),
        }

    async def evaluate_answers(
        self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
    ) -> List[Dict[str, Any]]:
        # evaluator_metrics lists the metrics to evaluate dynamically;
        # reject unknown metric names before doing any work.
        for metric in evaluator_metrics:
            if metric not in self.g_eval_metrics:
                raise ValueError(f"Unsupported metric: {metric}")

        results = []
        for answer in answers:
            test_case = LLMTestCase(
                input=answer["question"],
                actual_output=answer["answer"],
                expected_output=answer["golden_answer"],
            )
            metric_results = {}
            for metric in evaluator_metrics:
                metric_to_calculate = self.g_eval_metrics[metric]
                metric_to_calculate.measure(test_case)
                metric_results[metric] = {
                    "score": metric_to_calculate.score,
                    "reason": metric_to_calculate.reason,
                }
            results.append({**answer, "metrics": metric_results})

        return results

    def g_eval_correctness(self):
        return GEval(
            name="Correctness",
            criteria="Determine whether the actual output is factually correct based on the expected output.",
            model=EvalConfig().to_dict()["deepeval_model"],
            evaluation_steps=[
                "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
                "You should also heavily penalize omission of detail",
                "Vague language, or contradicting OPINIONS, are OK",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
            ],
        )
```
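For reviewers who want to try the adapter locally, here is a hedged usage sketch based only on the code above: `evaluate_answers` is async, expects each answer dict to carry `question`, `answer`, and `golden_answer` keys, and accepts metric names that must be keys of `g_eval_metrics` (`"correctness"`, `"EM"`, `"f1"`). The module path in the import and the sample data are assumptions for illustration, not taken from the PR.

```python
import asyncio

# Module path assumed from the package layout implied by the imports above.
from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter

# Illustrative input: each dict must provide "question", "answer", and
# "golden_answer", matching the keys read in evaluate_answers().
answers = [
    {
        "question": "Which metrics does the evaluation framework report?",
        "answer": "Correctness, exact match, and F1 score.",
        "golden_answer": "Correctness, exact match, and F1 score.",
    }
]


async def main():
    adapter = DeepEvalAdapter()
    # Metric names must be keys of adapter.g_eval_metrics.
    results = await adapter.evaluate_answers(answers, evaluator_metrics=["EM", "f1"])
    for result in results:
        print(result["metrics"])


if __name__ == "__main__":
    asyncio.run(main())
```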