fix: deepeval retry (#918)
## Description

- Implemented retries when deepeval's evaluation fails
- Updated metric aggregation to ignore `None` scores from failed evaluations

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.
Parent: 82c0279f45
Commit: 3da893c131

2 changed files with 46 additions and 17 deletions
```diff
@@ -38,17 +38,19 @@ def extract_metrics_and_details(
     for entry in data:
         for metric, values in entry["metrics"].items():
             score = values["score"]
-            metrics_data[metric].append(score)
-            if "reason" in values:
-                metric_details[metric].append(
-                    {
-                        "question": entry["question"],
-                        "answer": entry["answer"],
-                        "golden_answer": entry["golden_answer"],
-                        "reason": values["reason"],
-                        "score": score,
-                    }
-                )
+            # Skip None scores from failed evaluations
+            if score is not None:
+                metrics_data[metric].append(score)
+                if "reason" in values:
+                    metric_details[metric].append(
+                        {
+                            "question": entry["question"],
+                            "answer": entry["answer"],
+                            "golden_answer": entry["golden_answer"],
+                            "reason": values["reason"],
+                            "score": score,
+                        }
+                    )
 
     return metrics_data, metric_details
```
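The aggregation now filters out entries whose score is `None` (which the retry change below can produce) before any summary statistics are computed. For illustration, here is a self-contained sketch of the updated function; the signature and accumulator initialization are truncated in the hunk above, so the `defaultdict` accumulators and the sample `data` are assumptions, not code from this PR:

```python
from collections import defaultdict

def extract_metrics_and_details(data):
    """Aggregate per-metric scores, skipping None scores from failed evaluations."""
    metrics_data = defaultdict(list)   # assumed initialization; truncated in the diff
    metric_details = defaultdict(list)
    for entry in data:
        for metric, values in entry["metrics"].items():
            score = values["score"]
            if score is not None:  # failed evaluations yield None; drop them
                metrics_data[metric].append(score)
                if "reason" in values:
                    metric_details[metric].append(
                        {
                            "question": entry["question"],
                            "answer": entry["answer"],
                            "golden_answer": entry["golden_answer"],
                            "reason": values["reason"],
                            "score": score,
                        }
                    )
    return metrics_data, metric_details

# Hypothetical evaluator output: one successful and one failed evaluation.
data = [
    {
        "question": "Q1", "answer": "A1", "golden_answer": "G1",
        "metrics": {"correctness": {"score": 0.9, "reason": "matches golden answer"}},
    },
    {
        "question": "Q2", "answer": "A2", "golden_answer": "G2",
        "metrics": {"correctness": {"score": None, "reason": None}},  # all retries failed
    },
]
metrics_data, _ = extract_metrics_and_details(data)
print(dict(metrics_data))  # {'correctness': [0.9]}; the failed entry is excluded
```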
```diff
@@ -7,10 +7,15 @@ from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
+import time
+from cognee.shared.logging_utils import get_logger
+
+logger = get_logger()
 
 
 class DeepEvalAdapter(BaseEvalAdapter):
     def __init__(self):
+        self.n_retries = 5
         self.g_eval_metrics = {
             "correctness": self.g_eval_correctness(),
             "EM": ExactMatchMetric(),
```
```diff
@@ -19,6 +24,33 @@ class DeepEvalAdapter(BaseEvalAdapter):
             "context_coverage": ContextCoverageMetric(),
         }
 
+    def _calculate_metric(self, metric: str, test_case: LLMTestCase) -> Dict[str, Any]:
+        """Calculate a single metric for a test case with retry logic."""
+        metric_to_calculate = self.g_eval_metrics[metric]
+
+        for attempt in range(self.n_retries):
+            try:
+                metric_to_calculate.measure(test_case)
+                return {
+                    "score": metric_to_calculate.score,
+                    "reason": metric_to_calculate.reason,
+                }
+            except Exception as e:
+                logger.warning(
+                    f"Attempt {attempt + 1}/{self.n_retries} failed for metric '{metric}': {e}"
+                )
+                if attempt < self.n_retries - 1:
+                    time.sleep(2**attempt)  # Exponential backoff
+                else:
+                    logger.error(
+                        f"All {self.n_retries} attempts failed for metric '{metric}'. Returning None values."
+                    )
+
+        return {
+            "score": None,
+            "reason": None,
+        }
+
     async def evaluate_answers(
         self, answers: List[Dict[str, Any]], evaluator_metrics: List[str]
     ) -> List[Dict[str, Any]]:
```
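With `n_retries = 5`, the backoff delay between attempts is `2**attempt` seconds: 1, 2, 4, then 8 (no sleep after the final failure). Below is a minimal standalone sketch of the same retry pattern, using a hypothetical `FlakyMetric` stub in place of a real deepeval metric; everything here is illustrative, not code from this PR:

```python
import logging
import time

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

N_RETRIES = 5  # mirrors self.n_retries in the adapter

class FlakyMetric:
    """Hypothetical stand-in for a deepeval metric: fails twice, then succeeds."""

    def __init__(self):
        self.calls = 0
        self.score = None
        self.reason = None

    def measure(self, test_case):
        self.calls += 1
        if self.calls < 3:
            raise RuntimeError("transient LLM error")
        self.score, self.reason = 0.8, "answer covers the golden answer"

def calculate_with_retries(metric_obj, test_case):
    """Same retry pattern as DeepEvalAdapter._calculate_metric above."""
    for attempt in range(N_RETRIES):
        try:
            metric_obj.measure(test_case)
            return {"score": metric_obj.score, "reason": metric_obj.reason}
        except Exception as e:
            logger.warning(f"Attempt {attempt + 1}/{N_RETRIES} failed: {e}")
            if attempt < N_RETRIES - 1:
                time.sleep(2**attempt)  # 1s, 2s, 4s, 8s between attempts
    return {"score": None, "reason": None}

print(calculate_with_retries(FlakyMetric(), test_case=None))
# After two failures (sleeping 1s, then 2s) the third attempt succeeds:
# {'score': 0.8, 'reason': 'answer covers the golden answer'}
```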
```diff
@@ -40,12 +72,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
             )
             metric_results = {}
             for metric in evaluator_metrics:
-                metric_to_calculate = self.g_eval_metrics[metric]
-                metric_to_calculate.measure(test_case)
-                metric_results[metric] = {
-                    "score": metric_to_calculate.score,
-                    "reason": metric_to_calculate.reason,
-                }
+                metric_results[metric] = self._calculate_metric(metric, test_case)
             results.append({**answer, "metrics": metric_results})
 
         return results
```
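Taken together, the two changes mean a metric that exhausts all retries contributes `{"score": None, "reason": None}` to the per-answer results instead of raising, and the aggregation upstream then drops that entry, so summary statistics cover successful evaluations only. A hypothetical illustration of the net effect on an average:

```python
# Hypothetical "correctness" scores across three answers; one evaluation failed.
scores = [0.5, None, 1.0]
kept = [s for s in scores if s is not None]  # aggregation ignores None scores
print(sum(kept) / len(kept))  # 0.75, averaged over the two successful evaluations
```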