cognee/cognee/eval_framework/evaluation/metrics/context_coverage.py
alekszievr 7b5bd7897f
Feat: evaluate retrieved context against golden context [cog-1481] (#619)

## Description
- Compare the retrieved context to the golden context using deepeval's summarization metric
- Display the fields relevant to each metric on the metrics dashboard

Example output:

![image](https://github.com/user-attachments/assets/9facf716-b2ab-4573-bfdf-7b343d2a57c5)


## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Enhanced context handling in answer generation and corpus building to include extended details.
  - Introduced a new context coverage metric for deeper evaluation insights.
  - Upgraded the evaluation dashboard with dynamic presentation of metric details.
  - Added a new parameter to support loading golden context in corpus loading methods.

- **Bug Fixes**
  - Improved clarity in how answers are structured and appended in the answer generation process.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2025-03-10 15:27:48 +01:00


from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics.summarization.schema import ScoreType
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.utils import get_or_create_event_loop


class ContextCoverageMetric(SummarizationMetric):
    """Scores how well the retrieved context covers the golden context.

    Reuses deepeval's SummarizationMetric but keeps only its coverage
    component: the golden context is treated as the source text and the
    retrieved context as the "summary" to be checked against it.
    """

    def measure(
        self,
        test_case,
        _show_indicator: bool = True,
    ) -> float:
        # Map the incoming test case onto the fields SummarizationMetric
        # expects: golden context becomes the input, retrieved context
        # becomes the actual output.
        mapped_test_case = LLMTestCase(
            input=test_case.context[0],
            actual_output=test_case.retrieval_context[0],
        )
        self.assessment_questions = None
        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(self, _show_indicator=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                return loop.run_until_complete(
                    self.a_measure(mapped_test_case, _show_indicator=False)
                )
            else:
                # Only coverage verdicts are generated; alignment verdicts are
                # skipped so the score reflects coverage alone.
                self.coverage_verdicts = self._generate_coverage_verdicts(mapped_test_case)
                self.alignment_verdicts = []
                self.score = self._calculate_score(ScoreType.COVERAGE)
                self.reason = self._generate_reason()
                self.success = self.score >= self.threshold
                return self.score

    async def a_measure(
        self,
        test_case,
        _show_indicator: bool = True,
    ) -> float:
        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
        ):
            # The async path likewise computes coverage only and leaves the
            # alignment verdicts empty.
            self.coverage_verdicts = await self._a_generate_coverage_verdicts(test_case)
            self.alignment_verdicts = []
            self.score = self._calculate_score(ScoreType.COVERAGE)
            self.reason = await self._a_generate_reason()
            self.success = self.score >= self.threshold
            return self.score
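
For reference, a minimal sketch of how this metric might be invoked. It assumes deepeval's default LLM judge is configured (e.g. via `OPENAI_API_KEY`); the question, contexts, and threshold below are illustrative placeholders, not taken from the PR or the eval framework.

```python
from deepeval.test_case import LLMTestCase

# Illustrative data only: in the eval framework, `context` would hold the
# golden context loaded from the corpus and `retrieval_context` the context
# returned by the retriever.
test_case = LLMTestCase(
    input="When was Paris founded?",
    actual_output="Paris grew from a settlement founded around 250 BC.",
    context=["Paris originated as a settlement of the Parisii tribe around 250 BC."],
    retrieval_context=["Paris began as a Gallic settlement established around 250 BC."],
)

metric = ContextCoverageMetric(threshold=0.5)
# measure() maps golden context -> input and retrieved context -> actual_output
# before delegating to the SummarizationMetric coverage logic.
score = metric.measure(test_case)
print(score, metric.reason)
```

Because only `ScoreType.COVERAGE` is used and `alignment_verdicts` stays empty, the resulting score reflects how much of the golden context is covered by the retrieved context rather than a full summarization score.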