Feat: evaluate retrieved context against golden context [cog-1481] (#619)
<!-- .github/pull_request_template.md -->

## Description

- Compare retrieved context to golden context using deepeval's summarization metric
- Display the fields relevant to each metric on the metrics dashboard

Example output:

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced context handling in answer generation and corpus building to include extended details.
  - Introduced a new context coverage metric for deeper evaluation insights.
  - Upgraded the evaluation dashboard with dynamic presentation of metric details.
  - Added a new parameter to support loading golden context in corpus loading methods.
- **Bug Fixes**
  - Improved clarity in how answers are structured and appended in the answer generation process.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
ac0156514d
commit
7b5bd7897f
9 changed files with 115 additions and 41 deletions
|
|
@ -29,13 +29,16 @@ class AnswerGeneratorExecutor:
|
||||||
retrieval_context = await retriever.get_context(query_text)
|
retrieval_context = await retriever.get_context(query_text)
|
||||||
search_results = await retriever.get_completion(query_text, retrieval_context)
|
search_results = await retriever.get_completion(query_text, retrieval_context)
|
||||||
|
|
||||||
answers.append(
|
answer = {
|
||||||
{
|
"question": query_text,
|
||||||
"question": query_text,
|
"answer": search_results[0],
|
||||||
"answer": search_results[0],
|
"golden_answer": correct_answer,
|
||||||
"golden_answer": correct_answer,
|
"retrieval_context": retrieval_context,
|
||||||
"retrieval_context": retrieval_context,
|
}
|
||||||
}
|
|
||||||
)
|
if "golden_context" in instance:
|
||||||
|
answer["golden_context"] = instance["golden_context"]
|
||||||
|
|
||||||
|
answers.append(answer)
|
||||||
|
|
||||||
return answers
|
return answers
|
||||||
|
|
|
||||||
|
|
@ -5,18 +5,21 @@ from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import Base
|
||||||
|
|
||||||
class DummyAdapter(BaseBenchmarkAdapter):
|
class DummyAdapter(BaseBenchmarkAdapter):
|
||||||
def load_corpus(
|
def load_corpus(
|
||||||
self, limit: Optional[int] = None, seed: int = 42
|
self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
|
||||||
) -> tuple[list[str], list[dict[str, Any]]]:
|
) -> tuple[list[str], list[dict[str, Any]]]:
|
||||||
corpus_list = [
|
corpus_list = [
|
||||||
"The cognee is an AI memory engine that supports different vector and graph databases",
|
"The cognee is an AI memory engine that supports different vector and graph databases",
|
||||||
"Neo4j is a graph database supported by cognee",
|
"Neo4j is a graph database supported by cognee",
|
||||||
]
|
]
|
||||||
question_answer_pairs = [
|
qa_pair = {
|
||||||
{
|
"answer": "Yes",
|
||||||
"answer": "Yes",
|
"question": "Is Neo4j supported by cognee?",
|
||||||
"question": "Is Neo4j supported by cognee?",
|
"type": "dummy",
|
||||||
"type": "dummy",
|
}
|
||||||
}
|
|
||||||
]
|
if load_golden_context:
|
||||||
|
qa_pair["golden_context"] = "Cognee supports Neo4j and NetworkX"
|
||||||
|
|
||||||
|
question_answer_pairs = [qa_pair]
|
||||||
|
|
||||||
return corpus_list, question_answer_pairs
|
return corpus_list, question_answer_pairs
|
||||||
|
|
|
||||||
|
|
@ -28,14 +28,22 @@ class CorpusBuilderExecutor:
|
||||||
self.questions = None
|
self.questions = None
|
||||||
self.task_getter = task_getter
|
self.task_getter = task_getter
|
||||||
|
|
||||||
def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]:
|
def load_corpus(
|
||||||
self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
|
self, limit: Optional[int] = None, load_golden_context: bool = False
|
||||||
|
) -> Tuple[List[Dict], List[str]]:
|
||||||
|
self.raw_corpus, self.questions = self.adapter.load_corpus(
|
||||||
|
limit=limit, load_golden_context=load_golden_context
|
||||||
|
)
|
||||||
return self.raw_corpus, self.questions
|
return self.raw_corpus, self.questions
|
||||||
|
|
||||||
async def build_corpus(
|
async def build_corpus(
|
||||||
self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
|
self,
|
||||||
|
limit: Optional[int] = None,
|
||||||
|
chunk_size=1024,
|
||||||
|
chunker=TextChunker,
|
||||||
|
load_golden_context: bool = False,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
self.load_corpus(limit=limit)
|
self.load_corpus(limit=limit, load_golden_context=load_golden_context)
|
||||||
await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
|
await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
|
||||||
return self.questions
|
return self.questions
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,10 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
|
||||||
task_getter=task_getter,
|
task_getter=task_getter,
|
||||||
)
|
)
|
||||||
questions = await corpus_builder.build_corpus(
|
questions = await corpus_builder.build_corpus(
|
||||||
limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
|
limit=params.get("number_of_samples_in_corpus"),
|
||||||
|
chunk_size=chunk_size,
|
||||||
|
chunker=chunker,
|
||||||
|
load_golden_context=params.get("evaluating_contexts"),
|
||||||
)
|
)
|
||||||
with open(params["questions_path"], "w", encoding="utf-8") as f:
|
with open(params["questions_path"], "w", encoding="utf-8") as f:
|
||||||
json.dump(questions, f, ensure_ascii=False, indent=4)
|
json.dump(questions, f, ensure_ascii=False, indent=4)
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from cognee.eval_framework.eval_config import EvalConfig
|
||||||
from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
|
from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
|
||||||
from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
|
from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
|
||||||
from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
|
from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
|
||||||
|
from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
from deepeval.metrics import ContextualRelevancyMetric
|
from deepeval.metrics import ContextualRelevancyMetric
|
||||||
|
|
||||||
|
|
@ -15,6 +16,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
|
||||||
"EM": ExactMatchMetric(),
|
"EM": ExactMatchMetric(),
|
||||||
"f1": F1ScoreMetric(),
|
"f1": F1ScoreMetric(),
|
||||||
"contextual_relevancy": ContextualRelevancyMetric(),
|
"contextual_relevancy": ContextualRelevancyMetric(),
|
||||||
|
"context_coverage": ContextCoverageMetric(),
|
||||||
}
|
}
|
||||||
|
|
||||||
async def evaluate_answers(
|
async def evaluate_answers(
|
||||||
|
|
@ -32,6 +34,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
|
||||||
actual_output=answer["answer"],
|
actual_output=answer["answer"],
|
||||||
expected_output=answer["golden_answer"],
|
expected_output=answer["golden_answer"],
|
||||||
retrieval_context=[answer["retrieval_context"]],
|
retrieval_context=[answer["retrieval_context"]],
|
||||||
|
context=[answer["golden_context"]] if "golden_context" in answer else None,
|
||||||
)
|
)
|
||||||
metric_results = {}
|
metric_results = {}
|
||||||
for metric in evaluator_metrics:
|
for metric in evaluator_metrics:
|
||||||
|
|
|
||||||
|
|
@ -23,5 +23,6 @@ class EvaluationExecutor:
|
||||||
async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
|
async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
|
||||||
if self.evaluate_contexts:
|
if self.evaluate_contexts:
|
||||||
evaluator_metrics.append("contextual_relevancy")
|
evaluator_metrics.append("contextual_relevancy")
|
||||||
|
evaluator_metrics.append("context_coverage")
|
||||||
metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
|
metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
|
||||||
return metrics
|
return metrics
|
||||||
|
|
|
||||||
50
cognee/eval_framework/evaluation/metrics/context_coverage.py
Normal file
50
cognee/eval_framework/evaluation/metrics/context_coverage.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
from deepeval.metrics import SummarizationMetric
|
||||||
|
from deepeval.test_case import LLMTestCase
|
||||||
|
from deepeval.metrics.summarization.schema import ScoreType
|
||||||
|
from deepeval.metrics.indicator import metric_progress_indicator
|
||||||
|
from deepeval.utils import get_or_create_event_loop
|
||||||
|
|
||||||
|
|
||||||
|
class ContextCoverageMetric(SummarizationMetric):
    """Score how well the retrieved context covers the golden context.

    The metric reuses the coverage half of deepeval's ``SummarizationMetric``:
    the golden context is passed as the test case ``input`` and the retrieved
    context as its ``actual_output``. Alignment verdicts are always left
    empty, and the score is computed with ``ScoreType.COVERAGE`` only.
    """

    @staticmethod
    def _first_entry(entries, field_name: str):
        """Return ``entries[0]``, or raise a descriptive error when it is missing.

        Args:
            entries: The list-valued test-case field (may be ``None`` or empty).
            field_name: Name of the field, used only in the error message.

        Raises:
            ValueError: If ``entries`` is ``None`` or empty.
        """
        if not entries:
            raise ValueError(
                f"ContextCoverageMetric requires a non-empty `{field_name}` on the "
                "test case; was the golden context loaded for this benchmark?"
            )
        return entries[0]

    def measure(
        self,
        test_case,
        _show_indicator: bool = True,
    ) -> float:
        """Compute the coverage score for ``test_case``.

        Args:
            test_case: Test case whose ``context[0]`` holds the golden context
                and whose ``retrieval_context[0]`` holds the retrieved context.
            _show_indicator: Whether to display deepeval's progress indicator.

        Returns:
            The coverage score.

        Raises:
            ValueError: If ``context`` or ``retrieval_context`` is missing or
                empty (for example when no golden context was supplied).
        """
        # Validate up front: a test case built without a golden context carries
        # ``context=None``, which would otherwise fail with an opaque
        # "'NoneType' object is not subscriptable" error.
        golden_context = self._first_entry(test_case.context, "context")
        retrieved_context = self._first_entry(test_case.retrieval_context, "retrieval_context")

        # Re-map the fields so the inherited coverage logic compares the
        # retrieved context against the golden context.
        mapped_test_case = LLMTestCase(
            input=golden_context,
            actual_output=retrieved_context,
        )

        # NOTE(review): presumably reset so that questions from a previous test
        # case are not reused - confirm against SummarizationMetric.
        self.assessment_questions = None
        self.evaluation_cost = 0 if self.using_native_model else None

        with metric_progress_indicator(self, _show_indicator=_show_indicator):
            if self.async_mode:
                loop = get_or_create_event_loop()
                # a_measure receives the already-mapped test case.
                return loop.run_until_complete(
                    self.a_measure(mapped_test_case, _show_indicator=False)
                )

            self.coverage_verdicts = self._generate_coverage_verdicts(mapped_test_case)
            self.alignment_verdicts = []
            self.score = self._calculate_score(ScoreType.COVERAGE)
            self.reason = self._generate_reason()
            self.success = self.score >= self.threshold
            return self.score

    async def a_measure(
        self,
        test_case,
        _show_indicator: bool = True,
    ) -> float:
        """Asynchronously compute the coverage score.

        Unlike ``measure``, this method performs no field re-mapping and does
        not reset ``assessment_questions``: ``test_case`` is handed to the
        coverage-verdict generator as-is. ``measure`` calls it with a test case
        whose ``input`` is the golden context and whose ``actual_output`` is
        the retrieved context; direct callers should pass the same shape.

        Args:
            test_case: Already-mapped test case (see above).
            _show_indicator: Whether to display deepeval's progress indicator.

        Returns:
            The coverage score.
        """
        self.evaluation_cost = 0 if self.using_native_model else None
        with metric_progress_indicator(
            self,
            async_mode=True,
            _show_indicator=_show_indicator,
        ):
            self.coverage_verdicts = await self._a_generate_coverage_verdicts(test_case)
            self.alignment_verdicts = []
            self.score = self._calculate_score(ScoreType.COVERAGE)
            self.reason = await self._a_generate_reason()
            self.success = self.score >= self.threshold
            return self.score
|
||||||
|
|
@ -3,6 +3,12 @@ import plotly.graph_objects as go
|
||||||
from typing import Dict, List, Tuple
|
from typing import Dict, List, Tuple
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
metrics_fields = {
|
||||||
|
"contextual_relevancy": ["question", "retrieval_context"],
|
||||||
|
"context_coverage": ["question", "retrieval_context", "golden_context"],
|
||||||
|
}
|
||||||
|
default_metrics_fields = ["question", "answer", "golden_answer"]
|
||||||
|
|
||||||
|
|
||||||
def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
|
def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
|
||||||
"""Create distribution histogram plots for each metric."""
|
"""Create distribution histogram plots for each metric."""
|
||||||
|
|
@ -59,38 +65,30 @@ def generate_details_html(metrics_data: List[Dict]) -> List[str]:
|
||||||
for metric, values in entry["metrics"].items():
|
for metric, values in entry["metrics"].items():
|
||||||
if metric not in metric_details:
|
if metric not in metric_details:
|
||||||
metric_details[metric] = []
|
metric_details[metric] = []
|
||||||
|
current_metrics_fields = metrics_fields.get(metric, default_metrics_fields)
|
||||||
metric_details[metric].append(
|
metric_details[metric].append(
|
||||||
{
|
{key: entry[key] for key in current_metrics_fields}
|
||||||
"question": entry["question"],
|
| {
|
||||||
"answer": entry["answer"],
|
|
||||||
"golden_answer": entry["golden_answer"],
|
|
||||||
"reason": values.get("reason", ""),
|
"reason": values.get("reason", ""),
|
||||||
"score": values["score"],
|
"score": values["score"],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
for metric, details in metric_details.items():
|
for metric, details in metric_details.items():
|
||||||
|
formatted_column_names = [key.replace("_", " ").title() for key in details[0].keys()]
|
||||||
details_html.append(f"<h3>{metric} Details</h3>")
|
details_html.append(f"<h3>{metric} Details</h3>")
|
||||||
details_html.append("""
|
details_html.append(f"""
|
||||||
<table class="metric-table">
|
<table class="metric-table">
|
||||||
<tr>
|
<tr>
|
||||||
<th>Question</th>
|
{"".join(f"<th>{col}</th>" for col in formatted_column_names)}
|
||||||
<th>Answer</th>
|
|
||||||
<th>Golden Answer</th>
|
|
||||||
<th>Reason</th>
|
|
||||||
<th>Score</th>
|
|
||||||
</tr>
|
</tr>
|
||||||
""")
|
""")
|
||||||
for item in details:
|
for item in details:
|
||||||
details_html.append(
|
details_html.append(f"""
|
||||||
f"<tr>"
|
<tr>
|
||||||
f"<td>{item['question']}</td>"
|
{"".join(f"<td>{value}</td>" for value in item.values())}
|
||||||
f"<td>{item['answer']}</td>"
|
</tr>
|
||||||
f"<td>{item['golden_answer']}</td>"
|
""")
|
||||||
f"<td>{item['reason']}</td>"
|
|
||||||
f"<td>{item['score']}</td>"
|
|
||||||
f"</tr>"
|
|
||||||
)
|
|
||||||
details_html.append("</table>")
|
details_html.append("</table>")
|
||||||
return details_html
|
return details_html
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,12 @@ import sys
|
||||||
|
|
||||||
with patch.dict(
|
with patch.dict(
|
||||||
sys.modules,
|
sys.modules,
|
||||||
{"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
|
{
|
||||||
|
"deepeval": MagicMock(),
|
||||||
|
"deepeval.metrics": MagicMock(),
|
||||||
|
"deepeval.test_case": MagicMock(),
|
||||||
|
"cognee.eval_framework.evaluation.metrics.context_coverage": MagicMock(),
|
||||||
|
},
|
||||||
):
|
):
|
||||||
from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
|
from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue