diff --git a/cognee/eval_framework/answer_generation/answer_generation_executor.py b/cognee/eval_framework/answer_generation/answer_generation_executor.py
index 1b984d465..67eb02578 100644
--- a/cognee/eval_framework/answer_generation/answer_generation_executor.py
+++ b/cognee/eval_framework/answer_generation/answer_generation_executor.py
@@ -29,13 +29,16 @@ class AnswerGeneratorExecutor:
             retrieval_context = await retriever.get_context(query_text)
             search_results = await retriever.get_completion(query_text, retrieval_context)
 
-            answers.append(
-                {
-                    "question": query_text,
-                    "answer": search_results[0],
-                    "golden_answer": correct_answer,
-                    "retrieval_context": retrieval_context,
-                }
-            )
+            answer = {
+                "question": query_text,
+                "answer": search_results[0],
+                "golden_answer": correct_answer,
+                "retrieval_context": retrieval_context,
+            }
+
+            if "golden_context" in instance:
+                answer["golden_context"] = instance["golden_context"]
+
+            answers.append(answer)
 
         return answers
diff --git a/cognee/eval_framework/benchmark_adapters/dummy_adapter.py b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py
index 69cc6e518..9bf945d06 100644
--- a/cognee/eval_framework/benchmark_adapters/dummy_adapter.py
+++ b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py
@@ -5,18 +5,21 @@ from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import Base
 
 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
-        self, limit: Optional[int] = None, seed: int = 42
+        self, limit: Optional[int] = None, seed: int = 42, load_golden_context: bool = False
     ) -> tuple[list[str], list[dict[str, Any]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",
         ]
-        question_answer_pairs = [
-            {
-                "answer": "Yes",
-                "question": "Is Neo4j supported by cognee?",
-                "type": "dummy",
-            }
-        ]
+        qa_pair = {
+            "answer": "Yes",
+            "question": "Is Neo4j supported by cognee?",
+            "type": "dummy",
+        }
+
+        if load_golden_context:
+            qa_pair["golden_context"] = "Cognee supports Neo4j and NetworkX"
+
+        question_answer_pairs = [qa_pair]
 
         return corpus_list, question_answer_pairs
diff --git a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
index 2e4a7fd3d..1d2b31e41 100644
--- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
+++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
@@ -28,14 +28,22 @@ class CorpusBuilderExecutor:
         self.questions = None
         self.task_getter = task_getter
 
-    def load_corpus(self, limit: Optional[int] = None) -> Tuple[List[Dict], List[str]]:
-        self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
+    def load_corpus(
+        self, limit: Optional[int] = None, load_golden_context: bool = False
+    ) -> Tuple[List[Dict], List[str]]:
+        self.raw_corpus, self.questions = self.adapter.load_corpus(
+            limit=limit, load_golden_context=load_golden_context
+        )
         return self.raw_corpus, self.questions
 
     async def build_corpus(
-        self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
+        self,
+        limit: Optional[int] = None,
+        chunk_size=1024,
+        chunker=TextChunker,
+        load_golden_context: bool = False,
     ) -> List[str]:
-        self.load_corpus(limit=limit)
+        self.load_corpus(limit=limit, load_golden_context=load_golden_context)
         await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
         return self.questions
diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
index 2aff21249..6054688d2 100644
--- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py
+++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
@@ -47,7 +47,10 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
             task_getter=task_getter,
         )
         questions = await corpus_builder.build_corpus(
-            limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
+            limit=params.get("number_of_samples_in_corpus"),
+            chunk_size=chunk_size,
+            chunker=chunker,
+            load_golden_context=params.get("evaluating_contexts"),
         )
         with open(params["questions_path"], "w", encoding="utf-8") as f:
             json.dump(questions, f, ensure_ascii=False, indent=4)
diff --git a/cognee/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py
index 11f33571b..761d66e05 100644
--- a/cognee/eval_framework/evaluation/deep_eval_adapter.py
+++ b/cognee/eval_framework/evaluation/deep_eval_adapter.py
@@ -4,6 +4,7 @@ from cognee.eval_framework.eval_config import EvalConfig
 from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
 from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
 from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric
 from typing import Any, Dict, List
 from deepeval.metrics import ContextualRelevancyMetric
 
@@ -15,6 +16,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
             "EM": ExactMatchMetric(),
             "f1": F1ScoreMetric(),
             "contextual_relevancy": ContextualRelevancyMetric(),
+            "context_coverage": ContextCoverageMetric(),
         }
 
     async def evaluate_answers(
@@ -32,6 +34,7 @@ class DeepEvalAdapter(BaseEvalAdapter):
                 actual_output=answer["answer"],
                 expected_output=answer["golden_answer"],
                 retrieval_context=[answer["retrieval_context"]],
+                context=[answer["golden_context"]] if "golden_context" in answer else None,
             )
             metric_results = {}
             for metric in evaluator_metrics:
diff --git a/cognee/eval_framework/evaluation/evaluation_executor.py b/cognee/eval_framework/evaluation/evaluation_executor.py
index 5e56b50c7..1de01f101 100644
--- a/cognee/eval_framework/evaluation/evaluation_executor.py
+++ b/cognee/eval_framework/evaluation/evaluation_executor.py
@@ -23,5 +23,6 @@ class EvaluationExecutor:
     async def execute(self, answers: List[Dict[str, str]], evaluator_metrics: Any) -> Any:
         if self.evaluate_contexts:
             evaluator_metrics.append("contextual_relevancy")
+            evaluator_metrics.append("context_coverage")
         metrics = await self.eval_adapter.evaluate_answers(answers, evaluator_metrics)
         return metrics
diff --git a/cognee/eval_framework/evaluation/metrics/context_coverage.py b/cognee/eval_framework/evaluation/metrics/context_coverage.py
new file mode 100644
index 000000000..9fdd5e14e
--- /dev/null
+++ b/cognee/eval_framework/evaluation/metrics/context_coverage.py
@@ -0,0 +1,50 @@
+from deepeval.metrics import SummarizationMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics.summarization.schema import ScoreType
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.utils import get_or_create_event_loop
+
+
+class ContextCoverageMetric(SummarizationMetric):
+    def measure(
+        self,
+        test_case,
+        _show_indicator: bool = True,
+    ) -> float:
+        mapped_test_case = LLMTestCase(
+            input=test_case.context[0],
+            actual_output=test_case.retrieval_context[0],
+        )
+        self.assessment_questions = None
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            if self.async_mode:
+                loop = get_or_create_event_loop()
+                return loop.run_until_complete(
+                    self.a_measure(mapped_test_case, _show_indicator=False)
+                )
+            else:
+                self.coverage_verdicts = self._generate_coverage_verdicts(mapped_test_case)
+                self.alignment_verdicts = []
+                self.score = self._calculate_score(ScoreType.COVERAGE)
+                self.reason = self._generate_reason()
+                self.success = self.score >= self.threshold
+                return self.score
+
+    async def a_measure(
+        self,
+        test_case,
+        _show_indicator: bool = True,
+    ) -> float:
+        self.evaluation_cost = 0 if self.using_native_model else None
+        with metric_progress_indicator(
+            self,
+            async_mode=True,
+            _show_indicator=_show_indicator,
+        ):
+            self.coverage_verdicts = await self._a_generate_coverage_verdicts(test_case)
+            self.alignment_verdicts = []
+            self.score = self._calculate_score(ScoreType.COVERAGE)
+            self.reason = await self._a_generate_reason()
+            self.success = self.score >= self.threshold
+            return self.score
diff --git a/cognee/eval_framework/metrics_dashboard.py b/cognee/eval_framework/metrics_dashboard.py
index 2c917740a..eb4d2ed8e 100644
--- a/cognee/eval_framework/metrics_dashboard.py
+++ b/cognee/eval_framework/metrics_dashboard.py
@@ -3,6 +3,12 @@ import plotly.graph_objects as go
 from typing import Dict, List, Tuple
 from collections import defaultdict
 
+metrics_fields = {
+    "contextual_relevancy": ["question", "retrieval_context"],
+    "context_coverage": ["question", "retrieval_context", "golden_context"],
+}
+default_metrics_fields = ["question", "answer", "golden_answer"]
+
 
 def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
     """Create distribution histogram plots for each metric."""
@@ -59,38 +65,30 @@ def generate_details_html(metrics_data: List[Dict]) -> List[str]:
         for metric, values in entry["metrics"].items():
             if metric not in metric_details:
                 metric_details[metric] = []
+            current_metrics_fields = metrics_fields.get(metric, default_metrics_fields)
             metric_details[metric].append(
-                {
-                    "question": entry["question"],
-                    "answer": entry["answer"],
-                    "golden_answer": entry["golden_answer"],
+                {key: entry[key] for key in current_metrics_fields}
+                | {
                     "reason": values.get("reason", ""),
                     "score": values["score"],
                 }
             )
 
     for metric, details in metric_details.items():
+        formatted_column_names = [key.replace("_", " ").title() for key in details[0].keys()]
         details_html.append(f"""
         <div class="metric-details">
             <h2>{metric} Details</h2>
             <table>
                 <tr>
-                    <th>Question</th>
-                    <th>Answer</th>
-                    <th>Golden Answer</th>
-                    <th>Reason</th>
-                    <th>Score</th>
+                    {"".join(f"<th>{col}</th>" for col in formatted_column_names)}
                 </tr>
-                {"".join(
-                    f"<tr>"
-                    f"<td>{item['question']}</td>"
-                    f"<td>{item['answer']}</td>"
-                    f"<td>{item['golden_answer']}</td>"
-                    f"<td>{item['reason']}</td>"
-                    f"<td>{item['score']}</td>"
-                    f"</tr>"
-                    for item in details
-                )}
+                {"".join(
+                    "<tr>"
+                    + "".join(f"<td>{value}</td>" for value in item.values())
+                    + "</tr>"
+                    for item in details
+                )}
             </table>
         </div>
         """)
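
Below is a minimal, illustrative sketch (not part of the patch) of how the new `load_golden_context` flag and `ContextCoverageMetric` fit together end to end. It assumes `DummyAdapter` can be instantiated without arguments and that a deepeval LLM judge is configured (e.g. via an OpenAI API key); the field names mirror the diff above.

```python
from deepeval.test_case import LLMTestCase

from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
from cognee.eval_framework.evaluation.metrics.context_coverage import ContextCoverageMetric

# 1. Adapters can now attach a golden context to each QA pair.
corpus, qa_pairs = DummyAdapter().load_corpus(load_golden_context=True)
golden_context = qa_pairs[0]["golden_context"]  # "Cognee supports Neo4j and NetworkX"

# 2. DeepEvalAdapter forwards golden_context as LLMTestCase.context, while the retrieved
#    chunks go into retrieval_context; ContextCoverageMetric scores how well the retrieved
#    context covers the golden one (coverage verdicts only, alignment verdicts are skipped).
test_case = LLMTestCase(
    input=qa_pairs[0]["question"],
    actual_output="Yes",
    retrieval_context=["Neo4j is a graph database supported by cognee"],
    context=[golden_context],
)

metric = ContextCoverageMetric()   # threshold inherited from SummarizationMetric
print(metric.measure(test_case))   # triggers an LLM call through the configured judge model
```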