Feature/cog 1312 integrating evaluation framework into dreamify (#562)


## Description
This PR contains the eval framework changes needed for the autooptimizer integration: the evaluation framework moves from `evals/` into the `cognee` package, search and answer generation accept a configurable system prompt, and corpus building exposes chunking settings. A usage sketch of the relocated entry points follows the release notes below.

## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Enhanced answer generation now returns structured answer details.
  - Search functionality accepts configurable prompt inputs.
  - Option to generate a metrics dashboard from evaluations.
  - Corpus building tasks now support adjustable chunk settings for greater flexibility.
  - New task retrieval functionality allows for flexible task configuration.
  - Introduced new methods for creating and managing metrics dashboards.

- **Refactor/Chore**
  - Streamlined API signatures and reorganized module interfaces for better consistency.
  - Updated import paths to reflect the new module structure.

- **Tests**
  - Updated test scenarios to align with new configurations and parameter adjustments.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
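
To make the feature summary above concrete, here is a minimal usage sketch of the relocated `cognee.eval_framework` entry points, assembled from the signatures and parameter keys visible in the diffs below. The parameter values, file paths, and benchmark name are illustrative assumptions; in the repository the full configuration comes from `EvalConfig` rather than a hand-built dict.

```python
import asyncio

from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from cognee.eval_framework.answer_generation.run_question_answering_module import (
    run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard

# Illustrative parameter set; key names follow the hunks in this PR,
# concrete values (paths, benchmark, qa_engine) are placeholders.
params = {
    "building_corpus_from_scratch": True,
    "number_of_samples_in_corpus": 5,
    "questions_path": "questions.json",
    "answering_questions": True,
    "qa_engine": "cognee_graph_completion",
    "answers_path": "answers.json",
    "evaluating_answers": True,
    "calculate_metrics": True,
    "metrics_path": "metrics.json",
    "aggregate_metrics_path": "aggregate_metrics.json",
    "dashboard": True,
    "dashboard_path": "dashboard.html",
    "benchmark": "Dummy",  # placeholder benchmark name
}


async def main():
    # Build the corpus with the newly configurable chunk settings.
    await run_corpus_builder(params, chunk_size=1024)
    # Answer the generated questions with a configurable system prompt.
    await run_question_answering(params, system_prompt="answer_simple_question.txt")
    # Evaluate answers; run_evaluation now returns the metrics list.
    await run_evaluation(params)
    # Dashboard generation is now the caller's responsibility (see run_eval below).
    if params.get("dashboard"):
        create_dashboard(
            metrics_path=params["metrics_path"],
            aggregate_metrics_path=params["aggregate_metrics_path"],
            output_file=params["dashboard_path"],
            benchmark=params["benchmark"],
        )


if __name__ == "__main__":
    asyncio.run(main())
```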
Committed by hajdul88 on 2025-03-03 19:55:47 +01:00 (committed via GitHub).
Commit e3f3d49a3b, parent 933c7c86c2.
46 changed files with 322 additions and 107 deletions.

View file

@@ -14,7 +14,7 @@ jobs:
   run_eval_framework_test:
     uses: ./.github/workflows/reusable_python_example.yml
     with:
-      example-location: ./evals/eval_framework/run_eval.py
+      example-location: ./cognee/eval_framework/run_eval.py
     secrets:
       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

View file

@@ -112,8 +112,8 @@ def generate_dataset_name(dataset_name: str) -> str:
     return dataset_name.replace(".", "_").replace(" ", "_")


-async def get_default_tasks(
-    user: User = None, graph_model: BaseModel = KnowledgeGraph
+async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -126,7 +126,8 @@ async def get_default_tasks(
         Task(
             extract_chunks_from_documents,
             max_chunk_tokens=get_max_chunk_tokens(),
-            chunker=TextChunker,
+            chunker=chunker,
+            chunk_size=chunk_size,
         ),  # Extract text chunks based on the document type.
         Task(
             extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
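
As a quick illustration of the new parameters, a hypothetical caller could now tune chunking when building the default pipeline. The chunk size of 512 below is an arbitrary example, and the documents are assumed to have been added beforehand via `cognee.add`.

```python
import asyncio

import cognee
from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
from cognee.modules.chunking.TextChunker import TextChunker


async def cognify_with_custom_chunking():
    # chunk_size=512 is an arbitrary example; the default remains 1024.
    tasks = await get_default_tasks(chunk_size=512, chunker=TextChunker)
    await cognee.cognify(tasks=tasks)


asyncio.run(cognify_with_custom_chunking())
```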

View file

@@ -12,6 +12,7 @@ async def search(
     query_type: SearchType = SearchType.GRAPH_COMPLETION,
     user: User = None,
     datasets: Union[list[str], str, None] = None,
+    system_prompt_path: str = "answer_simple_question.txt",
 ) -> list:
     # We use lists from now on for datasets
     if isinstance(datasets, str):
@@ -23,6 +24,8 @@ async def search(
     if user is None:
         raise UserNotFoundError

-    filtered_search_results = await search_function(query_text, query_type, datasets, user)
+    filtered_search_results = await search_function(
+        query_text, query_type, datasets, user, system_prompt_path=system_prompt_path
+    )

     return filtered_search_results
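
A minimal sketch of the new keyword from the caller's side; the query text is made up, and the prompt file named here is simply the default shown in the diff.

```python
import asyncio

import cognee
from cognee.api.v1.search import SearchType


async def ask():
    # system_prompt_path selects the prompt file used for completion-style searches;
    # "answer_simple_question.txt" is the default from the diff above.
    results = await cognee.search(
        query_text="Which graph databases does cognee support?",
        query_type=SearchType.GRAPH_COMPLETION,
        system_prompt_path="answer_simple_question.txt",
    )
    print(results)


asyncio.run(ask())
```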

View file

@@ -3,20 +3,19 @@ from typing import List, Dict, Callable, Awaitable
 from cognee.api.v1.search import SearchType

 question_answering_engine_options: Dict[str, Callable[[str], Awaitable[List[str]]]] = {
-    "cognee_graph_completion": lambda query: cognee.search(
-        query_type=SearchType.GRAPH_COMPLETION, query_text=query
+    "cognee_graph_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.GRAPH_COMPLETION,
+        query_text=query,
+        system_prompt_path=system_prompt_path,
     ),
-    "cognee_completion": lambda query: cognee.search(
-        query_type=SearchType.COMPLETION, query_text=query
+    "cognee_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.COMPLETION, query_text=query, system_prompt_path=system_prompt_path
     ),
-    "cognee_summaries": lambda query: cognee.search(
-        query_type=SearchType.SUMMARIES, query_text=query
-    ),
-    "cognee_insights": lambda query: cognee.search(
-        query_type=SearchType.INSIGHTS, query_text=query
+    "graph_summary_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
+        query_text=query,
+        system_prompt_path=system_prompt_path,
     ),
-    "cognee_chunks": lambda query: cognee.search(query_type=SearchType.CHUNKS, query_text=query),
-    "cognee_code": lambda query: cognee.search(query_type=SearchType.CODE, query_text=query),
 }
@@ -25,13 +24,14 @@ class AnswerGeneratorExecutor:
         self,
         questions: List[Dict[str, str]],
         answer_resolver: Callable[[str], Awaitable[List[str]]],
+        system_prompt: str = "answer_simple_question.txt",
     ) -> List[Dict[str, str]]:
         answers = []
         for instance in questions:
             query_text = instance["question"]
             correct_answer = instance["answer"]

-            search_results = await answer_resolver(query_text)
+            search_results = await answer_resolver(query_text, system_prompt)

             answers.append(
                 {
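
A small sketch of how the updated resolver signature is consumed: each entry in `question_answering_engine_options` now receives both the query and a system prompt path, which `AnswerGeneratorExecutor` forwards. It assumes a corpus has already been cognified and LLM credentials are configured; the question/answer pair is made up.

```python
import asyncio

from cognee.eval_framework.answer_generation.answer_generation_executor import (
    AnswerGeneratorExecutor,
    question_answering_engine_options,
)

# Illustrative input; the executor only reads the "question" and "answer" keys.
questions = [{"question": "What is cognee?", "answer": "An AI memory engine"}]


async def generate():
    return await AnswerGeneratorExecutor().question_answering_non_parallel(
        questions=questions,
        answer_resolver=question_answering_engine_options["cognee_completion"],
        system_prompt="answer_simple_question.txt",
    )


answers = asyncio.run(generate())
```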

View file

@@ -1,6 +1,7 @@
 import logging
 import json
-from evals.eval_framework.answer_generation.answer_generation_executor import (
+from typing import List
+from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
     question_answering_engine_options,
 )
@@ -30,7 +31,9 @@ async def create_and_insert_answers_table(questions_payload):
         await session.commit()


-async def run_question_answering(params: dict) -> None:
+async def run_question_answering(
+    params: dict, system_prompt="answer_simple_question.txt"
+) -> List[dict]:
     if params.get("answering_questions"):
         logging.info("Question answering started...")
         try:
@@ -46,9 +49,17 @@ async def run_question_answering(params: dict) -> None:
             answers = await answer_generator.question_answering_non_parallel(
                 questions=questions,
                 answer_resolver=question_answering_engine_options[params["qa_engine"]],
+                system_prompt=system_prompt,
             )
             with open(params["answers_path"], "w", encoding="utf-8") as f:
                 json.dump(answers, f, ensure_ascii=False, indent=4)

            await create_and_insert_answers_table(answers)
        logging.info("Question answering End...")
+        return answers
+    else:
+        logging.info(
+            "The question answering module was not executed as answering_questions is not enabled"
+        )
+        return []

View file

@@ -1,10 +1,10 @@
 from enum import Enum
 from typing import Type
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter


 class BenchmarkAdapter(Enum):

View file

@@ -1,12 +1,12 @@
-from typing import Optional
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from typing import Optional, Any
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, str]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",

View file

@@ -3,7 +3,7 @@ import os
 import json
 import random
 from typing import Optional, Any, List, Tuple
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class HotpotQAAdapter(BaseBenchmarkAdapter):

View file

@@ -6,7 +6,7 @@ import zipfile
 import gdown

-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter


 class MusiqueQAAdapter(BaseBenchmarkAdapter):

View file

@@ -3,7 +3,7 @@ import os
 import json
 import random
 from typing import Optional, Any, List, Tuple
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter


 class TwoWikiMultihopAdapter(HotpotQAAdapter):

View file

@@ -2,8 +2,9 @@ import cognee
 import logging
 from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable

-from evals.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
-from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
+from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.pipelines.tasks.Task import Task
 from cognee.shared.utils import setup_logging
@@ -31,12 +32,14 @@ class CorpusBuilderExecutor:
         self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
         return self.raw_corpus, self.questions

-    async def build_corpus(self, limit: Optional[int] = None) -> List[str]:
+    async def build_corpus(
+        self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
+    ) -> List[str]:
         self.load_corpus(limit=limit)
-        await self.run_cognee()
+        await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
         return self.questions

-    async def run_cognee(self) -> None:
+    async def run_cognee(self, chunk_size=1024, chunker=TextChunker) -> None:
         setup_logging(logging.ERROR)

         await cognee.prune.prune_data()
@@ -44,5 +47,5 @@ class CorpusBuilderExecutor:

         await cognee.add(self.raw_corpus)

-        tasks = await self.task_getter()
+        tasks = await self.task_getter(chunk_size=chunk_size, chunker=TextChunker)
         await cognee.cognify(tasks=tasks)

View file

@@ -1,14 +1,19 @@
 import logging
 import json
+from typing import List
+
+from unstructured.chunking.dispatch import chunk
+
 from cognee.infrastructure.files.storage import LocalStorage
-from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
+from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.modules.data.models.questions_base import QuestionsBase
 from cognee.modules.data.models.questions_data import Questions
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
     get_relational_config,
 )
-from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.modules.chunking.TextChunker import TextChunker
+from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters


 async def create_and_insert_questions_table(questions_payload):
@@ -28,7 +33,7 @@ async def create_and_insert_questions_table(questions_payload):
         await session.commit()


-async def run_corpus_builder(params: dict) -> None:
+async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker) -> List[dict]:
     if params.get("building_corpus_from_scratch"):
         logging.info("Corpus Builder started...")
@@ -42,7 +47,7 @@ async def run_corpus_builder(params: dict) -> None:
             task_getter=task_getter,
         )
         questions = await corpus_builder.build_corpus(
-            limit=params.get("number_of_samples_in_corpus")
+            limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
         )
         with open(params["questions_path"], "w", encoding="utf-8") as f:
             json.dump(questions, f, ensure_ascii=False, indent=4)
@@ -50,3 +55,5 @@ async def run_corpus_builder(params: dict) -> None:
         await create_and_insert_questions_table(questions_payload=questions)

     logging.info("Corpus Builder End...")
+
+    return questions
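
In short, `run_corpus_builder` now forwards the chunk settings and returns the generated question set. A partial sketch, assuming the remaining configuration (benchmark adapter, task getter, and so on) is supplied by `EvalConfig` as in `run_eval`:

```python
import asyncio

from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder

# Only the keys visible in the hunk above are shown; a real params dict
# carries the rest of the EvalConfig-derived settings as well.
params = {
    "building_corpus_from_scratch": True,
    "number_of_samples_in_corpus": 5,
    "questions_path": "questions.json",
}

# Returns the question/answer pairs it built and persisted.
questions = asyncio.run(run_corpus_builder(params, chunk_size=512))
```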

View file

@@ -2,7 +2,7 @@ from enum import Enum
 from typing import Callable, Awaitable, List
 from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
 from cognee.modules.pipelines.tasks.Task import Task
-from evals.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
+from cognee.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
     get_cascade_graph_tasks,
 )

View file

@@ -0,0 +1,14 @@
from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
from typing import List
from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
from cognee.modules.pipelines.tasks.Task import Task
from cognee.infrastructure.llm import get_max_chunk_tokens
from cognee.modules.chunking.TextChunker import TextChunker
class DefaultTaskGetter(BaseTaskGetter):
"""Default task getter that retrieves tasks using the standard get_default_tasks function."""
async def get_tasks(self, chunk_size=1024, chunker=TextChunker) -> List[Task]:
"""Retrieve default tasks asynchronously."""
return await get_default_tasks(chunk_size=chunk_size, chunker=chunker)
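
For reference, the new getter in use, passing the tasks it returns straight to cognify. The module path in the import is an assumption (this hunk does not show the new file's location), and the chunk size is an arbitrary example.

```python
import asyncio

import cognee
# Assumed module path for the new class; the hunk above does not show the file name.
from cognee.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter


async def cognify_with_default_getter():
    # Retrieve the standard task list with a custom chunk size, then run cognify.
    tasks = await DefaultTaskGetter().get_tasks(chunk_size=512)
    await cognee.cognify(tasks=tasks)


asyncio.run(cognify_with_default_getter())
```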

View file

@@ -1,9 +1,9 @@
 from deepeval.metrics import GEval
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
-from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
-from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from typing import Any, Dict, List

View file

@@ -1,9 +1,9 @@
 from typing import Any, Dict, List
 from pydantic import BaseModel
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
 from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig


 class CorrectnessEvaluation(BaseModel):

View file

@@ -1,5 +1,5 @@
 from typing import List, Dict, Any, Union
-from evals.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter
+from cognee.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter


 class EvaluationExecutor:

View file

@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import Type
-from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
-from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
+from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from cognee.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter


 class EvaluatorAdapter(Enum):

View file

@@ -1,8 +1,9 @@
 import logging
 import json
-from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
-from evals.eval_framework.analysis.dashboard_generator import create_dashboard
+from typing import List
+from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
+from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
+from cognee.eval_framework.analysis.dashboard_generator import create_dashboard
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
@@ -50,13 +51,14 @@ async def execute_evaluation(params: dict) -> None:
     await create_and_insert_metrics_table(metrics)

     logging.info("Evaluation completed")
+    return metrics


-async def run_evaluation(params: dict) -> None:
+async def run_evaluation(params: dict) -> List[dict]:
     """Run each step of the evaluation pipeline based on configuration flags."""

     # Step 1: Evaluate answers if requested
     if params.get("evaluating_answers"):
-        await execute_evaluation(params)
+        metrics = await execute_evaluation(params)
     else:
         logging.info("Skipping evaluation as evaluating_answers is False")
@@ -67,18 +69,7 @@ async def run_evaluation(params: dict) -> None:
             json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
         )
         logging.info("Metrics calculation completed")
+        return metrics
     else:
         logging.info("Skipping metrics calculation as calculate_metrics is False")
+        return []
-
-    # Step 3: Generate dashboard if requested
-    if params.get("dashboard"):
-        logging.info("Generating dashboard...")
-        create_dashboard(
-            metrics_path=params["metrics_path"],
-            aggregate_metrics_path=params["aggregate_metrics_path"],
-            output_file=params["dashboard_path"],
-            benchmark=params["benchmark"],
-        )
-        logging.info(f"Dashboard generated at {params['dashboard_path']}")
-    else:
-        logging.info("Skipping dashboard generation as dashboard is False")

View file

@@ -0,0 +1,172 @@
import json
import plotly.graph_objects as go
from typing import Dict, List, Tuple
from collections import defaultdict
def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
"""Create distribution histogram plots for each metric."""
figures = []
for metric, scores in metrics_data.items():
fig = go.Figure()
fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
fig.update_layout(
title=f"{metric} Score Distribution",
xaxis_title="Score",
yaxis_title="Count",
bargap=0.1,
template="seaborn",
)
figures.append(fig.to_html(full_html=False))
return figures
def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
"""Create confidence interval bar plot."""
fig = go.Figure()
for metric, (mean_score, lower, upper) in ci_results.items():
fig.add_trace(
go.Bar(
x=[metric],
y=[mean_score],
error_y=dict(
type="data",
array=[upper - mean_score],
arrayminus=[mean_score - lower],
visible=True,
),
name=metric,
)
)
fig.update_layout(
title="95% confidence interval for all the metrics",
xaxis_title="Metric",
yaxis_title="Score",
template="seaborn",
)
return fig.to_html(full_html=False)
def generate_details_html(metrics_data: List[Dict]) -> List[str]:
"""Generate HTML for detailed metric information."""
details_html = []
metric_details = {}
# Organize metrics by type
for entry in metrics_data:
for metric, values in entry["metrics"].items():
if metric not in metric_details:
metric_details[metric] = []
metric_details[metric].append(
{
"question": entry["question"],
"answer": entry["answer"],
"golden_answer": entry["golden_answer"],
"reason": values.get("reason", ""),
"score": values["score"],
}
)
for metric, details in metric_details.items():
details_html.append(f"<h3>{metric} Details</h3>")
details_html.append("""
<table class="metric-table">
<tr>
<th>Question</th>
<th>Answer</th>
<th>Golden Answer</th>
<th>Reason</th>
<th>Score</th>
</tr>
""")
for item in details:
details_html.append(
f"<tr>"
f"<td>{item['question']}</td>"
f"<td>{item['answer']}</td>"
f"<td>{item['golden_answer']}</td>"
f"<td>{item['reason']}</td>"
f"<td>{item['score']}</td>"
f"</tr>"
)
details_html.append("</table>")
return details_html
def get_dashboard_html_template(
figures: List[str], details_html: List[str], benchmark: str = ""
) -> str:
"""Generate the complete HTML dashboard template."""
return f"""
<!DOCTYPE html>
<html>
<head>
<title>LLM Evaluation Dashboard {benchmark}</title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
body {{ font-family: Arial, sans-serif; margin: 20px; }}
.chart {{ border: 1px solid #ddd; padding: 20px; margin-bottom: 30px; }}
.metric-table {{ border-collapse: collapse; width: 100%; margin-bottom: 30px; }}
.metric-table th, .metric-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
.metric-table th {{ background-color: #f2f2f2; }}
h2 {{ color: #333; border-bottom: 2px solid #eee; padding-bottom: 10px; }}
</style>
</head>
<body>
<h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>
<h2>Metrics Distribution</h2>
{"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}
<h2>95% confidence interval for all the metrics</h2>
<div class="chart">{figures[-1]}</div>
<h2>Detailed Explanations</h2>
{"".join(details_html)}
</body>
</html>
"""
def create_dashboard(
metrics_path: str,
aggregate_metrics_path: str,
output_file: str = "dashboard_with_ci.html",
benchmark: str = "",
) -> str:
"""Create and save the dashboard with all visualizations."""
# Read metrics files
with open(metrics_path, "r") as f:
metrics_data = json.load(f)
with open(aggregate_metrics_path, "r") as f:
aggregate_data = json.load(f)
# Extract data for visualizations
metrics_by_type = defaultdict(list)
for entry in metrics_data:
for metric, values in entry["metrics"].items():
metrics_by_type[metric].append(values["score"])
# Generate visualizations
distribution_figures = create_distribution_plots(metrics_by_type)
ci_plot = create_ci_plot(
{
metric: (data["mean"], data["ci_lower"], data["ci_upper"])
for metric, data in aggregate_data.items()
}
)
# Combine all figures
figures = distribution_figures + [ci_plot]
# Generate HTML components
details_html = generate_details_html(metrics_data)
dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)
# Write to file
with open(output_file, "w", encoding="utf-8") as f:
f.write(dashboard_html)
return output_file
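
To show the expected inputs, here is a hedged sketch of calling `create_dashboard` directly with hand-written files whose shapes mirror what the module reads above: per-question entries with a nested `metrics` dict, and aggregate stats with `mean`, `ci_lower`, and `ci_upper`. The import path assumes this new file is the `cognee.eval_framework.metrics_dashboard` module imported by `run_eval` further down; metric names and values are made up.

```python
import json

from cognee.eval_framework.metrics_dashboard import create_dashboard

# Per-question entries, as consumed by generate_details_html and the histograms.
metrics = [
    {
        "question": "Which graph database does cognee support?",
        "answer": "Neo4j",
        "golden_answer": "Neo4j",
        "metrics": {
            "correctness": {"score": 1.0, "reason": "Exact match with the golden answer."},
        },
    }
]
# Aggregate stats, as consumed by create_ci_plot.
aggregate = {"correctness": {"mean": 1.0, "ci_lower": 1.0, "ci_upper": 1.0}}

with open("metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f)
with open("aggregate_metrics.json", "w", encoding="utf-8") as f:
    json.dump(aggregate, f)

create_dashboard(
    metrics_path="metrics.json",
    aggregate_metrics_path="aggregate_metrics.json",
    output_file="dashboard_with_ci.html",
    benchmark="Dummy",  # placeholder benchmark label for the page title
)
```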

View file

@@ -4,12 +4,12 @@ import json
 import asyncio
 import datetime
 import logging
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
-from evals.eval_framework.answer_generation.run_question_answering_module import (
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from cognee.eval_framework.answer_generation.run_question_answering_module import (
     run_question_answering,
 )
-from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation

 logger = logging.getLogger(__name__)

View file

@@ -1,13 +1,14 @@
 import logging
 import asyncio
 from cognee.shared.utils import setup_logging
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
-from evals.eval_framework.answer_generation.run_question_answering_module import (
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from cognee.eval_framework.answer_generation.run_question_answering_module import (
     run_question_answering,
 )
-from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.metrics_dashboard import create_dashboard

 # Configure logging
 setup_logging(logging.INFO)
@@ -31,6 +32,15 @@ async def main():
     # Metrics calculation + dashboard
     await run_evaluation(eval_params)

+    if eval_params.get("dashboard"):
+        logging.info("Generating dashboard...")
+        create_dashboard(
+            metrics_path=eval_params["metrics_path"],
+            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+

 if __name__ == "__main__":
     loop = asyncio.new_event_loop()

View file

@@ -14,8 +14,3 @@ class BaseRetriever(ABC):
     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
         """Generates a response using the query and optional context."""
         pass
-
-    @classmethod
-    def as_search(cls) -> Callable:
-        """Creates a search function from the retriever class."""
-        return lambda query: cls().get_completion(query)

View file

@@ -25,11 +25,14 @@ async def search(
     query_type: SearchType,
     datasets: list[str],
     user: User,
+    system_prompt_path="answer_simple_question.txt",
 ):
     query = await log_query(query_text, query_type.value, user.id)

     own_document_ids = await get_document_ids_for_user(user.id, datasets)
-    search_results = await specific_search(query_type, query_text, user)
+    search_results = await specific_search(
+        query_type, query_text, user, system_prompt_path=system_prompt_path
+    )

     filtered_search_results = []
@@ -45,15 +48,23 @@ async def search(
     return filtered_search_results


-async def specific_search(query_type: SearchType, query: str, user: User) -> list:
+async def specific_search(
+    query_type: SearchType, query: str, user: User, system_prompt_path="answer_simple_question.txt"
+) -> list:
     search_tasks: dict[SearchType, Callable] = {
-        SearchType.SUMMARIES: SummariesRetriever.as_search(),
-        SearchType.INSIGHTS: InsightsRetriever.as_search(),
-        SearchType.CHUNKS: ChunksRetriever.as_search(),
-        SearchType.COMPLETION: CompletionRetriever.as_search(),
-        SearchType.GRAPH_COMPLETION: GraphCompletionRetriever.as_search(),
-        SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever.as_search(),
-        SearchType.CODE: CodeRetriever.as_search(),
+        SearchType.SUMMARIES: SummariesRetriever().get_completion,
+        SearchType.INSIGHTS: InsightsRetriever().get_completion,
+        SearchType.CHUNKS: ChunksRetriever().get_completion,
+        SearchType.COMPLETION: CompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.GRAPH_COMPLETION: GraphCompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.CODE: CodeRetriever().get_completion,
     }

     search_task = search_tasks.get(query_type)

View file

@@ -1,8 +1,8 @@
 import pytest
-from evals.eval_framework.answer_generation.answer_generation_executor import (
+from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
 )
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
 from unittest.mock import AsyncMock
@@ -12,16 +12,13 @@ async def test_answer_generation():
     corpus_list, qa_pairs = DummyAdapter().load_corpus(limit=limit)

     mock_answer_resolver = AsyncMock()
-    mock_answer_resolver.side_effect = lambda query: ["mock_answer"]
+    mock_answer_resolver.side_effect = lambda query, system_prompt: ["mock_answer"]

     answer_generator = AnswerGeneratorExecutor()
     answers = await answer_generator.question_answering_non_parallel(
-        questions=qa_pairs,
-        answer_resolver=mock_answer_resolver,
+        questions=qa_pairs, answer_resolver=mock_answer_resolver, system_prompt="test.txt"
     )

-    mock_answer_resolver.assert_called_once_with(qa_pairs[0]["question"])
-
     assert len(answers) == len(qa_pairs)
     assert answers[0]["question"] == qa_pairs[0]["question"], (
         "AnswerGeneratorExecutor is passing the question incorrectly"

View file

@@ -1,8 +1,8 @@
 import pytest
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
 from unittest.mock import patch, mock_open

View file

@@ -1,5 +1,5 @@
 import pytest
-from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
+from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.infrastructure.databases.graph import get_graph_engine
 from unittest.mock import AsyncMock, patch

View file

@@ -3,7 +3,7 @@ import json
 import os

-from evals.eval_framework.analysis.dashboard_generator import (
+from cognee.eval_framework.analysis.dashboard_generator import (
     create_distribution_plots,
     create_ci_plot,
     generate_details_html,

View file

@@ -1,13 +1,13 @@
 import pytest
 from unittest.mock import patch, MagicMock
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig
 import sys

 with patch.dict(
     sys.modules,
     {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
 ):
-    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+    from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter


 @pytest.fixture
@pytest.fixture @pytest.fixture

View file

@@ -4,15 +4,15 @@ import sys
 from unittest.mock import patch, MagicMock
 import unittest
 import numpy as np
-from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
+from cognee.eval_framework.analysis.metrics_calculator import bootstrap_ci

 with patch.dict(
     sys.modules,
     {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
 ):
-    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
-    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+    from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+    from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric


 class MockTestCase: