cognee/cognee/eval_framework/eval_config.py
lxobr cad9e0ce44
Feat: cog 1491 pipeline steps in eval (#641)

## Description
- Created `get_default_tasks_by_indices` to filter the default tasks down to specific indices
- Added `get_no_summary_tasks` to skip summarization tasks
- Added `get_just_chunks_tasks` for chunk extraction and data points only
- Added `NO_SUMMARIES` and `JUST_CHUNKS` to the `TaskGetters` enum (sketched below)
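
A minimal sketch of what these helpers might look like is below. The task names, indices, and enum wiring are illustrative assumptions; the real functions operate on cognee's default pipeline tasks, which are defined in the eval framework's task getter module rather than in this file.

```python
from enum import Enum
from typing import Awaitable, Callable, List

# Hypothetical stand-in for cognee's default pipeline task list; the real
# order and names live in the repo, so the indices below are illustrative.
DEFAULT_TASKS = [
    "classify_documents",  # 0
    "extract_chunks",      # 1
    "extract_graph",       # 2
    "summarize_text",      # 3  <- dropped by the no-summary mode
    "add_data_points",     # 4
]


async def get_default_tasks() -> List[str]:
    """Full default pipeline."""
    return list(DEFAULT_TASKS)


async def get_default_tasks_by_indices(indices: List[int]) -> List[str]:
    """Keep only the default tasks at the given positions."""
    return [DEFAULT_TASKS[i] for i in indices]


async def get_no_summary_tasks() -> List[str]:
    """Default pipeline without the summarization step."""
    return await get_default_tasks_by_indices([0, 1, 2, 4])


async def get_just_chunks_tasks() -> List[str]:
    """Chunk extraction and data points only."""
    return await get_default_tasks_by_indices([0, 1, 4])


class TaskGetters(Enum):
    """Maps a task_getter_type string to an async task getter."""

    DEFAULT = ("Default", get_default_tasks)
    NO_SUMMARIES = ("NoSummaries", get_no_summary_tasks)
    JUST_CHUNKS = ("JustChunks", get_just_chunks_tasks)

    def __init__(self, label: str, getter: Callable[[], Awaitable[List[str]]]):
        self.label = label
        self.getter = getter
```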
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin


## Summary by CodeRabbit

- **New Features**
  - The evaluation configuration now exposes additional task-getter options: modes that skip summarization or run only chunk extraction, giving a more tailored evaluation run.
  - Task getters are resolved asynchronously during task selection, adding flexibility to how pipeline tasks are picked.
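
Since `EvalConfig` (the file below) is a pydantic `BaseSettings` with `env_file=".env"`, the new modes can be selected either in code or through the environment. A short sketch; the import path is assumed from the file path shown above.

```python
# Selecting one of the new task-getter modes via the eval config.
from cognee.eval_framework.eval_config import EvalConfig

# In code: pass the option directly (see the Options comment in the file).
config = EvalConfig(task_getter_type="NoSummaries")  # or "JustChunks"

# Or via the environment / .env file; pydantic-settings matches field
# names case-insensitively, e.g.:
#   TASK_GETTER_TYPE=JustChunks
print(config.task_getter_type)
```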

2025-03-14 14:20:39 +01:00

76 lines
2.9 KiB
Python

from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import List


class EvalConfig(BaseSettings):
    # Corpus builder params
    building_corpus_from_scratch: bool = True
    number_of_samples_in_corpus: int = 1
    benchmark: str = "Dummy"  # Options: 'HotPotQA', 'Dummy', 'TwoWikiMultiHop'
    task_getter_type: str = (
        "Default"  # Options: 'Default', 'CascadeGraph', 'NoSummaries', 'JustChunks'
    )

    # Question answering params
    answering_questions: bool = True
    qa_engine: str = (
        "cognee_completion"  # Options: 'cognee_completion' or 'cognee_graph_completion'
    )

    # Evaluation params
    evaluating_answers: bool = True
    evaluating_contexts: bool = True
    evaluation_engine: str = "DeepEval"  # Options: 'DeepEval' (uses deepeval_model), 'DirectLLM' (uses default llm from .env)
    evaluation_metrics: List[str] = [
        "correctness",
        "EM",
        "f1",
    ]  # Use only 'correctness' for DirectLLM
    deepeval_model: str = "gpt-4o-mini"

    # Metrics params
    calculate_metrics: bool = True

    # Visualization
    dashboard: bool = True

    # File paths
    questions_path: str = "questions_output.json"
    answers_path: str = "answers_output.json"
    metrics_path: str = "metrics_output.json"
    aggregate_metrics_path: str = "aggregate_metrics.json"
    dashboard_path: str = "dashboard.html"
    direct_llm_system_prompt: str = "direct_llm_eval_system.txt"
    direct_llm_eval_prompt: str = "direct_llm_eval_prompt.txt"

    model_config = SettingsConfigDict(env_file=".env", extra="allow")

    def to_dict(self) -> dict:
        return {
            "building_corpus_from_scratch": self.building_corpus_from_scratch,
            "number_of_samples_in_corpus": self.number_of_samples_in_corpus,
            "benchmark": self.benchmark,
            "answering_questions": self.answering_questions,
            "qa_engine": self.qa_engine,
            "evaluating_answers": self.evaluating_answers,
            "evaluating_contexts": self.evaluating_contexts,  # Controls whether context evaluation should be performed
            "evaluation_engine": self.evaluation_engine,
            "evaluation_metrics": self.evaluation_metrics,
            "calculate_metrics": self.calculate_metrics,
            "dashboard": self.dashboard,
            "questions_path": self.questions_path,
            "answers_path": self.answers_path,
            "metrics_path": self.metrics_path,
            "aggregate_metrics_path": self.aggregate_metrics_path,
            "dashboard_path": self.dashboard_path,
            "deepeval_model": self.deepeval_model,
            "task_getter_type": self.task_getter_type,
            "direct_llm_system_prompt": self.direct_llm_system_prompt,
            "direct_llm_eval_prompt": self.direct_llm_eval_prompt,
        }


@lru_cache
def get_llm_config():
    return EvalConfig()
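
Because `get_llm_config()` is wrapped in `lru_cache`, repeated calls return the same cached `EvalConfig` instance, so every part of an eval run sees one consistent configuration. A short usage sketch, assuming the module path follows the file path above:

```python
from cognee.eval_framework.eval_config import get_llm_config

config = get_llm_config()           # built once from field defaults plus .env
assert get_llm_config() is config   # lru_cache hands back the cached instance

params = config.to_dict()           # plain dict, handy for logging or dumping
print(params["benchmark"], params["task_getter_type"])
```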