From e3f3d49a3bca4710fc7f09b9e8e980fbc9cf2af6 Mon Sep 17 00:00:00 2001
From: hajdul88 <52442977+hajdul88@users.noreply.github.com>
Date: Mon, 3 Mar 2025 19:55:47 +0100
Subject: [PATCH] Feature/cog 1312 integrating evaluation framework into
 dreamify (#562)

## Description

This PR contains the eval framework changes required for the auto-optimizer integration: the framework moves from the top-level `evals/` directory into the `cognee` package, and its entry points now accept configurable system prompts and chunking parameters.

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin

## Summary by CodeRabbit

- **New Features**
  - Enhanced answer generation now returns structured answer details.
  - Search functionality accepts configurable prompt inputs.
  - Option to generate a metrics dashboard from evaluations.
  - Corpus building tasks now support adjustable chunk settings for greater flexibility.
  - New task retrieval functionality allows for flexible task configuration.
  - Introduced new methods for creating and managing metrics dashboards.
- **Refactor/Chore**
  - Streamlined API signatures and reorganized module interfaces for better consistency.
  - Updated import paths to reflect new module structure.
- **Tests**
  - Updated test scenarios to align with new configurations and parameter adjustments.

---
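For orientation, the relocated modules compose into a four-step pipeline: corpus building, question answering, evaluation, and an optional dashboard. A minimal sketch follows, assuming `EvalConfig().to_dict()` supplies the parameter dict each step reads (only the imports and the dashboard step appear verbatim in `run_eval.py` below; the rest mirrors the module signatures in this diff):

```python
# Minimal end-to-end sketch of the relocated pipeline, mirroring run_eval.py.
# Assumption: EvalConfig exposes a to_dict() yielding the params each step reads.
import asyncio

from cognee.eval_framework.eval_config import EvalConfig
from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
from cognee.eval_framework.answer_generation.run_question_answering_module import (
    run_question_answering,
)
from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
from cognee.eval_framework.metrics_dashboard import create_dashboard


async def main():
    eval_params = EvalConfig().to_dict()  # assumed helper; see eval_config.py

    await run_corpus_builder(eval_params)      # 1. ingest the benchmark corpus
    await run_question_answering(eval_params)  # 2. generate answers
    await run_evaluation(eval_params)          # 3. score the answers

    if eval_params.get("dashboard"):           # 4. optional HTML dashboard
        create_dashboard(
            metrics_path=eval_params["metrics_path"],
            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
            output_file=eval_params["dashboard_path"],
            benchmark=eval_params["benchmark"],
        )


if __name__ == "__main__":
    asyncio.run(main())
```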
 .github/workflows/test_eval_framework.yml    |   2 +-
 cognee/api/v1/cognify/cognify_v2.py          |   7 +-
 cognee/api/v1/search/search_v2.py            |   5 +-
 {evals => cognee}/eval_framework/__init__.py |   0
 .../eval_framework/analysis/__init__.py      |   0
 .../analysis/dashboard_generator.py          |   0
 .../analysis/metrics_calculator.py           |   0
 .../answer_generation/__init__.py            |   0
 .../answer_generation_executor.py            |  24 +--
 .../run_question_answering_module.py         |  15 +-
 .../benchmark_adapters/__init__.py           |   0
 .../base_benchmark_adapter.py                |   0
 .../benchmark_adapters/benchmark_adapters.py |   8 +-
 .../benchmark_adapters/dummy_adapter.py      |   6 +-
 .../benchmark_adapters/hotpot_qa_adapter.py  |   2 +-
 .../benchmark_adapters/musique_adapter.py    |   2 +-
 .../twowikimultihop_adapter.py               |   2 +-
 .../eval_framework/corpus_builder/__init__.py |  0
 .../corpus_builder/corpus_builder_executor.py | 15 +-
 .../corpus_builder/run_corpus_builder.py     |  13 +-
 .../task_getters/TaskGetters.py              |   2 +-
 .../corpus_builder/task_getters/__init__.py  |   0
 .../task_getters/default_task_getter.py      |  14 ++
 .../task_getters/get_cascade_graph_tasks.py  |   0
 .../eval_framework/eval_config.py            |   0
 .../eval_framework/evaluation/__init__.py    |   0
 .../evaluation/base_eval_adapter.py          |   0
 .../evaluation/deep_eval_adapter.py          |   8 +-
 .../evaluation/direct_llm_eval_adapter.py    |   4 +-
 .../evaluation/evaluation_executor.py        |   2 +-
 .../evaluation/evaluator_adapters.py         |   4 +-
 .../evaluation/metrics/__init__.py           |   0
 .../evaluation/metrics/exact_match.py        |   0
 .../eval_framework/evaluation/metrics/f1.py  |   0
 .../evaluation/run_evaluation_module.py      |  28 +--
 cognee/eval_framework/metrics_dashboard.py   | 172 ++++++++++++++++++
 .../eval_framework/modal_run_eval.py         |   8 +-
 {evals => cognee}/eval_framework/run_eval.py |  18 +-
 cognee/modules/retrieval/base_retriever.py   |   5 -
 cognee/modules/search/methods/search.py      |  29 ++-
 .../eval_framework/answer_generation_test.py |  11 +-
 .../eval_framework/benchmark_adapters_test.py |  8 +-
 .../eval_framework/corpus_builder_test.py    |   2 +-
 .../unit/eval_framework/dashboard_test.py    |   2 +-
 .../eval_framework/deepeval_adapter_test.py  |   4 +-
 .../tests/unit/eval_framework/metrics_test.py |  6 +-
 46 files changed, 321 insertions(+), 107 deletions(-)
 rename {evals => cognee}/eval_framework/__init__.py (100%)
 rename {evals => cognee}/eval_framework/analysis/__init__.py (100%)
 rename {evals => cognee}/eval_framework/analysis/dashboard_generator.py (100%)
 rename {evals => cognee}/eval_framework/analysis/metrics_calculator.py (100%)
 rename {evals => cognee}/eval_framework/answer_generation/__init__.py (100%)
 rename {evals => cognee}/eval_framework/answer_generation/answer_generation_executor.py (54%)
 rename {evals => cognee}/eval_framework/answer_generation/run_question_answering_module.py (83%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/__init__.py (100%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/base_benchmark_adapter.py (100%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/benchmark_adapters.py (59%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/dummy_adapter.py (76%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/hotpot_qa_adapter.py (97%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/musique_adapter.py (97%)
 rename {evals => cognee}/eval_framework/benchmark_adapters/twowikimultihop_adapter.py (91%)
 rename {evals => cognee}/eval_framework/corpus_builder/__init__.py (100%)
 rename {evals => cognee}/eval_framework/corpus_builder/corpus_builder_executor.py (69%)
 rename {evals => cognee}/eval_framework/corpus_builder/run_corpus_builder.py (77%)
 rename {evals => cognee}/eval_framework/corpus_builder/task_getters/TaskGetters.py (88%)
 rename {evals => cognee}/eval_framework/corpus_builder/task_getters/__init__.py (100%)
 create mode 100644 cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py
 rename {evals => cognee}/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py (100%)
 rename {evals => cognee}/eval_framework/eval_config.py (100%)
 rename {evals => cognee}/eval_framework/evaluation/__init__.py (100%)
 rename {evals => cognee}/eval_framework/evaluation/base_eval_adapter.py (100%)
 rename {evals => cognee}/eval_framework/evaluation/deep_eval_adapter.py (88%)
 rename {evals => cognee}/eval_framework/evaluation/direct_llm_eval_adapter.py (93%)
 rename {evals => cognee}/eval_framework/evaluation/evaluation_executor.py (91%)
 rename {evals => cognee}/eval_framework/evaluation/evaluator_adapters.py (71%)
 create mode 100644 cognee/eval_framework/evaluation/metrics/__init__.py
 rename {evals => cognee}/eval_framework/evaluation/metrics/exact_match.py (100%)
 rename {evals => cognee}/eval_framework/evaluation/metrics/f1.py (100%)
 rename {evals => cognee}/eval_framework/evaluation/run_evaluation_module.py (75%)
 create mode 100644 cognee/eval_framework/metrics_dashboard.py
 rename {evals => cognee}/eval_framework/modal_run_eval.py (92%)
 rename {evals => cognee}/eval_framework/run_eval.py (52%)

diff --git a/.github/workflows/test_eval_framework.yml b/.github/workflows/test_eval_framework.yml
index 54aa68dd0..a9eb321b3 100644
--- a/.github/workflows/test_eval_framework.yml
+++ b/.github/workflows/test_eval_framework.yml
@@ -14,7 +14,7 @@ jobs:
   run_eval_framework_test:
     uses: ./.github/workflows/reusable_python_example.yml
     with:
-      example-location: ./evals/eval_framework/run_eval.py
+      example-location: ./cognee/eval_framework/run_eval.py
     secrets:
       LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py
index 50a3e081d..2d5a55ef3 100644
--- a/cognee/api/v1/cognify/cognify_v2.py
+++ b/cognee/api/v1/cognify/cognify_v2.py
@@ -112,8 +112,8 @@ def generate_dataset_name(dataset_name: str) -> str:
     return dataset_name.replace(".", "_").replace(" ", "_")
 
 
-async def get_default_tasks(
-    user: User = None, graph_model: BaseModel = KnowledgeGraph
+async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's comment)
+    user: User = None, graph_model: BaseModel = KnowledgeGraph, chunk_size=1024, chunker=TextChunker
 ) -> list[Task]:
     if user is None:
         user = await get_default_user()
@@ -126,7 +126,8 @@ async def get_default_tasks(
         Task(
             extract_chunks_from_documents,
             max_chunk_tokens=get_max_chunk_tokens(),
-            chunker=TextChunker,
+            chunker=chunker,
+            chunk_size=chunk_size,
         ),  # Extract text chunks based on the document type.
         Task(
             extract_graph_from_data, graph_model=graph_model, task_config={"batch_size": 10}
diff --git a/cognee/api/v1/search/search_v2.py b/cognee/api/v1/search/search_v2.py
index 49faa0dc5..8afd8545c 100644
--- a/cognee/api/v1/search/search_v2.py
+++ b/cognee/api/v1/search/search_v2.py
@@ -12,6 +12,7 @@ async def search(
     query_type: SearchType = SearchType.GRAPH_COMPLETION,
     user: User = None,
     datasets: Union[list[str], str, None] = None,
+    system_prompt_path: str = "answer_simple_question.txt",
 ) -> list:
     # We use lists from now on for datasets
     if isinstance(datasets, str):
@@ -23,6 +24,8 @@ async def search(
     if user is None:
         raise UserNotFoundError
 
-    filtered_search_results = await search_function(query_text, query_type, datasets, user)
+    filtered_search_results = await search_function(
+        query_text, query_type, datasets, user, system_prompt_path=system_prompt_path
+    )
 
     return filtered_search_results
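With the change above, callers can choose the prompt used for completion-style searches. A minimal usage sketch; the prompt file name shown is the parameter's default from this diff, and the query is illustrative:

```python
# Sketch of the extended search API; assumes documents were already added
# and cognified. "answer_simple_question.txt" is the default prompt file.
import asyncio

import cognee
from cognee.api.v1.search import SearchType


async def main():
    results = await cognee.search(
        query_text="Which graph databases does cognee support?",  # illustrative
        query_type=SearchType.GRAPH_COMPLETION,
        system_prompt_path="answer_simple_question.txt",
    )
    print(results)


asyncio.run(main())
```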
diff --git a/evals/eval_framework/__init__.py b/cognee/eval_framework/__init__.py
similarity index 100%
rename from evals/eval_framework/__init__.py
rename to cognee/eval_framework/__init__.py
diff --git a/evals/eval_framework/analysis/__init__.py b/cognee/eval_framework/analysis/__init__.py
similarity index 100%
rename from evals/eval_framework/analysis/__init__.py
rename to cognee/eval_framework/analysis/__init__.py
diff --git a/evals/eval_framework/analysis/dashboard_generator.py b/cognee/eval_framework/analysis/dashboard_generator.py
similarity index 100%
rename from evals/eval_framework/analysis/dashboard_generator.py
rename to cognee/eval_framework/analysis/dashboard_generator.py
diff --git a/evals/eval_framework/analysis/metrics_calculator.py b/cognee/eval_framework/analysis/metrics_calculator.py
similarity index 100%
rename from evals/eval_framework/analysis/metrics_calculator.py
rename to cognee/eval_framework/analysis/metrics_calculator.py
diff --git a/evals/eval_framework/answer_generation/__init__.py b/cognee/eval_framework/answer_generation/__init__.py
similarity index 100%
rename from evals/eval_framework/answer_generation/__init__.py
rename to cognee/eval_framework/answer_generation/__init__.py
diff --git a/evals/eval_framework/answer_generation/answer_generation_executor.py b/cognee/eval_framework/answer_generation/answer_generation_executor.py
similarity index 54%
rename from evals/eval_framework/answer_generation/answer_generation_executor.py
rename to cognee/eval_framework/answer_generation/answer_generation_executor.py
index a5b18c8e7..f4fc5f4a2 100644
--- a/evals/eval_framework/answer_generation/answer_generation_executor.py
+++ b/cognee/eval_framework/answer_generation/answer_generation_executor.py
@@ -3,20 +3,19 @@
 from typing import List, Dict, Callable, Awaitable
 from cognee.api.v1.search import SearchType
 
 question_answering_engine_options: Dict[str, Callable[[str], Awaitable[List[str]]]] = {
-    "cognee_graph_completion": lambda query: cognee.search(
-        query_type=SearchType.GRAPH_COMPLETION, query_text=query
+    "cognee_graph_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.GRAPH_COMPLETION,
+        query_text=query,
+        system_prompt_path=system_prompt_path,
     ),
-    "cognee_completion": lambda query: cognee.search(
-        query_type=SearchType.COMPLETION, query_text=query
+    "cognee_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.COMPLETION, query_text=query, system_prompt_path=system_prompt_path
     ),
-    "cognee_summaries": lambda query: cognee.search(
-        query_type=SearchType.SUMMARIES, query_text=query
+    "graph_summary_completion": lambda query, system_prompt_path: cognee.search(
+        query_type=SearchType.GRAPH_SUMMARY_COMPLETION,
+        query_text=query,
+        system_prompt_path=system_prompt_path,
     ),
-    "cognee_insights": lambda query: cognee.search(
-        query_type=SearchType.INSIGHTS, query_text=query
-    ),
-    "cognee_chunks": lambda query: cognee.search(query_type=SearchType.CHUNKS, query_text=query),
-    "cognee_code": lambda query: cognee.search(query_type=SearchType.CODE, query_text=query),
 }
 
 
@@ -25,13 +24,14 @@ class AnswerGeneratorExecutor:
         self,
         questions: List[Dict[str, str]],
         answer_resolver: Callable[[str], Awaitable[List[str]]],
+        system_prompt: str = "answer_simple_question.txt",
     ) -> List[Dict[str, str]]:
         answers = []
         for instance in questions:
             query_text = instance["question"]
             correct_answer = instance["answer"]
 
-            search_results = await answer_resolver(query_text)
+            search_results = await answer_resolver(query_text, system_prompt)
 
             answers.append(
                 {
diff --git a/evals/eval_framework/answer_generation/run_question_answering_module.py b/cognee/eval_framework/answer_generation/run_question_answering_module.py
similarity index 83%
rename from evals/eval_framework/answer_generation/run_question_answering_module.py
rename to cognee/eval_framework/answer_generation/run_question_answering_module.py
index 3ad5e78ac..42b31d44b 100644
--- a/evals/eval_framework/answer_generation/run_question_answering_module.py
+++ b/cognee/eval_framework/answer_generation/run_question_answering_module.py
@@ -1,6 +1,7 @@
 import logging
 import json
-from evals.eval_framework.answer_generation.answer_generation_executor import (
+from typing import List
+from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
     question_answering_engine_options,
 )
@@ -30,7 +31,9 @@ async def create_and_insert_answers_table(questions_payload):
         await session.commit()
 
 
-async def run_question_answering(params: dict) -> None:
+async def run_question_answering(
+    params: dict, system_prompt="answer_simple_question.txt"
+) -> List[dict]:
     if params.get("answering_questions"):
         logging.info("Question answering started...")
         try:
@@ -46,9 +49,17 @@ async def run_question_answering(params: dict) -> None:
             answers = await answer_generator.question_answering_non_parallel(
                 questions=questions,
                 answer_resolver=question_answering_engine_options[params["qa_engine"]],
+                system_prompt=system_prompt,
             )
             with open(params["answers_path"], "w", encoding="utf-8") as f:
                 json.dump(answers, f, ensure_ascii=False, indent=4)
 
             await create_and_insert_answers_table(answers)
         logging.info("Question answering End...")
+
+        return answers
+    else:
+        logging.info(
+            "The question answering module was not executed as answering_questions is not enabled"
+        )
+        return []
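Answer resolvers are looked up by engine name and now receive the prompt path as a second argument. A short sketch of driving the executor directly; the question/answer pair is fabricated for illustration:

```python
# Sketch: use the "cognee_completion" resolver defined above with a made-up
# QA pair. Call this helper from an async context.
from cognee.eval_framework.answer_generation.answer_generation_executor import (
    AnswerGeneratorExecutor,
    question_answering_engine_options,
)


async def generate():
    questions = [{"question": "What is cognee?", "answer": "An AI memory engine"}]
    return await AnswerGeneratorExecutor().question_answering_non_parallel(
        questions=questions,
        answer_resolver=question_answering_engine_options["cognee_completion"],
        system_prompt="answer_simple_question.txt",
    )
```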
diff --git a/evals/eval_framework/benchmark_adapters/__init__.py b/cognee/eval_framework/benchmark_adapters/__init__.py
similarity index 100%
rename from evals/eval_framework/benchmark_adapters/__init__.py
rename to cognee/eval_framework/benchmark_adapters/__init__.py
diff --git a/evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py b/cognee/eval_framework/benchmark_adapters/base_benchmark_adapter.py
similarity index 100%
rename from evals/eval_framework/benchmark_adapters/base_benchmark_adapter.py
rename to cognee/eval_framework/benchmark_adapters/base_benchmark_adapter.py
diff --git a/evals/eval_framework/benchmark_adapters/benchmark_adapters.py b/cognee/eval_framework/benchmark_adapters/benchmark_adapters.py
similarity index 59%
rename from evals/eval_framework/benchmark_adapters/benchmark_adapters.py
rename to cognee/eval_framework/benchmark_adapters/benchmark_adapters.py
index f040818e0..624261774 100644
--- a/evals/eval_framework/benchmark_adapters/benchmark_adapters.py
+++ b/cognee/eval_framework/benchmark_adapters/benchmark_adapters.py
@@ -1,10 +1,10 @@
 from enum import Enum
 from typing import Type
 
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
 
 
 class BenchmarkAdapter(Enum):
diff --git a/evals/eval_framework/benchmark_adapters/dummy_adapter.py b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py
similarity index 76%
rename from evals/eval_framework/benchmark_adapters/dummy_adapter.py
rename to cognee/eval_framework/benchmark_adapters/dummy_adapter.py
index c67440940..69cc6e518 100644
--- a/evals/eval_framework/benchmark_adapters/dummy_adapter.py
+++ b/cognee/eval_framework/benchmark_adapters/dummy_adapter.py
@@ -1,12 +1,12 @@
-from typing import Optional
+from typing import Optional, Any
 
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
 class DummyAdapter(BaseBenchmarkAdapter):
     def load_corpus(
         self, limit: Optional[int] = None, seed: int = 42
-    ) -> tuple[list[str], list[dict[str, str]]]:
+    ) -> tuple[list[str], list[dict[str, Any]]]:
         corpus_list = [
             "The cognee is an AI memory engine that supports different vector and graph databases",
             "Neo4j is a graph database supported by cognee",
diff --git a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py b/cognee/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
similarity index 97%
rename from evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
rename to cognee/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
index 3020a5bb1..d8e5a03c2 100644
--- a/evals/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
+++ b/cognee/eval_framework/benchmark_adapters/hotpot_qa_adapter.py
@@ -3,7 +3,7 @@ import os
 import json
 import random
 from typing import Optional, Any, List, Tuple
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
 class HotpotQAAdapter(BaseBenchmarkAdapter):
diff --git a/evals/eval_framework/benchmark_adapters/musique_adapter.py b/cognee/eval_framework/benchmark_adapters/musique_adapter.py
similarity index 97%
rename from evals/eval_framework/benchmark_adapters/musique_adapter.py
rename to cognee/eval_framework/benchmark_adapters/musique_adapter.py
index 27cfe554b..3be44edf8 100644
--- a/evals/eval_framework/benchmark_adapters/musique_adapter.py
+++ b/cognee/eval_framework/benchmark_adapters/musique_adapter.py
@@ -6,7 +6,7 @@ import zipfile
 
 import gdown
 
-from evals.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
+from cognee.eval_framework.benchmark_adapters.base_benchmark_adapter import BaseBenchmarkAdapter
 
 
 class MusiqueQAAdapter(BaseBenchmarkAdapter):
diff --git a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py b/cognee/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
similarity index 91%
rename from evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
rename to cognee/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
index a6bb017fb..ce5e3d57b 100644
--- a/evals/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
+++ b/cognee/eval_framework/benchmark_adapters/twowikimultihop_adapter.py
@@ -3,7 +3,7 @@ import os
 import json
 import random
 from typing import Optional, Any, List, Tuple
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
 
 
 class TwoWikiMultihopAdapter(HotpotQAAdapter):
diff --git a/evals/eval_framework/corpus_builder/__init__.py b/cognee/eval_framework/corpus_builder/__init__.py
similarity index 100%
rename from evals/eval_framework/corpus_builder/__init__.py
rename to cognee/eval_framework/corpus_builder/__init__.py
diff --git a/evals/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
similarity index 69%
rename from evals/eval_framework/corpus_builder/corpus_builder_executor.py
rename to cognee/eval_framework/corpus_builder/corpus_builder_executor.py
index 2dbefa80a..2e4a7fd3d 100644
--- a/evals/eval_framework/corpus_builder/corpus_builder_executor.py
+++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
@@ -2,8 +2,9 @@ import cognee
 import logging
 from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable
 
-from evals.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
-from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
+from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.pipelines.tasks.Task import Task
 from cognee.shared.utils import setup_logging
 
@@ -31,12 +32,14 @@ class CorpusBuilderExecutor:
         self.raw_corpus, self.questions = self.adapter.load_corpus(limit=limit)
         return self.raw_corpus, self.questions
 
-    async def build_corpus(self, limit: Optional[int] = None) -> List[str]:
+    async def build_corpus(
+        self, limit: Optional[int] = None, chunk_size=1024, chunker=TextChunker
+    ) -> List[str]:
         self.load_corpus(limit=limit)
-        await self.run_cognee()
+        await self.run_cognee(chunk_size=chunk_size, chunker=chunker)
         return self.questions
 
-    async def run_cognee(self) -> None:
+    async def run_cognee(self, chunk_size=1024, chunker=TextChunker) -> None:
         setup_logging(logging.ERROR)
 
         await cognee.prune.prune_data()
@@ -44,5 +47,5 @@ class CorpusBuilderExecutor:
 
         await cognee.add(self.raw_corpus)
 
-        tasks = await self.task_getter()
+        tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker)
         await cognee.cognify(tasks=tasks)
diff --git a/evals/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
similarity index 77%
rename from evals/eval_framework/corpus_builder/run_corpus_builder.py
rename to cognee/eval_framework/corpus_builder/run_corpus_builder.py
index b2a4366f6..2aff21249 100644
--- a/evals/eval_framework/corpus_builder/run_corpus_builder.py
+++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
@@ -1,14 +1,17 @@
 import logging
 import json
+from typing import List
+
 from cognee.infrastructure.files.storage import LocalStorage
-from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
+from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.modules.data.models.questions_base import QuestionsBase
 from cognee.modules.data.models.questions_data import Questions
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
     get_relational_config,
 )
-from evals.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
+from cognee.modules.chunking.TextChunker import TextChunker
+from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
 
 
 async def create_and_insert_questions_table(questions_payload):
@@ -28,7 +31,7 @@ async def create_and_insert_questions_table(questions_payload):
         await session.commit()
 
 
-async def run_corpus_builder(params: dict) -> None:
+async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker) -> List[dict]:
     if params.get("building_corpus_from_scratch"):
         logging.info("Corpus Builder started...")
 
@@ -42,7 +45,7 @@
             task_getter=task_getter,
         )
         questions = await corpus_builder.build_corpus(
-            limit=params.get("number_of_samples_in_corpus")
+            limit=params.get("number_of_samples_in_corpus"), chunk_size=chunk_size, chunker=chunker
        )
         with open(params["questions_path"], "w", encoding="utf-8") as f:
             json.dump(questions, f, ensure_ascii=False, indent=4)
@@ -50,3 +53,5 @@
 
         await create_and_insert_questions_table(questions_payload=questions)
     logging.info("Corpus Builder End...")
+
+    return questions
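A sketch of driving the corpus builder directly with the new chunking knobs. The constructor kwargs are assumptions inferred from the `task_getter=task_getter` call above (only `task_getter=` is visible in the hunk), "Dummy" is presumed to resolve through the `BenchmarkAdapter` enum, and `chunk_size=512` is an arbitrary illustration:

```python
# Hypothetical direct use of CorpusBuilderExecutor with the new parameters.
from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
from cognee.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter
from cognee.modules.chunking.TextChunker import TextChunker


async def build():
    builder = CorpusBuilderExecutor(
        benchmark="Dummy",                            # assumed kwarg
        task_getter=DefaultTaskGetter().get_tasks,    # getter shown later in this PR
    )
    return await builder.build_corpus(limit=5, chunk_size=512, chunker=TextChunker)
```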
diff --git a/evals/eval_framework/corpus_builder/task_getters/TaskGetters.py b/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py
similarity index 88%
rename from evals/eval_framework/corpus_builder/task_getters/TaskGetters.py
rename to cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py
index 39a1a0a23..d58115965 100644
--- a/evals/eval_framework/corpus_builder/task_getters/TaskGetters.py
+++ b/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py
@@ -2,7 +2,7 @@ from enum import Enum
 from typing import Callable, Awaitable, List
 from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
 from cognee.modules.pipelines.tasks.Task import Task
-from evals.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
+from cognee.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
     get_cascade_graph_tasks,
 )
diff --git a/evals/eval_framework/corpus_builder/task_getters/__init__.py b/cognee/eval_framework/corpus_builder/task_getters/__init__.py
similarity index 100%
rename from evals/eval_framework/corpus_builder/task_getters/__init__.py
rename to cognee/eval_framework/corpus_builder/task_getters/__init__.py
diff --git a/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py b/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py
new file mode 100644
index 000000000..69f8eca2c
--- /dev/null
+++ b/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py
@@ -0,0 +1,14 @@
+from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
+from typing import List
+from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
+from cognee.modules.pipelines.tasks.Task import Task
+from cognee.infrastructure.llm import get_max_chunk_tokens
+from cognee.modules.chunking.TextChunker import TextChunker
+
+
+class DefaultTaskGetter(BaseTaskGetter):
+    """Default task getter that retrieves tasks using the standard get_default_tasks function."""
+
+    async def get_tasks(self, chunk_size=1024, chunker=TextChunker) -> List[Task]:
+        """Retrieve default tasks asynchronously."""
+        return await get_default_tasks(chunk_size=chunk_size, chunker=chunker)
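The new getter simply forwards the chunking parameters to `get_default_tasks`, so a custom chunk size threads all the way into the cognify pipeline. A minimal sketch; `chunk_size=512` is illustrative:

```python
# Minimal sketch: fetch the default cognify task list with a smaller chunk
# size, then hand it straight to cognee.cognify. Run from an async context.
import cognee
from cognee.eval_framework.corpus_builder.task_getters.default_task_getter import DefaultTaskGetter


async def cognify_with_small_chunks():
    tasks = await DefaultTaskGetter().get_tasks(chunk_size=512)
    await cognee.cognify(tasks=tasks)
```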
diff --git a/evals/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py b/cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py
similarity index 100%
rename from evals/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py
rename to cognee/eval_framework/corpus_builder/task_getters/get_cascade_graph_tasks.py
diff --git a/evals/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py
similarity index 100%
rename from evals/eval_framework/eval_config.py
rename to cognee/eval_framework/eval_config.py
diff --git a/evals/eval_framework/evaluation/__init__.py b/cognee/eval_framework/evaluation/__init__.py
similarity index 100%
rename from evals/eval_framework/evaluation/__init__.py
rename to cognee/eval_framework/evaluation/__init__.py
diff --git a/evals/eval_framework/evaluation/base_eval_adapter.py b/cognee/eval_framework/evaluation/base_eval_adapter.py
similarity index 100%
rename from evals/eval_framework/evaluation/base_eval_adapter.py
rename to cognee/eval_framework/evaluation/base_eval_adapter.py
diff --git a/evals/eval_framework/evaluation/deep_eval_adapter.py b/cognee/eval_framework/evaluation/deep_eval_adapter.py
similarity index 88%
rename from evals/eval_framework/evaluation/deep_eval_adapter.py
rename to cognee/eval_framework/evaluation/deep_eval_adapter.py
index ec0cc41f3..84ae79f70 100644
--- a/evals/eval_framework/evaluation/deep_eval_adapter.py
+++ b/cognee/eval_framework/evaluation/deep_eval_adapter.py
@@ -1,9 +1,9 @@
 from deepeval.metrics import GEval
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
-from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
-from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 from typing import Any, Dict, List
diff --git a/evals/eval_framework/evaluation/direct_llm_eval_adapter.py b/cognee/eval_framework/evaluation/direct_llm_eval_adapter.py
similarity index 93%
rename from evals/eval_framework/evaluation/direct_llm_eval_adapter.py
rename to cognee/eval_framework/evaluation/direct_llm_eval_adapter.py
index b911f88b0..00ee7e101 100644
--- a/evals/eval_framework/evaluation/direct_llm_eval_adapter.py
+++ b/cognee/eval_framework/evaluation/direct_llm_eval_adapter.py
@@ -1,9 +1,9 @@
 from typing import Any, Dict, List
 from pydantic import BaseModel
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from evals.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
+from cognee.eval_framework.evaluation.base_eval_adapter import BaseEvalAdapter
 from cognee.infrastructure.llm.prompts import read_query_prompt, render_prompt
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig
 
 
 class CorrectnessEvaluation(BaseModel):
diff --git a/evals/eval_framework/evaluation/evaluation_executor.py b/cognee/eval_framework/evaluation/evaluation_executor.py
similarity index 91%
rename from evals/eval_framework/evaluation/evaluation_executor.py
rename to cognee/eval_framework/evaluation/evaluation_executor.py
index becee8f4e..dcee2281e 100644
--- a/evals/eval_framework/evaluation/evaluation_executor.py
+++ b/cognee/eval_framework/evaluation/evaluation_executor.py
@@ -1,5 +1,5 @@
 from typing import List, Dict, Any, Union
-from evals.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter
+from cognee.eval_framework.evaluation.evaluator_adapters import EvaluatorAdapter
 
 
 class EvaluationExecutor:
diff --git a/evals/eval_framework/evaluation/evaluator_adapters.py b/cognee/eval_framework/evaluation/evaluator_adapters.py
similarity index 71%
rename from evals/eval_framework/evaluation/evaluator_adapters.py
rename to cognee/eval_framework/evaluation/evaluator_adapters.py
index 28b5462aa..ec0032f69 100644
--- a/evals/eval_framework/evaluation/evaluator_adapters.py
+++ b/cognee/eval_framework/evaluation/evaluator_adapters.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from typing import Type
-from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
-from evals.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
+from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+from cognee.eval_framework.evaluation.direct_llm_eval_adapter import DirectLLMEvalAdapter
 
 
 class EvaluatorAdapter(Enum):
diff --git a/cognee/eval_framework/evaluation/metrics/__init__.py b/cognee/eval_framework/evaluation/metrics/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/evals/eval_framework/evaluation/metrics/exact_match.py b/cognee/eval_framework/evaluation/metrics/exact_match.py
similarity index 100%
rename from evals/eval_framework/evaluation/metrics/exact_match.py
rename to cognee/eval_framework/evaluation/metrics/exact_match.py
diff --git a/evals/eval_framework/evaluation/metrics/f1.py b/cognee/eval_framework/evaluation/metrics/f1.py
similarity index 100%
rename from evals/eval_framework/evaluation/metrics/f1.py
rename to cognee/eval_framework/evaluation/metrics/f1.py
diff --git a/evals/eval_framework/evaluation/run_evaluation_module.py b/cognee/eval_framework/evaluation/run_evaluation_module.py
similarity index 75%
rename from evals/eval_framework/evaluation/run_evaluation_module.py
rename to cognee/eval_framework/evaluation/run_evaluation_module.py
index 76a7c5c56..14230f224 100644
--- a/evals/eval_framework/evaluation/run_evaluation_module.py
+++ b/cognee/eval_framework/evaluation/run_evaluation_module.py
@@ -1,8 +1,9 @@
 import logging
 import json
-from evals.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
-from evals.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
-from evals.eval_framework.analysis.dashboard_generator import create_dashboard
+from typing import List
+from cognee.eval_framework.evaluation.evaluation_executor import EvaluationExecutor
+from cognee.eval_framework.analysis.metrics_calculator import calculate_metrics_statistics
+from cognee.eval_framework.analysis.dashboard_generator import create_dashboard
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.infrastructure.databases.relational.get_relational_engine import (
     get_relational_engine,
@@ -50,13 +51,15 @@ async def execute_evaluation(params: dict) -> None:
     await create_and_insert_metrics_table(metrics)
     logging.info("Evaluation completed")
+    return metrics
 
 
-async def run_evaluation(params: dict) -> None:
+async def run_evaluation(params: dict) -> List[dict]:
     """Run each step of the evaluation pipeline based on configuration flags."""
     # Step 1: Evaluate answers if requested
     if params.get("evaluating_answers"):
-        await execute_evaluation(params)
+        metrics = await execute_evaluation(params)
     else:
         logging.info("Skipping evaluation as evaluating_answers is False")
+        metrics = []
 
@@ -67,18 +70,7 @@
             json_data=params["metrics_path"], aggregate_output_path=params["aggregate_metrics_path"]
         )
         logging.info("Metrics calculation completed")
+        return metrics
     else:
         logging.info("Skipping metrics calculation as calculate_metrics is False")
-
-    # Step 3: Generate dashboard if requested
-    if params.get("dashboard"):
-        logging.info("Generating dashboard...")
-        create_dashboard(
-            metrics_path=params["metrics_path"],
-            aggregate_metrics_path=params["aggregate_metrics_path"],
-            output_file=params["dashboard_path"],
-            benchmark=params["benchmark"],
-        )
-        logging.info(f"Dashboard generated at {params['dashboard_path']}")
-    else:
-        logging.info("Skipping dashboard generation as dashboard is False")
+    return []
diff --git a/cognee/eval_framework/metrics_dashboard.py b/cognee/eval_framework/metrics_dashboard.py
new file mode 100644
index 000000000..2c917740a
--- /dev/null
+++ b/cognee/eval_framework/metrics_dashboard.py
@@ -0,0 +1,172 @@
+import json
+import plotly.graph_objects as go
+from typing import Dict, List, Tuple
+from collections import defaultdict
+
+
+def create_distribution_plots(metrics_data: Dict[str, List[float]]) -> List[str]:
+    """Create distribution histogram plots for each metric."""
+    figures = []
+    for metric, scores in metrics_data.items():
+        fig = go.Figure()
+        fig.add_trace(go.Histogram(x=scores, name=metric, nbinsx=10, marker_color="#1f77b4"))
+
+        fig.update_layout(
+            title=f"{metric} Score Distribution",
+            xaxis_title="Score",
+            yaxis_title="Count",
+            bargap=0.1,
+            template="seaborn",
+        )
+        figures.append(fig.to_html(full_html=False))
+    return figures
+
+
+def create_ci_plot(ci_results: Dict[str, Tuple[float, float, float]]) -> str:
+    """Create confidence interval bar plot."""
+    fig = go.Figure()
+    for metric, (mean_score, lower, upper) in ci_results.items():
+        fig.add_trace(
+            go.Bar(
+                x=[metric],
+                y=[mean_score],
+                error_y=dict(
+                    type="data",
+                    array=[upper - mean_score],
+                    arrayminus=[mean_score - lower],
+                    visible=True,
+                ),
+                name=metric,
+            )
+        )
+
+    fig.update_layout(
+        title="95% confidence interval for all the metrics",
+        xaxis_title="Metric",
+        yaxis_title="Score",
+        template="seaborn",
+    )
+    return fig.to_html(full_html=False)
+
+
+def generate_details_html(metrics_data: List[Dict]) -> List[str]:
+    """Generate HTML for detailed metric information."""
+    details_html = []
+    metric_details = {}
+
+    # Organize metrics by type
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            if metric not in metric_details:
+                metric_details[metric] = []
+            metric_details[metric].append(
+                {
+                    "question": entry["question"],
+                    "answer": entry["answer"],
+                    "golden_answer": entry["golden_answer"],
+                    "reason": values.get("reason", ""),
+                    "score": values["score"],
+                }
+            )
+
+    for metric, details in metric_details.items():
+        details_html.append(f"<details><summary>{metric} Details</summary>")
+        details_html.append("""
+            <table>
+                <tr>
+                    <th>Question</th>
+                    <th>Answer</th>
+                    <th>Golden Answer</th>
+                    <th>Reason</th>
+                    <th>Score</th>
+                </tr>
+        """)
+        for item in details:
+            details_html.append(
+                f"<tr>"
+                f"<td>{item['question']}</td>"
+                f"<td>{item['answer']}</td>"
+                f"<td>{item['golden_answer']}</td>"
+                f"<td>{item['reason']}</td>"
+                f"<td>{item['score']}</td>"
+                f"</tr>"
+            )
+        details_html.append("</table></details>")
+    return details_html
+
+
+def get_dashboard_html_template(
+    figures: List[str], details_html: List[str], benchmark: str = ""
+) -> str:
+    """Generate the complete HTML dashboard template."""
+    return f"""
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>LLM Evaluation Dashboard {benchmark}</title>
+    </head>
+    <body>
+        <h1>LLM Evaluation Metrics Dashboard {benchmark}</h1>
+
+        <h2>Metrics Distribution</h2>
+        {"".join([f'<div class="chart">{fig}</div>' for fig in figures[:-1]])}
+
+        <h2>95% confidence interval for all the metrics</h2>
+        <div class="chart">{figures[-1]}</div>
+
+        <h2>Detailed Explanations</h2>
+        {"".join(details_html)}
+    </body>
+    </html>
+    """
+
+
+def create_dashboard(
+    metrics_path: str,
+    aggregate_metrics_path: str,
+    output_file: str = "dashboard_with_ci.html",
+    benchmark: str = "",
+) -> str:
+    """Create and save the dashboard with all visualizations."""
+    # Read metrics files
+    with open(metrics_path, "r") as f:
+        metrics_data = json.load(f)
+    with open(aggregate_metrics_path, "r") as f:
+        aggregate_data = json.load(f)
+
+    # Extract data for visualizations
+    metrics_by_type = defaultdict(list)
+    for entry in metrics_data:
+        for metric, values in entry["metrics"].items():
+            metrics_by_type[metric].append(values["score"])
+
+    # Generate visualizations
+    distribution_figures = create_distribution_plots(metrics_by_type)
+    ci_plot = create_ci_plot(
+        {
+            metric: (data["mean"], data["ci_lower"], data["ci_upper"])
+            for metric, data in aggregate_data.items()
+        }
+    )
+
+    # Combine all figures
+    figures = distribution_figures + [ci_plot]
+
+    # Generate HTML components
+    details_html = generate_details_html(metrics_data)
+    dashboard_html = get_dashboard_html_template(figures, details_html, benchmark)
+
+    # Write to file
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(dashboard_html)
+
+    return output_file
diff --git a/evals/eval_framework/modal_run_eval.py b/cognee/eval_framework/modal_run_eval.py
similarity index 92%
rename from evals/eval_framework/modal_run_eval.py
rename to cognee/eval_framework/modal_run_eval.py
index f04c42954..77345fe42 100644
--- a/evals/eval_framework/modal_run_eval.py
+++ b/cognee/eval_framework/modal_run_eval.py
@@ -4,12 +4,12 @@ import json
 import asyncio
 import datetime
 import logging
-from evals.eval_framework.eval_config import EvalConfig
-from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
-from evals.eval_framework.answer_generation.run_question_answering_module import (
+from cognee.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from cognee.eval_framework.answer_generation.run_question_answering_module import (
     run_question_answering,
 )
-from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
 
 logger = logging.getLogger(__name__)
diff --git a/evals/eval_framework/run_eval.py b/cognee/eval_framework/run_eval.py
similarity index 52%
rename from evals/eval_framework/run_eval.py
rename to cognee/eval_framework/run_eval.py
index 7a8653b58..8908e9997 100644
--- a/evals/eval_framework/run_eval.py
+++ b/cognee/eval_framework/run_eval.py
@@ -1,13 +1,14 @@
 import logging
 import asyncio
 from cognee.shared.utils import setup_logging
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig
 
-from evals.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
-from evals.eval_framework.answer_generation.run_question_answering_module import (
+from cognee.eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder
+from cognee.eval_framework.answer_generation.run_question_answering_module import (
     run_question_answering,
 )
-from evals.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.evaluation.run_evaluation_module import run_evaluation
+from cognee.eval_framework.metrics_dashboard import create_dashboard
 
 # Configure logging
 setup_logging(logging.INFO)
@@ -31,6 +32,15 @@ async def main():
     # Metrics calculation + dashboard
     await run_evaluation(eval_params)
 
+    if eval_params.get("dashboard"):
+        logging.info("Generating dashboard...")
+        create_dashboard(
+            metrics_path=eval_params["metrics_path"],
+            aggregate_metrics_path=eval_params["aggregate_metrics_path"],
+            output_file=eval_params["dashboard_path"],
+            benchmark=eval_params["benchmark"],
+        )
+
 
 if __name__ == "__main__":
     loop = asyncio.new_event_loop()
diff --git a/cognee/modules/retrieval/base_retriever.py b/cognee/modules/retrieval/base_retriever.py
index 5fa39c53f..2df1c5f63 100644
--- a/cognee/modules/retrieval/base_retriever.py
+++ b/cognee/modules/retrieval/base_retriever.py
@@ -14,8 +14,3 @@ class BaseRetriever(ABC):
     async def get_completion(self, query: str, context: Optional[Any] = None) -> Any:
         """Generates a response using the query and optional context."""
         pass
-
-    @classmethod
-    def as_search(cls) -> Callable:
-        """Creates a search function from the retriever class."""
-        return lambda query: cls().get_completion(query)
diff --git a/cognee/modules/search/methods/search.py b/cognee/modules/search/methods/search.py
index a88bd815a..c0056b048 100644
--- a/cognee/modules/search/methods/search.py
+++ b/cognee/modules/search/methods/search.py
@@ -25,11 +25,14 @@ async def search(
     query_type: SearchType,
     datasets: list[str],
     user: User,
+    system_prompt_path="answer_simple_question.txt",
 ):
     query = await log_query(query_text, query_type.value, user.id)
 
     own_document_ids = await get_document_ids_for_user(user.id, datasets)
-    search_results = await specific_search(query_type, query_text, user)
+    search_results = await specific_search(
+        query_type, query_text, user, system_prompt_path=system_prompt_path
+    )
 
     filtered_search_results = []
 
@@ -45,15 +48,23 @@ async def search(
     return filtered_search_results
 
 
-async def specific_search(query_type: SearchType, query: str, user: User) -> list:
+async def specific_search(
+    query_type: SearchType, query: str, user: User, system_prompt_path="answer_simple_question.txt"
+) -> list:
     search_tasks: dict[SearchType, Callable] = {
-        SearchType.SUMMARIES: SummariesRetriever.as_search(),
-        SearchType.INSIGHTS: InsightsRetriever.as_search(),
-        SearchType.CHUNKS: ChunksRetriever.as_search(),
-        SearchType.COMPLETION: CompletionRetriever.as_search(),
-        SearchType.GRAPH_COMPLETION: GraphCompletionRetriever.as_search(),
-        SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever.as_search(),
-        SearchType.CODE: CodeRetriever.as_search(),
+        SearchType.SUMMARIES: SummariesRetriever().get_completion,
+        SearchType.INSIGHTS: InsightsRetriever().get_completion,
+        SearchType.CHUNKS: ChunksRetriever().get_completion,
+        SearchType.COMPLETION: CompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.GRAPH_COMPLETION: GraphCompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.GRAPH_SUMMARY_COMPLETION: GraphSummaryCompletionRetriever(
+            system_prompt_path=system_prompt_path
+        ).get_completion,
+        SearchType.CODE: CodeRetriever().get_completion,
     }
 
     search_task = search_tasks.get(query_type)
diff --git a/cognee/tests/unit/eval_framework/answer_generation_test.py b/cognee/tests/unit/eval_framework/answer_generation_test.py
index 3c77faebe..5e6ae3a02 100644
--- a/cognee/tests/unit/eval_framework/answer_generation_test.py
+++ b/cognee/tests/unit/eval_framework/answer_generation_test.py
@@ -1,8 +1,8 @@
 import pytest
-from evals.eval_framework.answer_generation.answer_generation_executor import (
+from cognee.eval_framework.answer_generation.answer_generation_executor import (
     AnswerGeneratorExecutor,
 )
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
 from unittest.mock import AsyncMock
 
 
@@ -12,16 +12,13 @@ async def test_answer_generation():
     corpus_list, qa_pairs = DummyAdapter().load_corpus(limit=limit)
 
     mock_answer_resolver = AsyncMock()
-    mock_answer_resolver.side_effect = lambda query: ["mock_answer"]
+    mock_answer_resolver.side_effect = lambda query, system_prompt: ["mock_answer"]
 
     answer_generator = AnswerGeneratorExecutor()
     answers = await answer_generator.question_answering_non_parallel(
-        questions=qa_pairs,
-        answer_resolver=mock_answer_resolver,
+        questions=qa_pairs, answer_resolver=mock_answer_resolver, system_prompt="test.txt"
     )
 
-    mock_answer_resolver.assert_called_once_with(qa_pairs[0]["question"])
-
     assert len(answers) == len(qa_pairs)
     assert answers[0]["question"] == qa_pairs[0]["question"], (
         "AnswerGeneratorExecutor is passing the question incorrectly"
diff --git a/cognee/tests/unit/eval_framework/benchmark_adapters_test.py b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py
index 6a99b8101..70ec43cf8 100644
--- a/cognee/tests/unit/eval_framework/benchmark_adapters_test.py
+++ b/cognee/tests/unit/eval_framework/benchmark_adapters_test.py
@@ -1,8 +1,8 @@
 import pytest
-from evals.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
-from evals.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
-from evals.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
-from evals.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
+from cognee.eval_framework.benchmark_adapters.hotpot_qa_adapter import HotpotQAAdapter
+from cognee.eval_framework.benchmark_adapters.musique_adapter import MusiqueQAAdapter
+from cognee.eval_framework.benchmark_adapters.dummy_adapter import DummyAdapter
+from cognee.eval_framework.benchmark_adapters.twowikimultihop_adapter import TwoWikiMultihopAdapter
 from unittest.mock import patch, mock_open
diff --git a/cognee/tests/unit/eval_framework/corpus_builder_test.py b/cognee/tests/unit/eval_framework/corpus_builder_test.py
index cf328a30b..14136bea5 100644
--- a/cognee/tests/unit/eval_framework/corpus_builder_test.py
+++ b/cognee/tests/unit/eval_framework/corpus_builder_test.py
@@ -1,5 +1,5 @@
 import pytest
-from evals.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
+from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.infrastructure.databases.graph import get_graph_engine
 from unittest.mock import AsyncMock, patch
diff --git a/cognee/tests/unit/eval_framework/dashboard_test.py b/cognee/tests/unit/eval_framework/dashboard_test.py
index fe4424b9e..e5f24d258 100644
--- a/cognee/tests/unit/eval_framework/dashboard_test.py
+++ b/cognee/tests/unit/eval_framework/dashboard_test.py
@@ -3,7 +3,7 @@ import json
 import os
 
-from evals.eval_framework.analysis.dashboard_generator import (
+from cognee.eval_framework.analysis.dashboard_generator import (
     create_distribution_plots,
     create_ci_plot,
     generate_details_html,
diff --git a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
index 3b0a0a19d..eda9f0b66 100644
--- a/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
+++ b/cognee/tests/unit/eval_framework/deepeval_adapter_test.py
@@ -1,13 +1,13 @@
 import pytest
 from unittest.mock import patch, MagicMock
-from evals.eval_framework.eval_config import EvalConfig
+from cognee.eval_framework.eval_config import EvalConfig
 import sys
 
 with patch.dict(
     sys.modules,
     {"deepeval": MagicMock(), "deepeval.metrics": MagicMock(), "deepeval.test_case": MagicMock()},
 ):
-    from evals.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
+    from cognee.eval_framework.evaluation.deep_eval_adapter import DeepEvalAdapter
 
 
 @pytest.fixture
diff --git a/cognee/tests/unit/eval_framework/metrics_test.py b/cognee/tests/unit/eval_framework/metrics_test.py
index 719995229..4dd3c1e21 100644
--- a/cognee/tests/unit/eval_framework/metrics_test.py
+++ b/cognee/tests/unit/eval_framework/metrics_test.py
@@ -4,15 +4,15 @@ import sys
 from unittest.mock import patch, MagicMock
 import unittest
 import numpy as np
-from evals.eval_framework.analysis.metrics_calculator import bootstrap_ci
+from cognee.eval_framework.analysis.metrics_calculator import bootstrap_ci
 
 with patch.dict(
     sys.modules,
     {"deepeval": MagicMock(), "deepeval.test_case": MagicMock()},
 ):
-    from evals.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
-    from evals.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
+    from cognee.eval_framework.evaluation.metrics.exact_match import ExactMatchMetric
+    from cognee.eval_framework.evaluation.metrics.f1 import F1ScoreMetric
 
 
 class MockTestCase: