From cad9e0ce443b836bbb081fd30bd59a84222be23f Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Fri, 14 Mar 2025 14:20:39 +0100 Subject: [PATCH] Feat: cog 1491 pipeline steps in eval (#641) ## Description - Created get_default_tasks_by_indices to filter default tasks by specific indices - Added get_no_summary_tasks function to skip summarization tasks - Added get_just_chunks_tasks function for chunk extraction and data points only - Added NO_SUMMARIES and JUST_CHUNKS to the TaskGetters enum ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin ## Summary by CodeRabbit - **New Features** - The evaluation configuration now includes expanded task retrieval options. Users can choose customized modes that bypass summarization or focus solely on extracting data chunks, offering a more tailored evaluation experience. - Enhanced asynchronous task processing brings increased flexibility and smoother performance during task selection. --- .../corpus_builder/corpus_builder_executor.py | 1 - .../corpus_builder/run_corpus_builder.py | 2 -- .../task_getters/TaskGetters.py | 6 ++++ .../task_getters/default_task_getter.py | 14 -------- .../get_default_tasks_by_indices.py | 32 +++++++++++++++++++ cognee/eval_framework/eval_config.py | 4 ++- 6 files changed, 41 insertions(+), 18 deletions(-) delete mode 100644 cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py create mode 100644 cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py diff --git a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py index 1f3ecc2cb..c9e348b11 100644 --- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py +++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py @@ -2,7 +2,6 @@ import cognee import logging from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable -from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter from cognee.modules.chunking.TextChunker import TextChunker from cognee.modules.pipelines.tasks.Task import Task diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py index c1af75981..3e696617d 100644 --- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py +++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py @@ -2,8 +2,6 @@ import logging import json from typing import List -from unstructured.chunking.dispatch import chunk - from cognee.infrastructure.files.storage import LocalStorage from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor from cognee.modules.data.models.questions_base import QuestionsBase diff --git a/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py b/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py index d58115965..2cdef453e 100644 --- a/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py +++ b/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py @@ -5,6 +5,10 @@ from cognee.modules.pipelines.tasks.Task import Task from cognee.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import ( get_cascade_graph_tasks, ) +from cognee.eval_framework.corpus_builder.task_getters.get_default_tasks_by_indices import ( + get_no_summary_tasks, + get_just_chunks_tasks, +) class TaskGetters(Enum): @@ -12,6 +16,8 @@ class TaskGetters(Enum): DEFAULT = ("Default", get_default_tasks) CASCADE_GRAPH = ("CascadeGraph", get_cascade_graph_tasks) + NO_SUMMARIES = ("NoSummaries", get_no_summary_tasks) + JUST_CHUNKS = ("JustChunks", get_just_chunks_tasks) def __new__(cls, getter_name: str, getter_func: Callable[..., Awaitable[List[Task]]]): obj = object.__new__(cls) diff --git a/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py b/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py deleted file mode 100644 index 69f8eca2c..000000000 --- a/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py +++ /dev/null @@ -1,14 +0,0 @@ -from cognee.api.v1.cognify.cognify_v2 import get_default_tasks -from typing import List -from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter -from cognee.modules.pipelines.tasks.Task import Task -from cognee.infrastructure.llm import get_max_chunk_tokens -from cognee.modules.chunking.TextChunker import TextChunker - - -class DefaultTaskGetter(BaseTaskGetter): - """Default task getter that retrieves tasks using the standard get_default_tasks function.""" - - async def get_tasks(self, chunk_size=1024, chunker=TextChunker) -> List[Task]: - """Retrieve default tasks asynchronously.""" - return await get_default_tasks(chunk_size=chunk_size, chunker=chunker) diff --git a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py new file mode 100644 index 000000000..ba0ef8ace --- /dev/null +++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py @@ -0,0 +1,32 @@ +from typing import List, Awaitable, Optional +from cognee.api.v1.cognify.cognify_v2 import get_default_tasks +from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.chunking.TextChunker import TextChunker + + +async def get_default_tasks_by_indices( + indices: List[int], chunk_size: int = None, chunker=TextChunker +) -> List[Task]: + """Returns default tasks filtered by the provided indices.""" + all_tasks = await get_default_tasks(chunker=chunker, chunk_size=chunk_size) + + if any(i < 0 or i >= len(all_tasks) for i in indices): + raise IndexError( + f"Task indices {indices} out of range. Valid range: 0-{len(all_tasks) - 1}" + ) + + return [all_tasks[i] for i in indices] + + +async def get_no_summary_tasks(chunk_size: int = None, chunker=TextChunker) -> List[Task]: + """Returns default tasks without summarization tasks.""" + # Default tasks indices: 0=classify, 1=check_permissions, 2=extract_chunks, 3=extract_graph, 4=summarize, 5=add_data_points + return await get_default_tasks_by_indices( + [0, 1, 2, 3, 5], chunk_size=chunk_size, chunker=chunker + ) + + +async def get_just_chunks_tasks(chunk_size: int = None, chunker=TextChunker) -> List[Task]: + """Returns default tasks with only chunk extraction and data points addition.""" + # Default tasks indices: 0=classify, 1=check_permissions, 2=extract_chunks, 3=extract_graph, 4=summarize, 5=add_data_points + return await get_default_tasks_by_indices([0, 1, 2, 5], chunk_size=chunk_size, chunker=chunker) diff --git a/cognee/eval_framework/eval_config.py b/cognee/eval_framework/eval_config.py index 1ac399ffe..dbc32150f 100644 --- a/cognee/eval_framework/eval_config.py +++ b/cognee/eval_framework/eval_config.py @@ -8,7 +8,9 @@ class EvalConfig(BaseSettings): building_corpus_from_scratch: bool = True number_of_samples_in_corpus: int = 1 benchmark: str = "Dummy" # Options: 'HotPotQA', 'Dummy', 'TwoWikiMultiHop' - task_getter_type: str = "Default" # Options: 'Default', 'CascadeGraph' + task_getter_type: str = ( + "Default" # Options: 'Default', 'CascadeGraph', 'NoSummaries', 'JustChunks' + ) # Question answering params answering_questions: bool = True