Feat: cog 1491 pipeline steps in eval (#641)

## Description

- Created `get_default_tasks_by_indices` to filter the default tasks by specific indices
- Added the `get_no_summary_tasks` function to skip summarization tasks
- Added the `get_just_chunks_tasks` function for chunk extraction and data points only
- Added `NO_SUMMARIES` and `JUST_CHUNKS` to the `TaskGetters` enum

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

## Summary by CodeRabbit

- **New Features**
  - The evaluation configuration now includes expanded task retrieval options. Users can choose customized modes that bypass summarization or focus solely on extracting data chunks, offering a more tailored evaluation experience.
  - Enhanced asynchronous task processing brings increased flexibility and smoother performance during task selection.
2025-03-14 14:20:39 +01:00 · 2025-03-14 14:20:39 +01:00 · cad9e0ce44
commit cad9e0ce44
parent f206edb83c
6 changed files with 41 additions and 18 deletions
--- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
+++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
@ -2,7 +2,6 @@ import cognee
 import logging
 from typing import Optional, Tuple, List, Dict, Union, Any, Callable, Awaitable

-from cognee.eval_framework.corpus_builder.task_getters.TaskGetters import TaskGetters
 from cognee.eval_framework.benchmark_adapters.benchmark_adapters import BenchmarkAdapter
 from cognee.modules.chunking.TextChunker import TextChunker
 from cognee.modules.pipelines.tasks.Task import Task
--- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py
+++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
@ -2,8 +2,6 @@ import logging
 import json
 from typing import List

-from unstructured.chunking.dispatch import chunk
-
 from cognee.infrastructure.files.storage import LocalStorage
 from cognee.eval_framework.corpus_builder.corpus_builder_executor import CorpusBuilderExecutor
 from cognee.modules.data.models.questions_base import QuestionsBase
--- a/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py
+++ b/cognee/eval_framework/corpus_builder/task_getters/TaskGetters.py
@ -5,6 +5,10 @@ from cognee.modules.pipelines.tasks.Task import Task
 from cognee.eval_framework.corpus_builder.task_getters.get_cascade_graph_tasks import (
    get_cascade_graph_tasks,
 )
+from cognee.eval_framework.corpus_builder.task_getters.get_default_tasks_by_indices import (
+    get_no_summary_tasks,
+    get_just_chunks_tasks,
+)


 class TaskGetters(Enum):
@ -12,6 +16,8 @@ class TaskGetters(Enum):

    DEFAULT = ("Default", get_default_tasks)
    CASCADE_GRAPH = ("CascadeGraph", get_cascade_graph_tasks)
+    NO_SUMMARIES = ("NoSummaries", get_no_summary_tasks)
+    JUST_CHUNKS = ("JustChunks", get_just_chunks_tasks)

    def __new__(cls, getter_name: str, getter_func: Callable[..., Awaitable[List[Task]]]):
        obj = object.__new__(cls)
--- a/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py
+++ b/cognee/eval_framework/corpus_builder/task_getters/default_task_getter.py
@ -1,14 +0,0 @@
-from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
-from typing import List
-from cognee.eval_framework.corpus_builder.task_getters.base_task_getter import BaseTaskGetter
-from cognee.modules.pipelines.tasks.Task import Task
-from cognee.infrastructure.llm import get_max_chunk_tokens
-from cognee.modules.chunking.TextChunker import TextChunker
-
-
-class DefaultTaskGetter(BaseTaskGetter):
-    """Default task getter that retrieves tasks using the standard get_default_tasks function."""
-
-    async def get_tasks(self, chunk_size=1024, chunker=TextChunker) -> List[Task]:
-        """Retrieve default tasks asynchronously."""
-        return await get_default_tasks(chunk_size=chunk_size, chunker=chunker)
--- a/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py
+++ b/cognee/eval_framework/corpus_builder/task_getters/get_default_tasks_by_indices.py
@ -0,0 +1,32 @@
+from typing import List, Awaitable, Optional
+from cognee.api.v1.cognify.cognify_v2 import get_default_tasks
+from cognee.modules.pipelines.tasks.Task import Task
+from cognee.modules.chunking.TextChunker import TextChunker
+
+
+async def get_default_tasks_by_indices(
+    indices: List[int], chunk_size: int = None, chunker=TextChunker
+) -> List[Task]:
+    """Returns default tasks filtered by the provided indices."""
+    all_tasks = await get_default_tasks(chunker=chunker, chunk_size=chunk_size)
+
+    if any(i < 0 or i >= len(all_tasks) for i in indices):
+        raise IndexError(
+            f"Task indices {indices} out of range. Valid range: 0-{len(all_tasks) - 1}"
+        )
+
+    return [all_tasks[i] for i in indices]
+
+
+async def get_no_summary_tasks(chunk_size: int = None, chunker=TextChunker) -> List[Task]:
+    """Returns default tasks without summarization tasks."""
+    # Default tasks indices: 0=classify, 1=check_permissions, 2=extract_chunks, 3=extract_graph, 4=summarize, 5=add_data_points
+    return await get_default_tasks_by_indices(
+        [0, 1, 2, 3, 5], chunk_size=chunk_size, chunker=chunker
+    )
+
+
+async def get_just_chunks_tasks(chunk_size: int = None, chunker=TextChunker) -> List[Task]:
+    """Returns default tasks without the graph extraction and summarization tasks."""
+    # Default tasks indices: 0=classify, 1=check_permissions, 2=extract_chunks, 3=extract_graph, 4=summarize, 5=add_data_points
+    return await get_default_tasks_by_indices([0, 1, 2, 5], chunk_size=chunk_size, chunker=chunker)
--- a/cognee/eval_framework/eval_config.py
+++ b/cognee/eval_framework/eval_config.py
@ -8,7 +8,9 @@ class EvalConfig(BaseSettings):
    building_corpus_from_scratch: bool = True
    number_of_samples_in_corpus: int = 1
    benchmark: str = "Dummy"  # Options: 'HotPotQA', 'Dummy', 'TwoWikiMultiHop'
-    task_getter_type: str = "Default"  # Options: 'Default', 'CascadeGraph'
+    task_getter_type: str = (
+        "Default"  # Options: 'Default', 'CascadeGraph', 'NoSummaries', 'JustChunks'
+    )

    # Question answering params
    answering_questions: bool = True