From 38d527ceacf8b69e5d45a5006875838181b69724 Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Wed, 12 Mar 2025 16:13:20 +0100
Subject: [PATCH] fix: expose chunk_size for eval framework [COG-1546] (#634)

<!-- .github/pull_request_template.md -->

## Description
<!-- Provide a clear description of the changes in this PR -->
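- Exposed `chunk_size` as an optional parameter of `get_default_tasks` in
  cognify; when it is not set, `get_max_chunk_tokens()` is used as before
- Reintegrated `chunk_size` into corpus building in the eval framework:
  `run_corpus_builder` now passes it to `build_corpus`, which forwards it
  (together with `chunker`) to the task getter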
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
  - Introduced an optional configuration parameter (`chunk_size`) to allow
    users to set custom processing segment sizes. This enhances flexibility
    in managing content processing and task execution, enabling more dynamic
    control over resource handling during corpus creation and related
    operations.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
---
 cognee/api/v1/cognify/cognify_v2.py                            | 3 ++-
 .../eval_framework/corpus_builder/corpus_builder_executor.py   | 2 +-
 cognee/eval_framework/corpus_builder/run_corpus_builder.py     | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py
index d7c40ae1a..d638f9c10 100644
--- a/cognee/api/v1/cognify/cognify_v2.py
+++ b/cognee/api/v1/cognify/cognify_v2.py
@@ -118,6 +118,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     user: User = None,
     graph_model: BaseModel = KnowledgeGraph,
     chunker=TextChunker,
+    chunk_size: int = None,
     ontology_file_path: Optional[str] = None,
 ) -> list[Task]:
     if user is None:
@@ -131,7 +132,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             Task(check_permissions_on_documents, user=user, permissions=["write"]),
             Task(
                 extract_chunks_from_documents,
-                max_chunk_size=get_max_chunk_tokens(),
+                max_chunk_size=chunk_size or get_max_chunk_tokens(),
                 chunker=chunker,
             ),  # Extract text chunks based on the document type.
             Task(
diff --git a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
index e1c30a1f7..2a6ff63ce 100644
--- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
+++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
 
         await cognee.add(self.raw_corpus)
 
-        tasks = await self.task_getter(chunker=TextChunker)
+        tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker)
         await cognee.cognify(tasks=tasks)
diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
index b7bd2c4f7..f443cfcac 100644
--- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py
+++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py
@@ -49,6 +49,7 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
         questions = await corpus_builder.build_corpus(
             limit=params.get("number_of_samples_in_corpus"),
             chunker=chunker,
+            chunk_size=chunk_size,
             load_golden_context=params.get("evaluating_contexts"),
         )
         with open(params["questions_path"], "w", encoding="utf-8") as f: