fix: expose chunk_size for eval framework [COG-1546] (#634)


## Description
- Exposed `chunk_size` in `get_default_tasks` in cognify.
- Reintegrated `chunk_size` into corpus building in the eval framework (see the usage sketch below).
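
For reference, a minimal usage sketch of the new parameter. The import path is an assumption, not confirmed by this PR; the signature matches the diff below:

```python
import asyncio

# Assumption: illustrative import path, not confirmed by this PR.
from cognee.api.v1.cognify import get_default_tasks

async def main():
    # chunk_size is optional; when left as None, chunk extraction falls back
    # to get_max_chunk_tokens(), preserving the previous behavior.
    tasks = await get_default_tasks(chunk_size=512)

asyncio.run(main())
```
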
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


## Summary by CodeRabbit

- **New Features**
  - Introduced an optional `chunk_size` parameter so users can set a custom chunk size during corpus creation and cognify task execution; when unset, chunking falls back to the existing `get_max_chunk_tokens()` default.

Commit 38d527ceac (parent 6fcfb3c398), authored by lxobr on 2025-03-12 16:13:20 +01:00 and committed by GitHub.
3 changed files with 4 additions and 2 deletions

```diff
@@ -118,6 +118,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     user: User = None,
     graph_model: BaseModel = KnowledgeGraph,
     chunker=TextChunker,
+    chunk_size: int = None,
     ontology_file_path: Optional[str] = None,
 ) -> list[Task]:
     if user is None:
@@ -131,7 +132,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
         Task(check_permissions_on_documents, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
-            max_chunk_size=get_max_chunk_tokens(),
+            max_chunk_size=chunk_size or get_max_chunk_tokens(),
             chunker=chunker,
         ),  # Extract text chunks based on the document type.
         Task(
```
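
Note that `chunk_size or get_max_chunk_tokens()` falls back on any falsy value, so `0` behaves like `None`. A standalone sketch of the semantics, with a stand-in default since `get_max_chunk_tokens()` lives in cognee's infrastructure:

```python
def resolve_chunk_size(chunk_size=None, default_tokens=8192):
    """Mirrors `chunk_size or get_max_chunk_tokens()`; 8192 is a stand-in default."""
    # Any falsy value (None, but also 0) falls back to the default.
    return chunk_size or default_tokens

assert resolve_chunk_size(None) == 8192
assert resolve_chunk_size(512) == 512
assert resolve_chunk_size(0) == 8192  # 0 does not mean "zero-size chunks"
```
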

```diff
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)
-        tasks = await self.task_getter(chunker=TextChunker)
+        tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker)
         await cognee.cognify(tasks=tasks)
```
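
Since the executor now forwards both keyword arguments unconditionally, any custom task getter wired into `CorpusBuilderExecutor` has to accept them. A hypothetical sketch (the getter name and body are placeholders):

```python
# Hypothetical custom getter; name and body are placeholders.
async def my_task_getter(chunk_size=None, chunker=None):
    # Must accept both kwargs, because CorpusBuilderExecutor now calls
    # task_getter(chunk_size=chunk_size, chunker=chunker).
    return []
```
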

```diff
@@ -49,6 +49,7 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
         chunker=chunker,
+        chunk_size=chunk_size,
         load_golden_context=params.get("evaluating_contexts"),
     )
     with open(params["questions_path"], "w", encoding="utf-8") as f:
```
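
An end-to-end sketch of the eval entry point, using only names visible in this hunk; the import path and params values are assumptions:

```python
import asyncio

# Assumption: illustrative import path, not confirmed by this PR.
from eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder

params = {
    "number_of_samples_in_corpus": 10,   # forwarded as build_corpus(limit=...)
    "evaluating_contexts": True,         # forwarded as load_golden_context=...
    "questions_path": "questions.json",  # where the built questions are written
}

# chunk_size defaults to 1024; an explicit value now reaches
# extract_chunks_from_documents end to end.
asyncio.run(run_corpus_builder(params, chunk_size=512))
```
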