From 38d527ceacf8b69e5d45a5006875838181b69724 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Wed, 12 Mar 2025 16:13:20 +0100 Subject: [PATCH] fix: expose chunk_size for eval framework [COG-1546] (#634) ## Description - Exposed chunk_size in get_default_tasks in cognify - Reintegrated chunk_size in corpus building in eval framework ## DCO Affirmation I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin. ## Summary by CodeRabbit - **New Features** - Introduced an optional `chunk_size` parameter that lets users set a custom chunk size for document chunking; when it is omitted, the default maximum chunk token count is used. This enhances flexibility in managing content processing and task execution, enabling more dynamic control over resource handling during corpus creation and related operations. --- cognee/api/v1/cognify/cognify_v2.py | 3 ++- .../eval_framework/corpus_builder/corpus_builder_executor.py | 2 +- cognee/eval_framework/corpus_builder/run_corpus_builder.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cognee/api/v1/cognify/cognify_v2.py b/cognee/api/v1/cognify/cognify_v2.py index d7c40ae1a..d638f9c10 100644 --- a/cognee/api/v1/cognify/cognify_v2.py +++ b/cognee/api/v1/cognify/cognify_v2.py @@ -118,6 +118,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's user: User = None, graph_model: BaseModel = KnowledgeGraph, chunker=TextChunker, + chunk_size: int = None, ontology_file_path: Optional[str] = None, ) -> list[Task]: if user is None: @@ -131,7 +132,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's Task(check_permissions_on_documents, user=user, permissions=["write"]), Task( extract_chunks_from_documents, - max_chunk_size=get_max_chunk_tokens(), + max_chunk_size=chunk_size or get_max_chunk_tokens(), chunker=chunker, ), # Extract text chunks based on the document type. 
Task( diff --git a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py index e1c30a1f7..2a6ff63ce 100644 --- a/cognee/eval_framework/corpus_builder/corpus_builder_executor.py +++ b/cognee/eval_framework/corpus_builder/corpus_builder_executor.py @@ -55,5 +55,5 @@ class CorpusBuilderExecutor: await cognee.add(self.raw_corpus) - tasks = await self.task_getter(chunker=TextChunker) + tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker) await cognee.cognify(tasks=tasks) diff --git a/cognee/eval_framework/corpus_builder/run_corpus_builder.py b/cognee/eval_framework/corpus_builder/run_corpus_builder.py index b7bd2c4f7..f443cfcac 100644 --- a/cognee/eval_framework/corpus_builder/run_corpus_builder.py +++ b/cognee/eval_framework/corpus_builder/run_corpus_builder.py @@ -49,6 +49,7 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker) questions = await corpus_builder.build_corpus( limit=params.get("number_of_samples_in_corpus"), chunker=chunker, + chunk_size=chunk_size, load_golden_context=params.get("evaluating_contexts"), ) with open(params["questions_path"], "w", encoding="utf-8") as f: