fix: expose chunk_size for eval framework [COG-1546] (#634)


## Description
- Exposed `chunk_size` in `get_default_tasks` in cognify.
- Reintegrated `chunk_size` into corpus building in the eval framework (see the usage sketch below).
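
For reference, a minimal usage sketch of the new parameter. The import path is an assumption, not confirmed by this PR; the signature matches the diff below:

```python
import asyncio

# Assumption: illustrative import path, not confirmed by this PR.
from cognee.api.v1.cognify import get_default_tasks

async def main():
    # chunk_size is optional; when left as None, chunk extraction falls back
    # to get_max_chunk_tokens(), preserving the previous behavior.
    tasks = await get_default_tasks(chunk_size=512)

asyncio.run(main())
```
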
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.


## Summary by CodeRabbit

- **New Features**
  - Introduced an optional `chunk_size` parameter so users can set a custom chunk size during corpus creation and cognify task execution; when unset, chunking falls back to the existing `get_max_chunk_tokens()` default.

Commit 38d527ceac (parent 6fcfb3c398), authored by lxobr on 2025-03-12 16:13:20 +01:00 and committed by GitHub.
3 changed files with 4 additions and 2 deletions

```diff
@@ -118,6 +118,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     user: User = None,
     graph_model: BaseModel = KnowledgeGraph,
     chunker=TextChunker,
+    chunk_size: int = None,
     ontology_file_path: Optional[str] = None,
 ) -> list[Task]:
     if user is None:
@@ -131,7 +132,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
         Task(check_permissions_on_documents, user=user, permissions=["write"]),
         Task(
             extract_chunks_from_documents,
-            max_chunk_size=get_max_chunk_tokens(),
+            max_chunk_size=chunk_size or get_max_chunk_tokens(),
             chunker=chunker,
         ),  # Extract text chunks based on the document type.
         Task(
```
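
Note that `chunk_size or get_max_chunk_tokens()` falls back on any falsy value, so `0` behaves like `None`. A standalone sketch of the semantics, with a stand-in default since `get_max_chunk_tokens()` lives in cognee's infrastructure:

```python
def resolve_chunk_size(chunk_size=None, default_tokens=8192):
    """Mirrors `chunk_size or get_max_chunk_tokens()`; 8192 is a stand-in default."""
    # Any falsy value (None, but also 0) falls back to the default.
    return chunk_size or default_tokens

assert resolve_chunk_size(None) == 8192
assert resolve_chunk_size(512) == 512
assert resolve_chunk_size(0) == 8192  # 0 does not mean "zero-size chunks"
```
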

```diff
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
         await cognee.add(self.raw_corpus)
-        tasks = await self.task_getter(chunker=TextChunker)
+        tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker)
         await cognee.cognify(tasks=tasks)
```
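
Since the executor now forwards both keyword arguments unconditionally, any custom task getter wired into `CorpusBuilderExecutor` has to accept them. A hypothetical sketch (the getter name and body are placeholders):

```python
# Hypothetical custom getter; name and body are placeholders.
async def my_task_getter(chunk_size=None, chunker=None):
    # Must accept both kwargs, because CorpusBuilderExecutor now calls
    # task_getter(chunk_size=chunk_size, chunker=chunker).
    return []
```
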

```diff
@@ -49,6 +49,7 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
     questions = await corpus_builder.build_corpus(
         limit=params.get("number_of_samples_in_corpus"),
         chunker=chunker,
+        chunk_size=chunk_size,
         load_golden_context=params.get("evaluating_contexts"),
     )
     with open(params["questions_path"], "w", encoding="utf-8") as f:
```
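
An end-to-end sketch of the eval entry point, using only names visible in this hunk; the import path and params values are assumptions:

```python
import asyncio

# Assumption: illustrative import path, not confirmed by this PR.
from eval_framework.corpus_builder.run_corpus_builder import run_corpus_builder

params = {
    "number_of_samples_in_corpus": 10,   # forwarded as build_corpus(limit=...)
    "evaluating_contexts": True,         # forwarded as load_golden_context=...
    "questions_path": "questions.json",  # where the built questions are written
}

# chunk_size defaults to 1024; an explicit value now reaches
# extract_chunks_from_documents end to end.
asyncio.run(run_corpus_builder(params, chunk_size=512))
```
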