fix: expose chunk_size for eval framework [COG-1546] (#634)
<!-- .github/pull_request_template.md -->

## Description

<!-- Provide a clear description of the changes in this PR -->

- Exposed `chunk_size` in `get_default_tasks` in cognify
- Reintegrated `chunk_size` in corpus building in the eval framework

## DCO Affirmation

I affirm that all code in every commit of this pull request conforms to the terms of the Topoteretes Developer Certificate of Origin.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Introduced an optional configuration parameter to allow users to set custom processing segment sizes. This enhances flexibility in managing content processing and task execution, enabling more dynamic control over resource handling during corpus creation and related operations.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
parent
6fcfb3c398
commit
38d527ceac
3 changed files with 4 additions and 2 deletions
|
|
@@ -118,6 +118,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
|||
user: User = None,
|
||||
graph_model: BaseModel = KnowledgeGraph,
|
||||
chunker=TextChunker,
|
||||
chunk_size: int = None,
|
||||
ontology_file_path: Optional[str] = None,
|
||||
) -> list[Task]:
|
||||
if user is None:
|
||||
|
|
@@ -131,7 +132,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's
|
|||
Task(check_permissions_on_documents, user=user, permissions=["write"]),
|
||||
Task(
|
||||
extract_chunks_from_documents,
|
||||
max_chunk_size=get_max_chunk_tokens(),
|
||||
max_chunk_size=chunk_size or get_max_chunk_tokens(),
|
||||
chunker=chunker,
|
||||
), # Extract text chunks based on the document type.
|
||||
Task(
|
||||
|
|
|
|||
|
|
@@ -55,5 +55,5 @@ class CorpusBuilderExecutor:
|
|||
|
||||
await cognee.add(self.raw_corpus)
|
||||
|
||||
tasks = await self.task_getter(chunker=TextChunker)
|
||||
tasks = await self.task_getter(chunk_size=chunk_size, chunker=chunker)
|
||||
await cognee.cognify(tasks=tasks)
|
||||
|
|
|
|||
|
|
@@ -49,6 +49,7 @@ async def run_corpus_builder(params: dict, chunk_size=1024, chunker=TextChunker)
|
|||
questions = await corpus_builder.build_corpus(
|
||||
limit=params.get("number_of_samples_in_corpus"),
|
||||
chunker=chunker,
|
||||
chunk_size=chunk_size,
|
||||
load_golden_context=params.get("evaluating_contexts"),
|
||||
)
|
||||
with open(params["questions_path"], "w", encoding="utf-8") as f:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue