diff --git a/cognee/api/v1/cognify/cognify.py b/cognee/api/v1/cognify/cognify.py index ffc903d68..bbe00c35f 100644 --- a/cognee/api/v1/cognify/cognify.py +++ b/cognee/api/v1/cognify/cognify.py @@ -252,7 +252,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's chunk_size: int = None, config: Config = None, custom_prompt: Optional[str] = None, - chunks_per_batch: int = 100, + chunks_per_batch: int = None, **kwargs, ) -> list[Task]: if config is None: @@ -272,12 +272,14 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's "ontology_config": {"ontology_resolver": get_default_ontology_resolver()} } - if chunks_per_batch is None: - chunks_per_batch = 100 - cognify_config = get_cognify_config() embed_triplets = cognify_config.triplet_embedding + if chunks_per_batch is None: + chunks_per_batch = ( + cognify_config.chunks_per_batch if cognify_config.chunks_per_batch is not None else 100 + ) + default_tasks = [ Task(classify_documents), Task( @@ -308,7 +310,7 @@ async def get_default_tasks( # TODO: Find out a better way to do this (Boris's async def get_temporal_tasks( - user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10 + user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None ) -> list[Task]: """ Builds and returns a list of temporal processing tasks to be executed in sequence. @@ -330,7 +332,10 @@ async def get_temporal_tasks( list[Task]: A list of Task objects representing the temporal processing pipeline. """ if chunks_per_batch is None: - chunks_per_batch = 10 + from cognee.modules.cognify.config import get_cognify_config + + configured = get_cognify_config().chunks_per_batch + chunks_per_batch = configured if configured is not None else 10 temporal_tasks = [ Task(classify_documents), diff --git a/cognee/api/v1/cognify/routers/get_cognify_router.py b/cognee/api/v1/cognify/routers/get_cognify_router.py index a499b3ca3..0e2bf2bda 100644 --- a/cognee/api/v1/cognify/routers/get_cognify_router.py +++ b/cognee/api/v1/cognify/routers/get_cognify_router.py @@ -46,6 +46,11 @@ class CognifyPayloadDTO(InDTO): examples=[[]], description="Reference to one or more previously uploaded ontologies", ) + chunks_per_batch: Optional[int] = Field( + default=None, + description="Number of chunks to process per task batch in Cognify (overrides default).", + examples=[10, 20, 50, 100], + ) def get_cognify_router() -> APIRouter: @@ -146,6 +151,7 @@ def get_cognify_router() -> APIRouter: config=config_to_use, run_in_background=payload.run_in_background, custom_prompt=payload.custom_prompt, + chunks_per_batch=payload.chunks_per_batch, ) # If any cognify run errored return JSONResponse with proper error status code diff --git a/cognee/cli/commands/cognify_command.py b/cognee/cli/commands/cognify_command.py index b89c1f70e..6c278b4dc 100644 --- a/cognee/cli/commands/cognify_command.py +++ b/cognee/cli/commands/cognify_command.py @@ -62,6 +62,11 @@ After successful cognify processing, use `cognee search` to query the knowledge parser.add_argument( "--verbose", "-v", action="store_true", help="Show detailed progress information" ) + parser.add_argument( + "--chunks-per-batch", + type=int, + help="Number of chunks to process per task batch (try 50 for large single documents).", + ) def execute(self, args: argparse.Namespace) -> None: try: @@ -111,6 +116,7 @@ After successful cognify processing, use `cognee search` to query the knowledge chunk_size=args.chunk_size, ontology_file_path=args.ontology_file, run_in_background=args.background, + chunks_per_batch=args.chunks_per_batch, ) return result except Exception as e: diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index ec03225e8..223392375 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -9,6 +9,7 @@ class CognifyConfig(BaseSettings): classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent triplet_embedding: bool = False + chunks_per_batch: Optional[int] = None model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) -> dict: @@ -16,6 +17,7 @@ class CognifyConfig(BaseSettings): "classification_model": self.classification_model, "summarization_model": self.summarization_model, "triplet_embedding": self.triplet_embedding, + "chunks_per_batch": self.chunks_per_batch, }