feat: Configurable batch size (#1941)
<!-- .github/pull_request_template.md -->
## Description
<!--
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->
## Acceptance Criteria
<!--
* Key requirements for the new feature or modification;
* Proof that the changes work and meet the requirements;
* Include instructions on how to verify the changes. Describe how to
test it locally;
* Proof that it's sufficiently tested.
-->
## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):
## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->
## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Added configurable chunks-per-batch to control per-batch processing
size via CLI flag, API payload, and configuration; defaults are now
driven by config with an automatic fallback.
* **Style / Documentation**
* Updated contribution/style guidelines (formatting, line length,
string-quote rule, pre-commit note).
* **Tests**
* Updated CLI tests to verify propagation of the new chunks-per-batch
parameter.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
Commit 1d674d459f (7 changed files with 35 additions and 10 deletions)

CLAUDE.md

@@ -427,10 +427,12 @@ git checkout -b feature/your-feature-name

 ## Code Style

-- Ruff for linting and formatting (configured in `pyproject.toml`)
-- Line length: 100 characters
-- Pre-commit hooks run ruff automatically
-- Type hints encouraged (mypy checks enabled)
+- **Formatter**: Ruff (configured in `pyproject.toml`)
+- **Line length**: 100 characters
+- **String quotes**: Use double quotes `"` not single quotes `'` (enforced by ruff-format)
+- **Pre-commit hooks**: Run ruff linting and formatting automatically
+- **Type hints**: Encouraged (mypy checks enabled)
+- **Important**: Always run `pre-commit run --all-files` before committing to catch formatting issues

 ## Testing Strategy
@@ -252,7 +252,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     chunk_size: int = None,
     config: Config = None,
     custom_prompt: Optional[str] = None,
-    chunks_per_batch: int = 100,
+    chunks_per_batch: int = None,
     **kwargs,
 ) -> list[Task]:
     if config is None:
@@ -272,12 +272,14 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
         }

-    if chunks_per_batch is None:
-        chunks_per_batch = 100
-
     cognify_config = get_cognify_config()
     embed_triplets = cognify_config.triplet_embedding

+    if chunks_per_batch is None:
+        chunks_per_batch = (
+            cognify_config.chunks_per_batch if cognify_config.chunks_per_batch is not None else 100
+        )
+
     default_tasks = [
         Task(classify_documents),
         Task(
@@ -308,7 +310,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's


 async def get_temporal_tasks(
-    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None
 ) -> list[Task]:
     """
     Builds and returns a list of temporal processing tasks to be executed in sequence.
@@ -330,7 +332,10 @@ async def get_temporal_tasks(
         list[Task]: A list of Task objects representing the temporal processing pipeline.
     """
-    if chunks_per_batch is None:
-        chunks_per_batch = 10
+    from cognee.modules.cognify.config import get_cognify_config
+
+    configured = get_cognify_config().chunks_per_batch
+    chunks_per_batch = configured if configured is not None else 10

     temporal_tasks = [
         Task(classify_documents),
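Both hunks above replace a hard-coded default with a config-driven fallback: an explicitly passed `chunks_per_batch` still wins, otherwise `CognifyConfig.chunks_per_batch` is used, and only if that is unset do the old defaults apply (100 for `get_default_tasks`, 10 for `get_temporal_tasks`). A minimal standalone sketch of that resolution order; the helper function is illustrative only and not part of the PR:

```python
from typing import Optional


def resolve_chunks_per_batch(
    explicit: Optional[int], configured: Optional[int], fallback: int
) -> int:
    """Illustrative only: explicit argument wins, then the configured value, then the fallback."""
    if explicit is not None:
        return explicit
    return configured if configured is not None else fallback


# Mirrors the logic in the hunks above.
assert resolve_chunks_per_batch(None, None, 100) == 100  # nothing set: old default
assert resolve_chunks_per_batch(None, 50, 100) == 50     # configured value takes over
assert resolve_chunks_per_batch(25, 50, 100) == 25       # explicit argument still wins
```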
@@ -46,6 +46,11 @@ class CognifyPayloadDTO(InDTO):
         examples=[[]],
         description="Reference to one or more previously uploaded ontologies",
     )
+    chunks_per_batch: Optional[int] = Field(
+        default=None,
+        description="Number of chunks to process per task batch in Cognify (overrides default).",
+        examples=[10, 20, 50, 100],
+    )


 def get_cognify_router() -> APIRouter:
@@ -146,6 +151,7 @@ def get_cognify_router() -> APIRouter:
                 config=config_to_use,
                 run_in_background=payload.run_in_background,
                 custom_prompt=payload.custom_prompt,
+                chunks_per_batch=payload.chunks_per_batch,
             )

             # If any cognify run errored return JSONResponse with proper error status code
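With the `chunks_per_batch` field added to `CognifyPayloadDTO` and forwarded by the router, API clients can request a specific batch size per call. A hedged sketch of such a request; the URL, port, and the `datasets` field are assumptions about the deployment, not taken from this diff:

```python
import requests

# Assumed endpoint and payload shape; adjust to your cognee deployment.
response = requests.post(
    "http://localhost:8000/api/v1/cognify",  # assumed mount point for the cognify router
    json={
        "datasets": ["my_dataset"],    # assumed field, not shown in this diff
        "run_in_background": False,    # field referenced by the router in the hunk above
        "chunks_per_batch": 50,        # new optional field introduced by this PR
    },
    timeout=300,
)
response.raise_for_status()
print(response.json())
```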
@@ -62,6 +62,11 @@ After successful cognify processing, use `cognee search` to query the knowledge
         parser.add_argument(
             "--verbose", "-v", action="store_true", help="Show detailed progress information"
         )
+        parser.add_argument(
+            "--chunks-per-batch",
+            type=int,
+            help="Number of chunks to process per task batch (try 50 for large single documents).",
+        )

     def execute(self, args: argparse.Namespace) -> None:
         try:
@@ -111,6 +116,7 @@ After successful cognify processing, use `cognee search` to query the knowledge
                     chunk_size=args.chunk_size,
                     ontology_file_path=args.ontology_file,
                     run_in_background=args.background,
+                    chunks_per_batch=getattr(args, "chunks_per_batch", None),
                 )
                 return result
             except Exception as e:
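On the CLI side the flag is read with `getattr(args, "chunks_per_batch", None)`, so an invocation without the flag still falls through to the config-driven default. Something like `cognee cognify --chunks-per-batch 50` should therefore reach the pipeline, though the exact command layout is not shown in this diff. A minimal argparse sketch of the same pattern:

```python
import argparse

# Standalone sketch of the flag added above; the real CLI wires this into a command class.
parser = argparse.ArgumentParser(prog="cognify-sketch")
parser.add_argument(
    "--chunks-per-batch",
    type=int,
    help="Number of chunks to process per task batch (try 50 for large single documents).",
)

args = parser.parse_args(["--chunks-per-batch", "50"])
print(getattr(args, "chunks_per_batch", None))  # 50

args = parser.parse_args([])
print(getattr(args, "chunks_per_batch", None))  # None, so config or fallback applies downstream
```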
@@ -9,6 +9,7 @@ class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
     triplet_embedding: bool = False
+    chunks_per_batch: Optional[int] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")

     def to_dict(self) -> dict:
@@ -16,6 +17,7 @@ class CognifyConfig(BaseSettings):
             "classification_model": self.classification_model,
             "summarization_model": self.summarization_model,
             "triplet_embedding": self.triplet_embedding,
+            "chunks_per_batch": self.chunks_per_batch,
         }
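`CognifyConfig` is a pydantic `BaseSettings` class loaded from `.env` (`env_file=".env"` above), so the new field can presumably also be supplied through the environment. A minimal sketch, assuming the default pydantic-settings field-to-variable mapping; the variable name `CHUNKS_PER_BATCH` is an assumption, not confirmed by this diff:

```python
import os

# Assumption: chunks_per_batch maps to CHUNKS_PER_BATCH via pydantic-settings defaults.
# Set it before the config is first read.
os.environ["CHUNKS_PER_BATCH"] = "50"

from cognee.modules.cognify.config import get_cognify_config  # import path taken from the diff above

config = get_cognify_config()
print(config.chunks_per_batch)  # expected: 50 if the mapping assumption holds
```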
@@ -238,6 +238,7 @@ class TestCognifyCommand:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run")
@@ -262,6 +262,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run", side_effect=_mock_run)
@@ -295,6 +296,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path="/nonexistent/path/ontology.owl",
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run")
@@ -373,6 +375,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )