feat: Configurable batch size (#1941)
<!-- .github/pull_request_template.md -->
## Description
<!--
Please provide a clear, human-generated description of the changes in
this PR.
DO NOT use AI-generated descriptions. We want to understand your thought
process and reasoning.
-->
## Acceptance Criteria
<!--
* Key requirements for the new feature or modification;
* Proof that the changes work and meet the requirements;
* Include instructions on how to verify the changes. Describe how to
test it locally;
* Proof that it's sufficiently tested.
-->
## Type of Change
<!-- Please check the relevant option -->
- [ ] Bug fix (non-breaking change that fixes an issue)
- [ ] New feature (non-breaking change that adds functionality)
- [ ] Breaking change (fix or feature that would cause existing
functionality to change)
- [ ] Documentation update
- [ ] Code refactoring
- [ ] Performance improvement
- [ ] Other (please specify):
## Screenshots/Videos (if applicable)
<!-- Add screenshots or videos to help explain your changes -->
## Pre-submission Checklist
<!-- Please check all boxes that apply before submitting your PR -->
- [ ] **I have tested my changes thoroughly before submitting this PR**
- [ ] **This PR contains minimal changes necessary to address the
issue/feature**
- [ ] My code follows the project's coding standards and style
guidelines
- [ ] I have added tests that prove my fix is effective or that my
feature works
- [ ] I have added necessary documentation (if applicable)
- [ ] All new and existing tests pass
- [ ] I have searched existing PRs to ensure this change hasn't been
submitted already
- [ ] I have linked any relevant issues in the description
- [ ] My commits have clear and descriptive messages
## DCO Affirmation
I affirm that all code in every commit of this pull request conforms to
the terms of the Topoteretes Developer Certificate of Origin.
<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit
* **New Features**
* Added configurable chunks-per-batch to control per-batch processing
size via CLI flag, API payload, and configuration; defaults are now
driven by config with an automatic fallback.
* **Style / Documentation**
* Updated contribution/style guidelines (formatting, line length,
string-quote rule, pre-commit note).
* **Tests**
* Updated CLI tests to verify propagation of the new chunks-per-batch
parameter.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
Commit 1d674d459f (7 changed files with 35 additions and 10 deletions)

CLAUDE.md

@@ -427,10 +427,12 @@ git checkout -b feature/your-feature-name

 ## Code Style

-- Ruff for linting and formatting (configured in `pyproject.toml`)
-- Line length: 100 characters
-- Pre-commit hooks run ruff automatically
-- Type hints encouraged (mypy checks enabled)
+- **Formatter**: Ruff (configured in `pyproject.toml`)
+- **Line length**: 100 characters
+- **String quotes**: Use double quotes `"` not single quotes `'` (enforced by ruff-format)
+- **Pre-commit hooks**: Run ruff linting and formatting automatically
+- **Type hints**: Encouraged (mypy checks enabled)
+- **Important**: Always run `pre-commit run --all-files` before committing to catch formatting issues

 ## Testing Strategy
@@ -252,7 +252,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
     chunk_size: int = None,
     config: Config = None,
     custom_prompt: Optional[str] = None,
-    chunks_per_batch: int = 100,
+    chunks_per_batch: int = None,
     **kwargs,
 ) -> list[Task]:
     if config is None:
@@ -272,12 +272,14 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's
             "ontology_config": {"ontology_resolver": get_default_ontology_resolver()}
         }

-    if chunks_per_batch is None:
-        chunks_per_batch = 100
-
     cognify_config = get_cognify_config()
     embed_triplets = cognify_config.triplet_embedding

+    if chunks_per_batch is None:
+        chunks_per_batch = (
+            cognify_config.chunks_per_batch if cognify_config.chunks_per_batch is not None else 100
+        )
+
     default_tasks = [
         Task(classify_documents),
         Task(
@@ -308,7 +310,7 @@ async def get_default_tasks(  # TODO: Find out a better way to do this (Boris's


 async def get_temporal_tasks(
-    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = 10
+    user: User = None, chunker=TextChunker, chunk_size: int = None, chunks_per_batch: int = None
 ) -> list[Task]:
     """
     Builds and returns a list of temporal processing tasks to be executed in sequence.
@@ -330,7 +332,10 @@ async def get_temporal_tasks(
         list[Task]: A list of Task objects representing the temporal processing pipeline.
     """
-    if chunks_per_batch is None:
-        chunks_per_batch = 10
+    from cognee.modules.cognify.config import get_cognify_config
+
+    configured = get_cognify_config().chunks_per_batch
+    chunks_per_batch = configured if configured is not None else 10

     temporal_tasks = [
         Task(classify_documents),
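Both hunks above replace a hard-coded default with a config-driven fallback: an explicitly passed `chunks_per_batch` still wins, otherwise `CognifyConfig.chunks_per_batch` is used, and only if that is unset do the old defaults apply (100 for `get_default_tasks`, 10 for `get_temporal_tasks`). A minimal standalone sketch of that resolution order; the helper function is illustrative only and not part of the PR:

```python
from typing import Optional


def resolve_chunks_per_batch(
    explicit: Optional[int], configured: Optional[int], fallback: int
) -> int:
    """Illustrative only: explicit argument wins, then the configured value, then the fallback."""
    if explicit is not None:
        return explicit
    return configured if configured is not None else fallback


# Mirrors the logic in the hunks above.
assert resolve_chunks_per_batch(None, None, 100) == 100  # nothing set: old default
assert resolve_chunks_per_batch(None, 50, 100) == 50     # configured value takes over
assert resolve_chunks_per_batch(25, 50, 100) == 25       # explicit argument still wins
```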
@@ -46,6 +46,11 @@ class CognifyPayloadDTO(InDTO):
         examples=[[]],
         description="Reference to one or more previously uploaded ontologies",
     )
+    chunks_per_batch: Optional[int] = Field(
+        default=None,
+        description="Number of chunks to process per task batch in Cognify (overrides default).",
+        examples=[10, 20, 50, 100],
+    )


 def get_cognify_router() -> APIRouter:
@@ -146,6 +151,7 @@ def get_cognify_router() -> APIRouter:
                 config=config_to_use,
                 run_in_background=payload.run_in_background,
                 custom_prompt=payload.custom_prompt,
+                chunks_per_batch=payload.chunks_per_batch,
             )

             # If any cognify run errored return JSONResponse with proper error status code
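With the `chunks_per_batch` field added to `CognifyPayloadDTO` and forwarded by the router, API clients can request a specific batch size per call. A hedged sketch of such a request; the URL, port, and the `datasets` field are assumptions about the deployment, not taken from this diff:

```python
import requests

# Assumed endpoint and payload shape; adjust to your cognee deployment.
response = requests.post(
    "http://localhost:8000/api/v1/cognify",  # assumed mount point for the cognify router
    json={
        "datasets": ["my_dataset"],    # assumed field, not shown in this diff
        "run_in_background": False,    # field referenced by the router in the hunk above
        "chunks_per_batch": 50,        # new optional field introduced by this PR
    },
    timeout=300,
)
response.raise_for_status()
print(response.json())
```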
@@ -62,6 +62,11 @@ After successful cognify processing, use `cognee search` to query the knowledge
         parser.add_argument(
             "--verbose", "-v", action="store_true", help="Show detailed progress information"
         )
+        parser.add_argument(
+            "--chunks-per-batch",
+            type=int,
+            help="Number of chunks to process per task batch (try 50 for large single documents).",
+        )

     def execute(self, args: argparse.Namespace) -> None:
         try:
@@ -111,6 +116,7 @@ After successful cognify processing, use `cognee search` to query the knowledge
                     chunk_size=args.chunk_size,
                     ontology_file_path=args.ontology_file,
                     run_in_background=args.background,
+                    chunks_per_batch=getattr(args, "chunks_per_batch", None),
                 )
                 return result
             except Exception as e:
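On the CLI side the flag is read with `getattr(args, "chunks_per_batch", None)`, so an invocation without the flag still falls through to the config-driven default. Something like `cognee cognify --chunks-per-batch 50` should therefore reach the pipeline, though the exact command layout is not shown in this diff. A minimal argparse sketch of the same pattern:

```python
import argparse

# Standalone sketch of the flag added above; the real CLI wires this into a command class.
parser = argparse.ArgumentParser(prog="cognify-sketch")
parser.add_argument(
    "--chunks-per-batch",
    type=int,
    help="Number of chunks to process per task batch (try 50 for large single documents).",
)

args = parser.parse_args(["--chunks-per-batch", "50"])
print(getattr(args, "chunks_per_batch", None))  # 50

args = parser.parse_args([])
print(getattr(args, "chunks_per_batch", None))  # None, so config or fallback applies downstream
```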
@@ -9,6 +9,7 @@ class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
     triplet_embedding: bool = False
+    chunks_per_batch: Optional[int] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")

     def to_dict(self) -> dict:
@@ -16,6 +17,7 @@ class CognifyConfig(BaseSettings):
             "classification_model": self.classification_model,
             "summarization_model": self.summarization_model,
             "triplet_embedding": self.triplet_embedding,
+            "chunks_per_batch": self.chunks_per_batch,
         }
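`CognifyConfig` is a pydantic `BaseSettings` class loaded from `.env` (`env_file=".env"` above), so the new field can presumably also be supplied through the environment. A minimal sketch, assuming the default pydantic-settings field-to-variable mapping; the variable name `CHUNKS_PER_BATCH` is an assumption, not confirmed by this diff:

```python
import os

# Assumption: chunks_per_batch maps to CHUNKS_PER_BATCH via pydantic-settings defaults.
# Set it before the config is first read.
os.environ["CHUNKS_PER_BATCH"] = "50"

from cognee.modules.cognify.config import get_cognify_config  # import path taken from the diff above

config = get_cognify_config()
print(config.chunks_per_batch)  # expected: 50 if the mapping assumption holds
```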
@@ -238,6 +238,7 @@ class TestCognifyCommand:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run")
@@ -262,6 +262,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run", side_effect=_mock_run)
@@ -295,6 +296,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path="/nonexistent/path/ontology.owl",
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )

     @patch("cognee.cli.commands.cognify_command.asyncio.run")
@@ -373,6 +375,7 @@ class TestCognifyCommandEdgeCases:
             ontology_file_path=None,
             chunker=TextChunker,
             run_in_background=False,
+            chunks_per_batch=None,
         )